diff --git a/.travis.yml b/.travis.yml
index 092c7f53..9dad6b1d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,5 +1,5 @@
 language: cpp
-dist: bionic
+dist: focal
 
 matrix:
   include:
@@ -7,33 +7,33 @@ matrix:
       compiler: gcc
       addons:
         apt:
-          packages: ['g++-8', 'g++-8-multilib', 'g++-multilib', 'valgrind', 'expect', 'curl']
-      env:
-        - COMPILER=g++-8
-        - COMP=gcc
-
-    - os: linux
-      compiler: clang
-      addons:
-        apt:
-          packages: ['clang-10', 'llvm-10-dev', 'g++-multilib', 'valgrind', 'expect', 'curl']
-      env:
-        - COMPILER=clang++-10
-        - COMP=clang
-
-    - os: osx
-      osx_image: xcode12
-      compiler: gcc
+          packages: ['g++-multilib', 'valgrind', 'expect', 'curl', 'libopenblas-dev']
       env:
         - COMPILER=g++
         - COMP=gcc
 
-    - os: osx
-      osx_image: xcode12
-      compiler: clang
-      env:
-        - COMPILER=clang++
-        - COMP=clang
+#    - os: linux
+#      compiler: clang
+#      addons:
+#        apt:
+#          packages: ['clang-10', 'llvm-10-dev', 'g++-multilib', 'valgrind', 'expect', 'curl', 'libopenblas-dev']
+#      env:
+#        - COMPILER=clang++-10
+#        - COMP=clang
+#
+#    - os: osx
+#      osx_image: xcode12
+#      compiler: gcc
+#      env:
+#        - COMPILER=g++
+#        - COMP=gcc
+#
+#    - os: osx
+#      osx_image: xcode12
+#      compiler: clang
+#      env:
+#        - COMPILER=clang++
+#        - COMP=clang
 
 branches:
   only:
@@ -65,16 +65,13 @@ script:
   - make clean && make -j2 ARCH=x86-64-ssse3 build && ../tests/signature.sh $benchref
   - make clean && make -j2 ARCH=x86-64-sse3-popcnt build && ../tests/signature.sh $benchref
   - make clean && make -j2 ARCH=x86-64 build && ../tests/signature.sh $benchref
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-64 build && ../tests/signature.sh $benchref; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 optimize=no debug=yes build && ../tests/signature.sh $benchref; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-sse41-popcnt build && ../tests/signature.sh $benchref; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-sse2 build && ../tests/signature.sh $benchref; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 build && ../tests/signature.sh $benchref; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-32 build && ../tests/signature.sh $benchref; fi
-  # workaround: exclude a custom version of llvm+clang, which doesn't find llvm-profdata on ubuntu
-  - if [[ "$TRAVIS_OS_NAME" != "linux" || "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi
+  # TODO avoid _mm_malloc
+  # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-64 build && ../tests/signature.sh $benchref; fi
+  - make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref
 
   # compile only for some more advanced architectures (might not run in travis)
+  - make clean && make -j2 ARCH=x86-64-avx2 blas=yes build
+
   - make clean && make -j2 ARCH=x86-64-avx2 build
   - make clean && make -j2 ARCH=x86-64-bmi2 build
   - make clean && make -j2 ARCH=x86-64-avx512 build
@@ -91,11 +88,16 @@ script:
   # Valgrind
   #
   - export CXXFLAGS="-O1 -fno-inline"
-  - if [ -x "$(command -v valgrind )" ]; then make clean && make -j2 ARCH=x86-64-modern debug=yes optimize=no build > /dev/null && ../tests/instrumented.sh --valgrind; fi
-  - if [ -x "$(command -v valgrind )" ]; then ../tests/instrumented.sh --valgrind-thread; fi
+  - make clean && make -j2 ARCH=x86-64-modern debug=yes optimize=no build > /dev/null && ../tests/instrumented.sh --valgrind
+  - ../tests/instrumented.sh --valgrind-thread
 
   #
   # Sanitizer
   #
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-undefined; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=thread    optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-thread; fi
+  - make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-undefined
+  - make clean && make -j2 ARCH=x86-64-modern sanitize=thread    optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-thread
+
+  # NNUE testing
+  - export CXXFLAGS="-O1 -fno-inline"
+  - make clean && make -j2 ARCH=x86-64-modern debug=no optimize=no build > /dev/null && ../tests/instrumented_learn.sh --valgrind
+  - make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined
diff --git a/AUTHORS b/AUTHORS
index c96f870a..b31a36e9 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -19,6 +19,7 @@ Alain Savard (Rocky640)
 Alayan Feh (Alayan-stk-2)
 Alexander Kure
 Alexander Pagel (Lolligerhans)
+Alfredo Menezes (lonfom169)
 Ali AlZhrani (Cooffe)
 Andrew Grant (AndyGrant)
 Andrey Neporada (nepal)
@@ -36,12 +37,14 @@ Bryan Cross (crossbr)
 candirufish
 Chess13234
 Chris Cain (ceebo)
+Dale Weiler (graphitemaster)
 Dan Schmidt (dfannius)
 Daniel Axtens (daxtens)
 Daniel Dugovic (ddugovic)
-Dariusz Orzechowski
+Dariusz Orzechowski (dorzechowski)
 David Zar
 Daylen Yang (daylen)
+Deshawn Mohan-Smith (GoldenRare)
 DiscanX
 Dominik Schlösser (domschl)
 double-beep
@@ -83,7 +86,7 @@ Jekaa
 Jerry Donald Watson (jerrydonaldwatson)
 jjoshua2
 Jonathan Calovski (Mysseno)
-Jonathan Dumale (SFisGOD)
+Jonathan Buladas Dumale (SFisGOD)
 Joost VandeVondele (vondele)
 Jörg Oster (joergoster)
 Joseph Ellis (jhellis3)
@@ -109,6 +112,7 @@ Mark Tenzer (31m059)
 marotear
 Matthew Lai (matthewlai)
 Matthew Sullivan (Matt14916)
+Maxim Molchanov (Maxim)
 Michael An (man)
 Michael Byrne (MichaelB7)
 Michael Chaly (Vizvezdenec)
diff --git a/README.md b/README.md
index 56ce7d3e..99168e3f 100644
--- a/README.md
+++ b/README.md
@@ -5,9 +5,37 @@
 <h1 align="center">Stockfish NNUE</h1>
 
 ## Overview
+
 Stockfish NNUE is a port of a shogi neural network named NNUE (efficiently updateable neural network backwards) to Stockfish 11. To learn more about the Stockfish chess engine, look [here](stockfish.md) for an overview and [here](https://github.com/official-stockfish/Stockfish) for the official repository.
 
-## Compilation Instructions for Mac
+
+## Building
+
+To compile:
+```
+make -jN ARCH=... build
+```
+
+To compile with Profile Guided Optimizations (this requires that the computer used for compilation supports the selected `ARCH`):
+```
+make -jN ARCH=... profile-build
+```
+
+`N` is the number of threads to use for compilation.
+
+`ARCH` is one of:
+`x86-64-vnni512`, `x86-64-vnni256`, `x86-64-avx512`, `x86-64-bmi2`, `x86-64-avx2`,
+`x86-64-sse41-popcnt`, `x86-64-modern`, `x86-64-ssse3`, `x86-64-sse3-popcnt`,
+`x86-64`, `x86-32-sse41-popcnt`, `x86-32-sse2`, `x86-32`, `ppc-64`, `ppc-32`,
+`armv7`, `armv7-neon`, `armv8`, `apple-silicon`, `general-64`, `general-32`.
+
+`ARCH` needs to be chosen based on the instruction set of the CPU that will run Stockfish. `x86-64-modern` will produce a binary that works on most common processors, but other options may increase performance for specific hardware.
+
+Additional options:
+
+- `blas=[yes/no]` - whether to use an external BLAS library. Default is `no`. Using an external BLAS library may significantly improve learning performance and by default expects OpenBLAS to be installed.
+
+### Building Instructions for Mac
 
 1. Ensure that you have OpenBlas Installed
 ```
@@ -24,62 +52,91 @@ cd src
 make profile-learn ARCH=x86-64 COMP=gcc
 ```
 
-
 ## Training Guide
+
 ### Generating Training Data
-To generate training data from the classic eval, use the gensfen command with the setting "Use NNUE" set to "false". The given example is generation in its simplest form. There are more commands. 
+
+To generate training data from the classic eval, use the gensfen command with the setting "Use NNUE" set to "false". The given example is generation in its simplest form. There are more commands.
+
 ```
 uci
+setoption name PruneAtShallowDepth value false
 setoption name Use NNUE value false
 setoption name Threads value x
 setoption name Hash value y
 setoption name SyzygyPath value path
 isready
-gensfen depth a loop b use_draw_in_training_data_generation 1 eval_limit 32000 use_raw_nnue_eval 0
+gensfen depth a loop b use_draw_in_training_data_generation 1 eval_limit 32000
 ```
-Specify how many threads and how much memory you would like to use with the x and y values. The option SyzygyPath is not necessary, but if you would like to use it, you must first have Syzygy endgame tablebases on your computer, which you can find [here](http://oics.olympuschess.com/tracker/index.php). You will need to have a torrent client to download these tablebases, as that is probably the fastest way to obtain them. The path is the path to the folder containing those tablebases. It does not have to be surrounded in quotes.
 
-use_raw_nnue_eval controls if the training data generator or trainer uses raw NNUE eval values.  Don't forget to set use_raw_nnue_eval 0 when initial training data are generated.  Otherwise, the gensfen command will crash.
+- `depth` is the searched depth per move, or how far the engine looks forward. This value is an integer.
+- `loop` is the amount of positions generated. This value is also an integer.
+
+Specify how many threads and how much memory you would like to use with the `x` and `y` values. The option SyzygyPath is not necessary, but if you would like to use it, you must first have Syzygy endgame tablebases on your computer, which you can find [here](http://oics.olympuschess.com/tracker/index.php). You will need to have a torrent client to download these tablebases, as that is probably the fastest way to obtain them. The `path` is the path to the folder containing those tablebases. It does not have to be surrounded in quotes.
+
+This will create a file named "generated_kifu.binpack" in the same folder as the binary containing the generated training data. Once generation is done, you can rename the file to something like "1billiondepth12.binpack" to remember the depth and quantity of the positions and move it to a folder named "trainingdata" in the same directory as the binaries.
+
+You will also need validation data that is used for loss calculation and accuracy computation. Validation data is generated in the same way as training data, but generally at most 1 million positions should be used as there's no need for more and it would just slow the learning process down. It may also be better to slightly increase the depth for validation data. After generation you can rename the validation data file to "val.binpack" and drop it in a folder named "validationdata" in the same directory to make it easier.
+
+More information about gensfen and available options can be found in the [docs](docs/gensfen.md)
+
+### Training a network
+
+#### Training a Completely New Network
+
+Whether a new network is created or not is controlled by the UCI option `SkipLoadingEval`. If set to true then a new network will be created, which allows learning from scratch. If left at its default (false) then a network will be loaded and trained further. The second scenario is described in the reinforcement learning paragraph.
+
+A simple command chain to start with training could look like this:
 
-This will save a file named "generated_kifu.bin" in the same folder as the binary. Once generation is done, rename the file to something like "1billiondepth12.bin" to remember the depth and quantity of the positions and move it to a folder named "trainingdata" in the same directory as the binaries.
-#### Generation Parameters
-- Depth is the searched depth per move, or how far the engine looks forward. This value is an integer.
-- Loop is the amount of positions generated. This value is also an integer
-### Generating Validation Data
-The process is the same as the generation of training data, except for the fact that you need to set loop to 1 million, because you don't need a lot of validation data. The depth should be the same as before or slightly higher than the depth of the training data. After generation rename the validation data file to val.bin and drop it in a folder named "validationdata" in the same directory to make it easier. 
-### Training a Completely New Network
-Use the "learn" binary. Create an empty folder named "evalsave" in the same directory as the binaries.
 ```
 uci
+setoption name EnableTranspositionTable value false
+setoption name PruneAtShallowDepth value false
 setoption name SkipLoadingEval value true
-setoption name Use NNUE value true
+setoption name Use NNUE value pure
 setoption name Threads value x
 isready
-learn targetdir trainingdata loop 100 batchsize 1000000 use_draw_in_training 1 use_draw_in_validation 1 eta 1 lambda 1 eval_limit 32000 nn_batch_size 1000 newbob_decay 0.5 eval_save_interval 250000000 loss_output_interval 1000000 mirror_percentage 50 validation_set_file_name validationdata\val.bin
+learn targetdir trainingdata epochs 10000 batchsize 1000000 use_draw_in_training 1 use_draw_in_validation 1 lr 1 lambda 1 eval_limit 32000 nn_batch_size 1000 newbob_decay 0.5 eval_save_interval 250000000 loss_output_interval 1000000 validation_set_file_name validationdata\val.binpack
 ```
-Nets get saved in the "evalsave" folder. 
 
-#### Training Parameters
-- eta is the learning rate
-- lambda is the amount of weight it puts to eval of learning data vs win/draw/loss results. 1 puts all weight on eval, lambda 0 puts all weight on WDL results.
+This will utilize training data files in the "trainingdata" directory and validation data from file "validationdata\val.binpack". Produced nets are saved in the "evalsave" folder.
 
-### Reinforcement Learning
-If you would like to do some reinforcement learning on your original network, you must first generate training data using the learn binaries with the setting `Use NNUE` set to true. Make sure that your previously trained network is in the eval folder. Use the commands specified above. Make sure `SkipLoadingEval` is set to false so that the data generated is using the neural net's eval by typing the command `setoption name SkipLoadingEval value false` before typing the `isready` command. You should aim to generate less positions than the first run, around 1/10 of the number of positions generated in the first run. The depth should be higher as well. You should also do the same for validation data, with the depth being higher than the last run.
+More information about learn and available parameters can be found in the [docs](docs/learn.md)
 
-After you have generated the training data, you must move it into your training data folder and delete the older data so that the binary does not accidentally train on the same data again. Do the same for the validation data and name it to val-1.bin to make it less confusing. Make sure the evalsave folder is empty. Then, using the same binary, type in the training commands shown above. Do __NOT__ set `SkipLoadingEval` to true, it must be false or you will get a completely new network, instead of a network trained with reinforcement learning. You should also set eval_save_interval to a number that is lower than the amount of positions in your training data, perhaps also 1/10 of the original value. The validation file should be set to the new validation data, not the old data.
+#### Reinforcement Learning
 
-After training is finished, your new net should be located in the "final" folder under the "evalsave" directory. You should test this new network against the older network to see if there are any improvements.
+If you would like to do some reinforcement learning on your original network, you must first generate training data with the setting `Use NNUE` set to `pure` and using the previous network (either name it "nn.bin" and put it alongside the binary or provide the `EvalFile` UCI option). Use the commands specified above. You should aim to generate fewer positions than the first run, around 1/10 of the number of positions generated in the first run. The depth should be higher as well. You should also do the same for validation data, with the depth being higher than the last run.
+
+After you have generated the training data, you must move it into your training data folder and move the older data so that the binary does not train on the same data again. Do the same for the validation data. Make sure the "evalsave" folder is empty. Then, using the same binary, type in the training commands shown above. Do __NOT__ set `SkipLoadingEval` to true, it must be false or you will get a completely new network, instead of a network trained with reinforcement learning. You should also set `eval_save_interval` to a number that is lower than the amount of positions in your training data, perhaps also 1/10 of the original value.
+
+After training is finished, your new net should be located in the "final" folder under the "evalsave" directory. You should test this new network against the older network to see if there are any improvements. Don't rely on the automatic rejection for network quality, sometimes even rejected nets can be better than the previous ones.
 
 ## Using Your Trained Net
+
 If you want to use your generated net, copy the net located in the "final" folder under the "evalsave" directory and move it into a new folder named "eval" under the directory with the binaries. You can then use the halfkp_256x2 binaries pertaining to your CPU with a standard chess GUI, such as Cutechess. Refer to the [releases page](https://abrok.eu/stockfish) to find out which binary is best for your CPU.
 
-If the engine does not load any net file, or shows "Error! *** not found or wrong format", please try to specify the net with the full file path with the "EvalFile" option by typing the command `setoption name EvalFile value path` where path is the full file path. The "Use NNUE" option must be set to true with the command `setoption name Use NNUE value true`.
+If the engine does not load any net file, or shows "Error! *** not found or wrong format", please try to specify the net with the full file path with the `EvalFile` UCI option by typing the command `setoption name EvalFile value path` where path is the full file path. The `Use NNUE` UCI option must be set either to `true` or `pure` with the command `setoption name Use NNUE value true/pure`.
+
+## Training data formats.
+
+Currently there are 3 training data formats. Two of them are supported directly.
+
+- `.bin` - the original training data format. Uses 40 bytes per entry. Is supported directly by the `gensfen` and `learn` commands.
+- `.plain` - a human readable training data format. This one is not supported directly by the `gensfen` and `learn` commands. It should not be used for data exchange because it's less compact than other formats. It is mostly useful for inspection of the data.
+- `.binpack` - a compact binary training data format that exploits position chains to further reduce size. It uses on average between 2 and 3 bytes per entry when generating data with `gensfen`. It is supported directly by `gensfen` and `learn` commands. It is currently the default for the `gensfen` command. A more in depth description can be found [here](docs/binpack.md).
+
+### Conversion between formats.
+
+There is a built-in converter that supports all 3 formats described above. Any of them can be converted to any other. For more information and usage guide see [here](docs/convert.md).
 
 ## Resources
+
+- [Training NNUE for SF](https://docs.google.com/document/d/1os5GH8GGJbV0nKAfXD-qySBclFzKKtXKHbAnA-un8tA/edit) google document with important information and coding priorities
+- [Gensfen data (vondele)](https://drive.google.com/drive/folders/1mftuzYdl9o6tBaceR3d_VBQIrgKJsFpl) over 2b fens available
 - [Stockfish NNUE Wiki](https://www.qhapaq.org/shogi/shogiwiki/stockfish-nnue/)
 - [Training instructions](https://twitter.com/mktakizawa/status/1273042640280252416) from the creator of the Elmo shogi engine
 - [Original Talkchess thread](http://talkchess.com/forum3/viewtopic.php?t=74059) discussing Stockfish NNUE
-- [Guide to Stockfish NNUE](http://yaneuraou.yaneu.com/2020/06/19/stockfish-nnue-the-complete-guide/) 
+- [Guide to Stockfish NNUE](http://yaneuraou.yaneu.com/2020/06/19/stockfish-nnue-the-complete-guide/)
 - [Unofficial Stockfish Discord](https://discord.gg/nv8gDtt)
 
 A more updated list can be found in the #sf-nnue-resources channel in the Discord.
diff --git a/Top CPU Contributors.txt b/Top CPU Contributors.txt
index 0ea5ac72..482e9000 100644
--- a/Top CPU Contributors.txt	
+++ b/Top CPU Contributors.txt	
@@ -1,154 +1,173 @@
-Contributors with >10,000 CPU hours as of January 7, 2020
+Contributors with >10,000 CPU hours as of Sept 2, 2020
 Thank you!
 
 Username                  CPU Hours   Games played
 --------------------------------------------------
-noobpwnftw                  9305707      695548021
-mlang                        780050       61648867
-dew                          621626       43921547
-mibere                       524702       42238645
-crunchy                      354587       27344275
-cw                           354495       27274181
-fastgm                       332801       22804359
-JojoM                        295750       20437451
-CSU_Dynasty                  262015       21828122
-Fisherman                    232181       18939229
-ctoks                        218866       17622052
-glinscott                    201989       13780820
-tvijlbrief                   201204       15337115
-velislav                     188630       14348485
-gvreuls                      187164       15149976
-bking_US                     180289       11876016
-nordlandia                   172076       13467830
-leszek                       157152       11443978
-Thanar                       148021       12365359
-spams                        141975       10319326
-drabel                       138073       11121749
-vdv                          137850        9394330
-mgrabiak                     133578       10454324
-TueRens                      132485       10878471
-bcross                       129683       11557084
-marrco                       126078        9356740
-sqrt2                        125830        9724586
-robal                        122873        9593418
-vdbergh                      120766        8926915
-malala                       115926        8002293
-CoffeeOne                    114241        5004100
-dsmith                       113189        7570238
-BrunoBanani                  104644        7436849
-Data                          92328        8220352
-mhoram                        89333        6695109
-davar                         87924        7009424
-xoto                          81094        6869316
-ElbertoOne                    80899        7023771
-grandphish2                   78067        6160199
-brabos                        77212        6186135
-psk                           75733        5984901
-BRAVONE                       73875        5054681
-sunu                          70771        5597972
-sterni1971                    70605        5590573
-MaZePallas                    66886        5188978
-Vizvezdenec                   63708        4967313
-nssy                          63462        5259388
-jromang                       61634        4940891
-teddybaer                     61231        5407666
-Pking_cda                     60099        5293873
-solarlight                    57469        5028306
-dv8silencer                   56913        3883992
-tinker                        54936        4086118
-renouve                       49732        3501516
-Freja                         49543        3733019
-robnjr                        46972        4053117
-rap                           46563        3219146
-Bobo1239                      46036        3817196
-ttruscott                     45304        3649765
-racerschmacer                 44881        3975413
-finfish                       44764        3370515
-eva42                         41783        3599691
-biffhero                      40263        3111352
-bigpen0r                      39817        3291647
-mhunt                         38871        2691355
-ronaldjerum                   38820        3240695
-Antihistamine                 38785        2761312
-pb00067                       38038        3086320
-speedycpu                     37591        3003273
-rkl                           37207        3289580
-VoyagerOne                    37050        3441673
-jbwiebe                       35320        2805433
-cuistot                       34191        2146279
-homyur                        33927        2850481
-manap                         32873        2327384
-gri                           32538        2515779
-oryx                          31267        2899051
-EthanOConnor                  30959        2090311
-SC                            30832        2730764
-csnodgrass                    29505        2688994
-jmdana                        29458        2205261
-strelock                      28219        2067805
-jkiiski                       27832        1904470
-Pyafue                        27533        1902349
-Garf                          27515        2747562
-eastorwest                    27421        2317535
-slakovv                       26903        2021889
-Prcuvu                        24835        2170122
-anst                          24714        2190091
-hyperbolic.tom                24319        2017394
-Patrick_G                     23687        1801617
-Sharaf_DG                     22896        1786697
-nabildanial                   22195        1519409
-chriswk                       21931        1868317
-achambord                     21665        1767323
-Zirie                         20887        1472937
-team-oh                       20217        1636708
-Isidor                        20096        1680691
-ncfish1                       19931        1520927
-nesoneg                       19875        1463031
-Spprtr                        19853        1548165
-JanErik                       19849        1703875
-agg177                        19478        1395014
-SFTUser                       19231        1567999
-xor12                         19017        1680165
-sg4032                        18431        1641865
-rstoesser                     18118        1293588
-MazeOfGalious                 17917        1629593
-j3corre                       17743         941444
-cisco2015                     17725        1690126
-ianh2105                      17706        1632562
-dex                           17678        1467203
-jundery                       17194        1115855
-iisiraider                    17019        1101015
-horst.prack                   17012        1465656
-Adrian.Schmidt123             16563        1281436
-purplefishies                 16342        1092533
-wei                           16274        1745989
-ville                         16144        1384026
-eudhan                        15712        1283717
-OuaisBla                      15581         972000
-DragonLord                    15559        1162790
-dju                           14716         875569
-chris                         14479        1487385
-0xB00B1ES                     14079        1001120
-OssumOpossum                  13776        1007129
-enedene                       13460         905279
-bpfliegel                     13346         884523
-Ente                          13198        1156722
-IgorLeMasson                  13087        1147232
-jpulman                       13000         870599
-ako027ako                     12775        1173203
-Nikolay.IT                    12352        1068349
-Andrew Grant                  12327         895539
-joster                        12008         950160
-AdrianSA                      11996         804972
-Nesa92                        11455        1111993
-fatmurphy                     11345         853210
-Dark_wizzie                   11108        1007152
-modolief                      10869         896470
-mschmidt                      10757         803401
-infinity                      10594         727027
-mabichito                     10524         749391
-Thomas A. Anderson            10474         732094
-thijsk                        10431         719357
-Flopzee                       10339         894821
-crocogoat                     10104        1013854
-SapphireBrand                 10104         969604
-stocky                        10017         699440
+noobpwnftw                 19352969     1231459677
+mlang                        957168       61657446
+dew                          949885       56893432
+mibere                       703817       46865007
+crunchy                      427035       27344275
+cw                           416006       27521077
+JojoM                        415904       24479564
+fastgm                       404873       23953472
+CSU_Dynasty                  335774       22850550
+tvijlbrief                   335199       21871270
+Fisherman                    325053       21786603
+gvreuls                      311480       20751516
+ctoks                        275877       18710423
+velislav                     241267       15596372
+glinscott                    217799       13780820
+nordlandia                   211692       13484886
+bcross                       206213       14934233
+bking_US                     198894       11876016
+leszek                       189170       11446821
+mgrabiak                     183896       11778092
+drabel                       181408       12489478
+TueRens                      181349       12192000
+Thanar                       179852       12365359
+vdv                          175171        9881246
+robal                        166948       10702862
+spams                        157128       10319326
+marrco                       149947        9376421
+sqrt2                        147963        9724586
+vdbergh                      137041        8926915
+CoffeeOne                    136294        5004100
+malala                       136182        8002293
+mhoram                       128934        8177193
+davar                        122092        7960001
+dsmith                       122059        7570238
+xoto                         119696        8222144
+grandphish2                  116481        7582197
+Data                         113305        8220352
+BrunoBanani                  112960        7436849
+ElbertoOne                    99028        7023771
+MaZePallas                    98571        6362619
+brabos                        92118        6186135
+psk                           89957        5984901
+sunu                          88463        6007033
+sterni1971                    86948        5613788
+Vizvezdenec                   83752        5343724
+BRAVONE                       81239        5054681
+nssy                          76497        5259388
+teddybaer                     75125        5407666
+Pking_cda                     73776        5293873
+jromang                       70695        4940891
+solarlight                    70517        5028306
+dv8silencer                   70287        3883992
+Bobo1239                      68515        4652287
+racerschmacer                 67468        4935996
+manap                         66273        4121774
+tinker                        63458        4213726
+linrock                       59082        4516053
+robnjr                        57262        4053117
+Freja                         56938        3733019
+ttruscott                     56005        3679485
+renouve                       53811        3501516
+cuistot                       52532        3014920
+finfish                       51360        3370515
+eva42                         51272        3599691
+rkl                           50759        3840947
+rap                           49985        3219146
+pb00067                       49727        3298270
+ronaldjerum                   47654        3240695
+bigpen0r                      47278        3291647
+biffhero                      46564        3111352
+VoyagerOne                    45386        3445881
+speedycpu                     43842        3003273
+jbwiebe                       43305        2805433
+Antihistamine                 41788        2761312
+mhunt                         41735        2691355
+eastorwest                    40387        2812173
+homyur                        39893        2850481
+gri                           39871        2515779
+oryx                          38228        2941656
+0x3C33                        37773        2529097
+SC                            37290        2731014
+csnodgrass                    36207        2688994
+jmdana                        36108        2205261
+strelock                      34716        2074055
+Garf                          33800        2747562
+EthanOConnor                  33370        2090311
+slakovv                       32915        2021889
+Spprtr                        32591        2139601
+Prcuvu                        30377        2170122
+anst                          30301        2190091
+jkiiski                       30136        1904470
+hyperbolic.tom                29840        2017394
+Pyafue                        29650        1902349
+OuaisBla                      27629        1578000
+chriswk                       26902        1868317
+achambord                     26582        1767323
+Patrick_G                     26276        1801617
+yorkman                       26193        1992080
+SFTUser                       25182        1675689
+nabildanial                   24942        1519409
+Sharaf_DG                     24765        1786697
+ncfish1                       24411        1520927
+agg177                        23890        1395014
+JanErik                       23408        1703875
+Isidor                        23388        1680691
+Norabor                       22976        1587862
+cisco2015                     22880        1759669
+Zirie                         22542        1472937
+team-oh                       22272        1636708
+MazeOfGalious                 21978        1629593
+sg4032                        21945        1643065
+ianh2105                      21725        1632562
+xor12                         21628        1680365
+dex                           21612        1467203
+nesoneg                       21494        1463031
+horst.prack                   20878        1465656
+0xB00B1ES                     20590        1208666
+j3corre                       20405         941444
+Adrian.Schmidt123             20316        1281436
+wei                           19973        1745989
+rstoesser                     19569        1293588
+eudhan                        19274        1283717
+Ente                          19070        1373058
+jundery                       18445        1115855
+iisiraider                    18247        1101015
+ville                         17883        1384026
+chris                         17698        1487385
+purplefishies                 17595        1092533
+DragonLord                    17014        1162790
+dju                           16515         929427
+IgorLeMasson                  16064        1147232
+ako027ako                     15671        1173203
+Nikolay.IT                    15154        1068349
+Andrew Grant                  15114         895539
+yurikvelo                     15027        1165616
+OssumOpossum                  14857        1007129
+enedene                       14476         905279
+bpfliegel                     14298         884523
+jpulman                       13982         870599
+joster                        13794         950160
+Nesa92                        13786        1114691
+Dark_wizzie                   13422        1007152
+Hjax                          13350         900887
+Fifis                         13313         965473
+mabichito                     12903         749391
+thijsk                        12886         722107
+crocogoat                     12876        1048802
+AdrianSA                      12860         804972
+Flopzee                       12698         894821
+fatmurphy                     12547         853210
+SapphireBrand                 12416         969604
+modolief                      12386         896470
+scuzzi                        12362         833465
+pgontarz                      12151         848794
+stocky                        11954         699440
+mschmidt                      11941         803401
+infinity                      11470         727027
+torbjo                        11387         728873
+Thomas A. Anderson            11372         732094
+snicolet                      11106         869170
+amicic                        10779         733593
+rpngn                         10712         688203
+d64                           10680         771144
+basepi                        10637         744851
+jjoshua2                      10559         670905
+dzjp                          10343         732529
+ols                           10259         570669
+lbraesch                      10252         647825
diff --git a/appveyor.yml b/appveyor.yml
index a3732a23..ab608409 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -63,7 +63,7 @@ build_script:
   - cmake --build . --config %CONFIGURATION% -- /verbosity:minimal
   - ps: |
       # Download default NNUE net from fishtest
-      $nnuenet = Get-Content -Path src\ucioption.cpp | Select-String -CaseSensitive -Pattern "Option" | Select-String -CaseSensitive -Pattern "nn-[a-z0-9]{12}.nnue"
+      $nnuenet = Get-Content -Path src\evaluate.h | Select-String -CaseSensitive -Pattern "EvalFileDefaultName" | Select-String -CaseSensitive -Pattern "nn-[a-z0-9]{12}.nnue"
       $dummy = $nnuenet -match "(?<nnuenet>nn-[a-z0-9]{12}.nnue)"
       $nnuenet = $Matches.nnuenet
       Write-Host "Default net:" $nnuenet
diff --git a/docs/binpack.md b/docs/binpack.md
new file mode 100644
index 00000000..1940a5dc
--- /dev/null
+++ b/docs/binpack.md
@@ -0,0 +1,42 @@
+# Binpack
+
+Binpack is a binary training data storage format designed to take advantage of position chains differing by a single move. Therefore it is very good at compactly storing data generated from real games (as opposed to random positions for example sourced from an opening book).
+
+It is currently implemented through a single header library in `extra/nnue_data_binpack_format.h`.
+
+Below follows a rough description of the format in a BNF-like notation.
+
+```
+[[nodiscard]] std::uint16_t signedToUnsigned(std::int16_t a) {
+    std::uint16_t r;
+    std::memcpy(&r, &a, sizeof(std::uint16_t));
+    if (r & 0x8000) r ^= 0x7FFF; // flip value bits if negative
+    r = (r << 1) | (r >> 15); // store sign bit at bit 0
+    return r;
+}
+
+file := <block>*
+block := BINP<chain>*
+chain := <stem><movetext>
+stem := <pos><move><score><ply_and_result><rule50> (32 bytes)
+pos := https://github.com/Sopel97/nnue_data_compress/blob/master/src/chess/Position.h#L1166 (24 bytes)
+move := https://github.com/Sopel97/nnue_data_compress/blob/master/src/chess/Chess.h#L1044 (2 bytes)
+score := signedToUnsigned(score) (2 bytes, big endian)
+ply_and_result := ply bitwise_or (signedToUnsigned(result) << 14) (2 bytes, big endian)
+rule50 := rule_50_counter (2 bytes, big endian)
+    // This is a small defect inherited from an old version of the format;
+    // it is kept so as not to break backwards compatibility. Effectively, there is
+    // one byte left for something else in the future, because rule50 always fits in one byte.
+
+movetext := <count><move_and_score>*
+count := number of plies in the movetext (2 bytes, big endian). Can be 0.
+move_and_score := <encoded_move><encoded_score> (~2 bytes)
+encoded_move := this one is complicated to explain; see the following references:
+    https://github.com/Sopel97/nnue_data_compress/blob/master/src/compress_file.cpp#L827
+    https://github.com/Sopel97/chess_pos_db/blob/master/docs/bcgn/variable_length.md
+
+encoded_score := https://en.wikipedia.org/wiki/Variable-width_encoding
+    with block size of 4 bits + 1 bit for extension bit.
+    Encoded value is signedToUnsigned(-prev_score - current_score)
+    (scores are always seen from the perspective of side to move in <pos>, that's why the '-' before prev_score)
+```
\ No newline at end of file
diff --git a/docs/convert.md b/docs/convert.md
new file mode 100644
index 00000000..132f66e0
--- /dev/null
+++ b/docs/convert.md
@@ -0,0 +1,18 @@
+# Convert
+
+`convert` allows conversion of training data between any of `.plain`, `.bin`, and `.binpack`.
+
+As with all commands in stockfish, `convert` can be invoked either from the command line (as `stockfish.exe convert ...`) or from the interactive prompt.
+
+The syntax of this command is as follows:
+```
+convert from_path to_path [append] [validate]
+```
+
+`from_path` is the path to the file to convert from. The type of the data is deduced based on its extension (one of `.plain`, `.bin`, `.binpack`).
+`to_path` is the path to an output file. The type of the data is deduced from its extension. If the file does not exist it is created.
+
+`append` and `validate` can come in any order and are optional.
+If `append` is not specified then the output file will be truncated prior to any writes. If `append` is specified then the converted training data will be appended to the end of the output file.
+
+If `validate` is specified then the conversion will stop on the first illegal move found and a diagnostic will be shown.
\ No newline at end of file
diff --git a/docs/gensfen.md b/docs/gensfen.md
new file mode 100644
index 00000000..48f7f5e7
--- /dev/null
+++ b/docs/gensfen.md
@@ -0,0 +1,67 @@
+# Gensfen
+
+`gensfen` command allows generation of training data from self-play in a manner that suits training better than traditional games. It introduces random moves to diversify openings, and fixed depth evaluation.
+
+As with all commands in stockfish, `gensfen` can be invoked either from the command line (as `stockfish.exe gensfen ...`, but this is not recommended because it's not possible to specify UCI options before `gensfen` executes) or from the interactive prompt.
+
+It is recommended to set the `PruneAtShallowDepth` UCI option to `false` as it will increase the quality of fixed depth searches.
+
+It is recommended to keep the `EnableTranspositionTable` UCI option at the default `true` value as it will make the generation process faster without noticeably harming the uniformity of the data.
+
+`gensfen` takes named parameters in the form of `gensfen param_1_name param_1_value param_2_name param_2_value ...`.
+
+Currently the following options are available:
+
+`set_recommended_uci_options` - this is a modifier not a parameter, no value follows it. If specified then some UCI options are set to recommended values.
+
+`depth` - minimum depth of evaluation of each position. Default: 3.
+
+`depth2` - maximum depth of evaluation of each position. If not specified then the same as `depth`.
+
+`nodes` - the number of nodes to use for evaluation of each position. This number is multiplied by the number of PVs of the current search. This does NOT override the `depth` and `depth2` options. If specified then whichever of depth or nodes limit is reached first applies.
+
+`loop` - the number of training data entries to generate. 1 entry == 1 position. Default: 8000000000 (8B).
+
+`output_file_name` - the name of the file to output to. If the extension is not present or doesn't match the selected training data format the right extension will be appended. Default: generated_kifu
+
+`eval_limit` - evaluations with higher absolute value than this will not be written and will terminate a self-play game. Should not exceed 10000 which is VALUE_KNOWN_WIN, but is only hardcapped at mate in 2 (\~30000). Default: 3000
+
+`random_move_minply` - the minimal ply at which a random move may be executed instead of a move chosen by search. Default: 1.
+
+`random_move_maxply` - the maximal ply at which a random move may be executed instead of a move chosen by search. Default: 24.
+
+`random_move_count` - maximum number of random moves in a single self-play game. Default: 5.
+
+`random_move_like_apery` - either 0 or 1. If 1 then random king moves will be followed by a random king move from the opponent whenever possible with 50% probability. Default: 0.
+
+`random_multi_pv` - the number of PVs used for determining the random move. If not specified then a truly random move will be chosen. If specified then a multiPV search will be performed and the random move will be one of the moves chosen by the search.
+
+`random_multi_pv_diff` - Makes the multiPV random move selection consider only moves that are at most `random_multi_pv_diff` worse than the next best move. Default: 30000 (all multiPV moves).
+
+`random_multi_pv_depth` - the depth to use for multiPV search for random move. Default: `depth2`.
+
+`write_minply` - minimum ply for which the training data entry will be emitted. Default: 16.
+
+`write_maxply` - maximum ply for which the training data entry will be emitted. Default: 400.
+
+`book` - a path to an opening book to use for the starting positions. Currently only .epd format is supported. If not specified then the starting position is always the standard chess starting position.
+
+`save_every` - the number of training data entries per file. If not specified then there will be always one file. If specified there may be more than one file generated (each having at most `save_every` training data entries) and each file will have a unique number attached.
+
+`random_file_name` - if specified then the output filename will be chosen randomly. Overrides `output_file_name`.
+
+`write_out_draw_game_in_training_data_generation` - either 0 or 1. If 1 then training data from drawn games will be emitted too. Default: 1.
+
+`use_draw_in_training_data_generation` - deprecated, alias for `write_out_draw_game_in_training_data_generation`
+
+`detect_draw_by_consecutive_low_score` - either 0 or 1. If 1 then drawn games will be adjudicated when the score remains 0 for at least 8 plies after ply 80. Default: 1.
+
+`use_game_draw_adjudication` - deprecated, alias for `detect_draw_by_consecutive_low_score`
+
+`detect_draw_by_insufficient_mating_material` - either 0 or 1. If 1 then positions with insufficient material will be adjudicated as draws. Default: 1.
+
+`sfen_format` - format of the training data to use. Either `bin` or `binpack`. Default: `binpack`.
+
+`ensure_quiet` - this is a flag option. When specified the positions will be from the qsearch leaf.
+
+`seed` - seed for the PRNG. Can be either a number or a string. If it's a string then its hash will be used. If not specified then the current time will be used.
diff --git a/docs/learn.md b/docs/learn.md
new file mode 100644
index 00000000..30a7c951
--- /dev/null
+++ b/docs/learn.md
@@ -0,0 +1,114 @@
+# Learn
+
+`learn` command allows training a network from training data.
+
+As with all commands in stockfish, `learn` can be invoked either from the command line (as `stockfish.exe learn ...`, but this is not recommended because it's not possible to specify UCI options before `learn` executes) or from the interactive prompt.
+
+`learn` takes named parameters in the form of `learn param_1_name param_1_value param_2_name param_2_value ...`. Unrecognized parameters form a list of paths to training data files.
+
+It is recommended to set the `EnableTranspositionTable` UCI option to `false` to reduce the interference between qsearches which are used to provide shallow evaluation. Using TT may cause the shallow evaluation to diverge from the real evaluation of the net, hiding imperfections.
+
+It is recommended to set the `PruneAtShallowDepth` UCI option to `false` as it will provide more accurate shallow evaluation.
+
+It is **required** to set the `Use NNUE` UCI option to `pure` as otherwise the function being optimized will not always match the function being probed, in which case not much can be learned.
+
+Currently the following options are available:
+
+`set_recommended_uci_options` - this is a modifier not a parameter, no value follows it. If specified then some UCI options are set to recommended values.
+
+`bat` - the size of a batch in multiples of 10000. This determines how many entries are read and shuffled at once during training. Default: 100 (meaning batch size of 1000000).
+
+`targetdir` - path to the directory from which training data will be read. All files in this directory are read sequentially. If not specified then only the list of files from positional arguments will be used. If specified then files from the given directory will be used after the explicitly specified files.
+
+`epochs` - the number of weight update cycles (epochs) to train the network for. One such cycle is `batchsize` positions. If not specified then the training will loop forever.
+
+`basedir` - the base directory for the paths. Default: "" (current directory)
+
+`batchsize` - same as `bat` but doesn't scale by 10000. Default: 1000000
+
+`lr` - initial learning rate. Default: 1.
+
+`use_draw_games_in_training` - either 0 or 1. If 1 then draws will be used in training too. Default: 1.
+
+`use_draw_in_training` - deprecated, alias for `use_draw_games_in_training`
+
+`use_draw_games_in_validation` - either 0 or 1. If 1 then draws will be used in validation too. Default: 1.
+
+`use_draw_in_validation` - deprecated, alias for `use_draw_games_in_validation`
+
+`skip_duplicated_positions_in_training` - either 0 or 1. If 1 then a small hashtable will be used to try to eliminate duplicated position from training. Default: 0.
+
+`use_hash_in_training` - deprecated, alias for `skip_duplicated_positions_in_training`
+
+`winning_probability_coefficient` - some magic value for winning probability. If you need to read this then don't touch it. Default: 1.0 / PawnValueEg / 4.0 * std::log(10.0)
+
+`use_wdl` - either 0 or 1. If 1 then the evaluations will be converted to win/draw/loss percentages prior to learning on them. (Slightly changes the gradient because eval has a different derivative than wdl). Default: 0.
+
+`lambda` - value in range [0..1]. 1 means that only evaluation is used for learning, 0 means that only game result is used. Values in between result in interpolation between the two contributions. See `lambda_limit` for when this is applied. Default: 1.0.
+
+`lambda2` - value in range [0..1]. 1 means that only evaluation is used for learning, 0 means that only game result is used. Values in between result in interpolation between the two contributions. See `lambda_limit` for when this is applied. Default: 1.0.
+
+`lambda_limit` - the maximum absolute score value for which `lambda` is used as opposed to `lambda2`. For positions with absolute evaluation higher than `lambda_limit` `lambda2` will be used. Default: 32000 (so always `lambda`).
+
+`max_grad` - the maximum allowed loss gradient for backpropagation. Effectively a form of gradient clipping. Useful for the first iterations with a randomly generated net as with higher lr backpropagation often overshoots and kills the net. The default value is fairly conservative, values as low as 0.25 could be used with lr of 1.0 without problems. Default: 1.0.
+
+`reduction_gameply` - the minimum ply after which positions won't be skipped. Positions at plies below this value are skipped with a probability that lessens linearly with the ply (reaching 0 at `reduction_gameply`). Default: 1.
+
+`eval_limit` - positions with absolute evaluation higher than this will be skipped. Default: 32000 (nothing is skipped).
+
+`save_only_once` - this is a modifier not a parameter, no value follows it. If specified then there will be only one network file generated.
+
+`no_shuffle` - this is a modifier not a parameter, no value follows it. If specified then data within a batch won't be shuffled.
+
+`nn_batch_size` - minibatch size used for learning. Should be smaller than batch size. Default: 1000.
+
+`newbob_decay` - learning rate will be multiplied by this factor every time a net is rejected (so in other words it controls LR drops). Default: 0.5 (no LR drops)
+
+`assume_quiet` - this is a flag option. When specified learn will not perform qsearch to reach a quiet position.
+
+`smart_fen_skipping` - this is a flag option. When specified some positions that are not good candidates for teaching are skipped. This includes positions where the best move is a capture or promotion, and positions where a king is in check.
+
+`newbob_num_trials` - determines after how many subsequent rejected nets the training process will be terminated. Default: 4.
+
+`auto_lr_drop` - every time this many positions are processed the learning rate is multiplied by `newbob_decay`. In other words this value specifies for how many positions a single learning rate stage lasts. If 0 then doesn't have any effect. Default: 0.
+
+`nn_options` - if you're reading this you don't use it. It passes messages directly to the network evaluation. I don't know what it can do either.
+
+`eval_save_interval` - every `eval_save_interval` positions the network will be saved and either accepted or rejected (in which case an LR drop follows). Default: 100000000 (100M). (generally people use values in 10M-100M range)
+
+`loss_output_interval` - every `loss_output_interval` fitness statistics are displayed. Default: 1000000 (1M)
+
+`validation_set_file_name` - path to the file with training data to be used for validation (loss computation and move accuracy)
+
+`sfen_read_size` - the number of sfens to always keep in the buffer. Default: 10000000 (10M)
+
+`thread_buffer_size` - the number of sfens to copy at once to each thread requesting more sfens for learning. Default: 10000
+
+`seed` - seed for the PRNG. Can be either a number or a string. If it's a string then its hash will be used. If not specified then the current time will be used.
+
+`verbose` - this is a modifier, not a parameter. When used there will be more detailed output during training.
+
+## Legacy subcommands and parameters
+
+### Convert
+
+`convert_plain`
+`convert_bin`
+`interpolate_eval`
+`check_invalid_fen`
+`check_illegal_move`
+`convert_bin_from_pgn-extract`
+`pgn_eval_side_to_move`
+`convert_no_eval_fens_as_score_zero`
+`src_score_min_value`
+`src_score_max_value`
+`dest_score_min_value`
+`dest_score_max_value`
+
+### Shuffle
+
+`shuffle`
+`buffer_size`
+`shuffleq`
+`shufflem`
+`output_file_name`
diff --git a/docs/transform.md b/docs/transform.md
new file mode 100644
index 00000000..82e963fe
--- /dev/null
+++ b/docs/transform.md
@@ -0,0 +1,21 @@
+# Transform
+
+`transform` command exposes subcommands that perform some specific transformation over data. The call syntax is `transform <subcommand>`. Currently implemented subcommands are listed and described below.
+
+## `nudged_static`
+
+`transform nudged_static` takes named parameters in the form of `transform nudged_static param_1_name param_1_value param_2_name param_2_value ...` and flag parameters which don't require values.
+
+This command goes through positions in the input files and replaces the scores with new ones - generated from static eval - but slightly adjusted based on the scores in the original input file.
+
+Currently the following options are available:
+
+`input_file` - path to the input file. Supports bin and binpack formats. Default: in.binpack.
+
+`output_file` - path to the output file. Supports bin and binpack formats. Default: out.binpack.
+
+`absolute` - states that the adjustment should be bounded by an absolute value. After this token follows the maximum absolute adjustment. Values are always adjusted towards scores in the input file. This is the default mode. Default maximum adjustment: 5.
+
+`relative` - states that the adjustment should be bounded by a value relative in magnitude to the static eval value. After this token follows the maximum relative change - a floating point value greater than 0. For example a value of 0.1 only allows changing the static eval by at most 10% towards the score from the input file.
+
+`interpolate` - states that the output score should be a value interpolated between static eval and the score from the input file. After this token follows the interpolation constant `t`. `t` of 0 means that only static eval is used. `t` of 1 means that only score from the input file is used. `t` of 0.5 means that the static eval and input score are averaged. It accepts values outside of range `<0, 1>`, but the usefulness is questionable.
diff --git a/script/extract_bin.py b/script/extract_bin.py
new file mode 100644
index 00000000..9574aa17
--- /dev/null
+++ b/script/extract_bin.py
@@ -0,0 +1,42 @@
+import sys
+
+ENTRY_SIZE = 40  # bytes per entry; every seek offset and read length is a multiple of this
+NUM_ENTRIES_IN_CHUNK = 1024*1024  # upper bound on entries fetched by one read() when copying once
+
+def copy(infile, outfile, count, times):
+    """Write `count` entries read from infile to outfile, `times` times over if times > 1."""
+    if times > 1:
+        # Repetition needs the whole slice in memory: read once, write it `times` times.
+        payload = infile.read(count*ENTRY_SIZE)
+        outfile.write(payload*times)
+        return
+    # Single copy: stream at most NUM_ENTRIES_IN_CHUNK entries per read.
+    for done in range(0, count, NUM_ENTRIES_IN_CHUNK):
+        batch = min(NUM_ENTRIES_IN_CHUNK, count - done)
+        outfile.write(infile.read(batch*ENTRY_SIZE))
+
+def work():
+    """Read filename, offset, count and optional times from sys.argv and copy that slice to `<stem>_<offset>_<count>_<times><ext>`."""
+    import os  # function-scope import; the module itself only imports sys
+    filename, offset, count = sys.argv[1], int(sys.argv[2]), int(sys.argv[3])
+    times = int(sys.argv[4]) if len(sys.argv) >= 5 else 1
+    # splitext splits only at a dot in the last path component (no dot there -> empty ext).
+    stem, ext = os.path.splitext(filename)
+    out_path = stem + '_' + str(offset) + '_' + str(count) + '_' + str(times) + ext
+    with open(filename, 'rb') as infile:
+        infile.seek(offset * ENTRY_SIZE)
+        with open(out_path, 'wb') as outfile:
+            copy(infile, outfile, count, times)
+
+def show_help():  # writes the usage text to stdout, one line per argument
+    print('Usage: python extract_bin.py filename offset count [times]',
+          'filename - the path to the .bin file to process',
+          'offset - the number of sfens to skip',
+          'count - the number of sfens to extract',
+          'times - the number of times to repeat the extracted sfens. Default = 1',
+          'The result is saved in a new file named `filename.stem`_`offset`_`count`_`times`.bin', sep='\n')
+
+if len(sys.argv) >= 4:  # filename, offset and count are all present
+    work()
+else:
+    show_help()
diff --git a/script/shuffle_binpack.py b/script/shuffle_binpack.py
new file mode 100644
index 00000000..409d4907
--- /dev/null
+++ b/script/shuffle_binpack.py
@@ -0,0 +1,69 @@
+import struct
+import sys
+import os
+import random
+from pathlib import Path
+
+def index_binpack(file):
+    """Return a list of (offset, total_size) pairs, one per BINP chunk found in `file`."""
+    print('Indexing...')
+    index = []
+    offset = 0
+    report_every = 100  # print progress each time another `report_every` MiB boundary is crossed
+    prev_mib = -report_every
+    while file.peek():
+        # Chunk header: 4-byte magic b'BINP' followed by a little-endian uint32 payload size.
+        chunk_header = file.read(8)
+        assert chunk_header[0:4] == b'BINP'
+        size = struct.unpack('<I', chunk_header[4:])[0]
+        file.seek(size, os.SEEK_CUR)
+        index.append((offset, size + 8))
+        offset += size + 8
+        mib = offset // 1024 // 1024
+        if mib // report_every != prev_mib // report_every:
+            print('Indexed {} MiB'.format(mib))
+            prev_mib = mib
+    return index
+
+def copy_binpack_indexed(in_file, index, out_file):
+    """Write each (offset, size) span of in_file to out_file, in the order given by `index`."""
+    print('Copying...')
+    total_size = 0
+    report_every = 100  # print progress each time another `report_every` MiB boundary is crossed
+    prev_mib = -report_every
+    for offset, size in index:
+        in_file.seek(offset, os.SEEK_SET)
+        data = in_file.read(size)
+        assert len(data) == size  # fails if fewer bytes than the index entry promises could be read
+        out_file.write(data)
+        total_size += size
+        mib = total_size // 1024 // 1024
+        if mib // report_every != prev_mib // report_every:
+            print('Copied {} MiB'.format(mib))
+            prev_mib = mib
+
+def main():
+    """Write the BINP chunks of the file named by argv[1], in random order, to the new file argv[2]."""
+    if len(sys.argv) < 3:
+        print('Usage: python shuffle_binpack.py infile outfile')
+        return
+
+    in_filename = sys.argv[1]
+    out_filename = sys.argv[2]
+
+    if Path(out_filename).exists():
+        print('Output path already exists. Please specify a path to a file that does not exist.')
+        return
+
+    # Context managers close both files even if indexing or copying raises.
+    with open(in_filename, 'rb') as in_file:
+        index = index_binpack(in_file)
+        print('Shuffling...')
+        random.shuffle(index)
+
+        # Create the output only once indexing succeeded, so a bad input does not
+        # leave behind an empty file that the exists() check would reject next run.
+        with open(out_filename, 'wb') as out_file:
+            copy_binpack_indexed(in_file, index, out_file)
+
+main()
diff --git a/src/Makefile b/src/Makefile
index e4c5a836..6f72809a 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -28,43 +28,49 @@ else
 EXE = stockfish
 endif
 
-### Installation dir definitions
-PREFIX = /usr/local
-BINDIR = $(PREFIX)/bin
-
-### Built-in benchmark for pgo-builds
-PGOBENCH = ./$(EXE) bench
-PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 100000
-
-### Source and object files
-SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
-	material.cpp misc.cpp movegen.cpp movepick.cpp pawns.cpp position.cpp psqt.cpp \
-	search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
-	nnue/evaluate_nnue.cpp \
-	nnue/evaluate_nnue_learner.cpp \
-	nnue/features/half_kp.cpp \
-	nnue/features/half_relative_kp.cpp \
-	nnue/features/k.cpp \
-	nnue/features/p.cpp \
-	nnue/features/castling_right.cpp \
-	nnue/features/enpassant.cpp \
-	nnue/nnue_test_command.cpp \
-	extra/sfen_packer.cpp \
-	learn/gensfen2019.cpp \
-	learn/learner.cpp \
-	learn/learning_tools.cpp \
-	learn/multi_think.cpp
-
-OBJS = $(notdir $(SRCS:.cpp=.o))
-
-VPATH = syzygy:nnue:nnue/features:eval:extra:learn
-
 ### Establish the operating system name
 KERNEL = $(shell uname -s)
 ifeq ($(KERNEL),Linux)
 	OS = $(shell uname -o)
 endif
 
+### Installation dir definitions
+PREFIX = /usr/local
+BINDIR = $(PREFIX)/bin
+
+### Built-in benchmark for pgo-builds
+PGO_TRAINING_DATA_FILE = pgo_training_data.bin
+PGOBENCH = ./$(EXE) bench
+PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 sfen_format bin output_file_name $(PGO_TRAINING_DATA_FILE)
+
+### Source and object files
+SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
+	material.cpp misc.cpp movegen.cpp movepick.cpp pawns.cpp position.cpp psqt.cpp \
+	search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
+	extra/stockfish_blas.cpp \
+	nnue/evaluate_nnue.cpp \
+	nnue/evaluate_nnue_learner.cpp \
+	nnue/features/half_kp.cpp \
+	nnue/features/half_ka.cpp \
+	nnue/features/half_relative_kp.cpp \
+	nnue/features/half_relative_ka.cpp \
+	nnue/features/k.cpp \
+	nnue/features/p.cpp \
+	nnue/features/a.cpp \
+	nnue/features/castling_right.cpp \
+	nnue/features/enpassant.cpp \
+	nnue/nnue_test_command.cpp \
+	learn/sfen_packer.cpp \
+	learn/learn.cpp \
+	learn/gensfen.cpp \
+	learn/opening_book.cpp \
+	learn/convert.cpp \
+	learn/transform.cpp
+
+OBJS = $(notdir $(SRCS:.cpp=.o))
+
+VPATH = syzygy:nnue:nnue/features:eval:extra:learn
+
 ### ==========================================================================
 ### Section 2. High-level Configuration
 ### ==========================================================================
@@ -99,17 +105,23 @@ endif
 
 ### 2.1. General and architecture defaults
 
+ifeq ($(ARCH),) # ARCH expands to nothing: fall back to the default below
+   ARCH = x86-64-modern
+   help_skip_sanity = yes
+endif # help_skip_sanity=yes makes the help target's "$(SUPPORTED_ARCH)$(help_skip_sanity)" test fail, so help skips config-sanity
 # explicitly check for the list of supported architectures (as listed with make help),
 # the user can override with `make ARCH=x86-32-vnni256 SUPPORTED_ARCH=true`
-ifeq ($(ARCH),$(filter $(ARCH),x86-64-vnni512 x86-64-vnni256 x86-64-avx512 x86-64-bmi2 x86-64-avx2 \
-                               x86-64-sse41-popcnt x86-64-modern x86-64-ssse3 x86-64-sse3-popcnt \
-                               x86-64 x86-32-sse41-popcnt x86-32-sse2 x86-32 ppc-64 ppc-32 \
-                               armv7 armv7-neon armv8 apple-silicon general-64 general-32))
+ifeq ($(ARCH), $(filter $(ARCH), \
+                 x86-64-vnni512 x86-64-vnni256 x86-64-avx512 x86-64-bmi2 x86-64-avx2 \
+                 x86-64-sse41-popcnt x86-64-modern x86-64-ssse3 x86-64-sse3-popcnt \
+                 x86-64 x86-32-sse41-popcnt x86-32-sse2 x86-32 ppc-64 ppc-32 \
+                 armv7 armv7-neon armv8 apple-silicon general-64 general-32))
    SUPPORTED_ARCH=true
 else
    SUPPORTED_ARCH=false
 endif
 
+blas = no
 optimize = yes
 debug = no
 sanitize = no
@@ -127,7 +139,6 @@ avx512 = no
 vnni256 = no
 vnni512 = no
 neon = no
-ARCH = x86-64-modern
 STRIP = strip
 
 ### 2.2 Architecture specific
@@ -306,9 +317,9 @@ endif
 ### ==========================================================================
 
 ### 3.1 Selecting compiler (default = gcc)
-CXXFLAGS += -Wall -Wcast-qual -fno-exceptions -std=c++17 $(EXTRACXXFLAGS) $(LEARNCXXFLAGS)
-DEPENDFLAGS += -std=c++17
-LDFLAGS += $(EXTRALDFLAGS) $(LEARNLDFLAGS)
+CXXFLAGS += -g -Wall -Wcast-qual -fno-exceptions -std=c++17 -fopenmp -I. $(EXTRACXXFLAGS)
+LDFLAGS += -fopenmp $(EXTRALDFLAGS)
+DEPENDFLAGS += -std=c++17 -I.
 
 ifeq ($(COMP),)
 	COMP=gcc
@@ -391,19 +402,6 @@ ifeq ($(COMP),clang)
 	endif
 endif
 
-ifeq ($(comp),icc)
-	profile_make = icc-profile-make
-	profile_use = icc-profile-use
-else
-ifeq ($(comp),clang)
-	profile_make = clang-profile-make
-	profile_use = clang-profile-use
-else
-	profile_make = gcc-profile-make
-	profile_use = gcc-profile-use
-endif
-endif
-
 ifeq ($(KERNEL),Darwin)
 	CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.14
 	LDFLAGS += -arch $(arch) -mmacosx-version-min=10.14
@@ -415,20 +413,30 @@ endif
 # Currently we don't know how to make PGO builds with the NDK yet.
 ifeq ($(COMP),ndk)
 	CXXFLAGS += -stdlib=libc++ -fPIE
+	comp=clang
 	ifeq ($(arch),armv7)
-		comp=armv7a-linux-androideabi16-clang
 		CXX=armv7a-linux-androideabi16-clang++
 		CXXFLAGS += -mthumb -march=armv7-a -mfloat-abi=softfp -mfpu=neon
 		STRIP=arm-linux-androideabi-strip
 	endif
 	ifeq ($(arch),armv8)
-		comp=aarch64-linux-android21-clang
 		CXX=aarch64-linux-android21-clang++
 		STRIP=aarch64-linux-android-strip
 	endif
 	LDFLAGS += -static-libstdc++ -pie -lm -latomic
 endif
 
+ifeq ($(comp),icc) # select the PGO helper targets by compiler family; evaluated after the ndk section, which sets comp=clang
+	profile_make = icc-profile-make
+	profile_use = icc-profile-use
+else ifeq ($(comp),clang) # clang family
+	profile_make = clang-profile-make
+	profile_use = clang-profile-use
+else # every other value of comp uses the gcc targets
+	profile_make = gcc-profile-make
+	profile_use = gcc-profile-use
+endif
+
 ### Travis CI script uses COMPILER to overwrite CXX
 ifdef COMPILER
 	COMPCXX=$(COMPILER)
@@ -463,14 +471,33 @@ ifneq ($(comp),mingw)
 endif
 endif
 
-### 3.2.1 Debugging
+### 3.2.1. BLAS libraries (opt-in with blas=yes: links OpenBLAS and defines USE_BLAS)
+ifeq ($(blas), yes)
+	LDFLAGS += -lopenblas
+
+	ifeq ($(KERNEL),Linux) # Linux: nothing beyond -lopenblas is added (the += below is empty)
+		LDFLAGS +=
+	else # any other kernel: add the /mingw64 OpenBLAS include path and link statically
+		CXXFLAGS += -I/mingw64/include/OpenBLAS
+
+		ifeq ($(debug),yes) # debug builds pass only -static; other builds also pass -s to the linker
+			LDFLAGS += -Wl,-static
+		else
+			LDFLAGS += -Wl,-s -static
+		endif
+	endif
+
+	CXXFLAGS += -DUSE_BLAS
+endif
+
+### 3.2.2 Debugging
 ifeq ($(debug),no)
 	CXXFLAGS += -DNDEBUG
 else
 	CXXFLAGS += -g
 endif
 
-### 3.2.2 Debugging with undefined behavior sanitizers
+### 3.2.3 Debugging with undefined behavior sanitizers
 ifneq ($(sanitize),no)
         CXXFLAGS += -g3 -fsanitize=$(sanitize)
         LDFLAGS += -fsanitize=$(sanitize)
@@ -600,11 +627,13 @@ endif
 ### needs access to the optimization flags.
 ifeq ($(optimize),yes)
 ifeq ($(debug), no)
-	ifeq ($(COMP),ndk)
-		CXXFLAGS += -flto=thin
-		LDFLAGS += $(CXXFLAGS)
-	else ifeq ($(comp),clang)
+	ifeq ($(comp),clang)
 		CXXFLAGS += -flto=thin
+		ifneq ($(findstring MINGW,$(KERNEL)),)
+			CXXFLAGS += -fuse-ld=lld
+		else ifneq ($(findstring MSYS,$(KERNEL)),)
+			CXXFLAGS += -fuse-ld=lld
+		endif
 		LDFLAGS += $(CXXFLAGS)
 
 # GCC and CLANG use different methods for parallelizing LTO and CLANG pretends to be
@@ -628,10 +657,12 @@ ifeq ($(debug), no)
 # So, only enable it for a cross from Linux by default.
 	else ifeq ($(comp),mingw)
 	ifeq ($(KERNEL),Linux)
+	ifneq ($(arch),i386)
 		CXXFLAGS += -flto
 		LDFLAGS += $(CXXFLAGS) -flto=jobserver
 	endif
 	endif
+	endif
 endif
 endif
 
@@ -707,11 +738,12 @@ help:
 	@echo "make -j build ARCH=x86-64-ssse3 COMP=clang"
 	@echo ""
 	@echo "-------------------------------"
-ifeq ($(SUPPORTED_ARCH), true)
+ifeq ($(SUPPORTED_ARCH)$(help_skip_sanity), true)
 	@echo "The selected architecture $(ARCH) will enable the following configuration: "
 	@$(MAKE) ARCH=$(ARCH) COMP=$(COMP) config-sanity
 else
 	@echo "Specify a supported architecture with the ARCH option for more details"
+	@echo ""
 endif
 
 
@@ -719,7 +751,7 @@ endif
         config-sanity icc-profile-use icc-profile-make gcc-profile-use gcc-profile-make \
         clang-profile-use clang-profile-make
 
-build: config-sanity
+build: net config-sanity
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) all
 
 profile-build: net config-sanity objclean profileclean
@@ -729,6 +761,7 @@ profile-build: net config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 2/4. Running benchmark for pgo-build ..."
 	$(PGOBENCH) > /dev/null
+	$(PGOGENSFEN) > /dev/null
 	@echo ""
 	@echo "Step 3/4. Building optimized executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
@@ -745,12 +778,13 @@ install:
 	-cp $(EXE) $(BINDIR)
 	-strip $(BINDIR)/$(EXE)
 
-#clean all
+# clean all
 clean: objclean profileclean
 	@rm -f .depend *~ core
 
+# evaluation network (nnue)
 net:
-	$(eval nnuenet := $(shell grep EvalFile ucioption.cpp | grep Option | sed 's/.*\(nn-[a-z0-9]\{12\}.nnue\).*/\1/'))
+	$(eval nnuenet := $(shell grep EvalFileDefaultName evaluate.h | grep define | sed 's/.*\(nn-[a-z0-9]\{12\}.nnue\).*/\1/'))
 	@echo "Default net: $(nnuenet)"
 	$(eval nnuedownloadurl := https://tests.stockfishchess.org/api/nn/$(nnuenet))
 	$(eval curl_or_wget := $(shell if hash curl 2>/dev/null; then echo "curl -skL"; elif hash wget 2>/dev/null; then echo "wget -qO-"; fi))
@@ -772,7 +806,6 @@ net:
             echo "shasum / sha256sum not found, skipping net validation"; \
         fi
 
-
 # clean binaries and objects
 objclean:
 	@rm -f $(EXE) *.o ./syzygy/*.o ./nnue/*.o ./nnue/features/*.o ./learn/*.o ./extra/*.o ./eval/*.o
@@ -782,6 +815,7 @@ profileclean:
 	@rm -rf profdir
 	@rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda *.s ./learn/*.gcda ./extra/*.gcda ./eval/*.gcda
 	@rm -f stockfish.profdata *.profraw
+	@rm -f $(PGO_TRAINING_DATA_FILE)
 
 default:
 	help
@@ -792,7 +826,7 @@ default:
 
 all: $(EXE) .depend
 
-config-sanity:
+config-sanity: net
 	@echo ""
 	@echo "Config:"
 	@echo "debug: '$(debug)'"
@@ -913,6 +947,6 @@ profile-learn: config-sanity objclean profileclean
 	rm generated_kifu.bin
 
 .depend:
-	-@$(CXX) $(DEPENDFLAGS) -MM $(SRCS) > $@ 2> /dev/null
+	-@$(CXX) $(DEPENDFLAGS) -MM $(SRCS) > $@
 
 -include .depend
diff --git a/src/benchmark.cpp b/src/benchmark.cpp
index 806e9840..ffb631a2 100644
--- a/src/benchmark.cpp
+++ b/src/benchmark.cpp
@@ -164,5 +164,7 @@ vector<string> setup_bench(const Position& current, istream& is) {
           ++posCounter;
       }
 
+  list.emplace_back("setoption name Use NNUE value true");
+
   return list;
 }
diff --git a/src/eval/evaluate_common.h b/src/eval/evaluate_common.h
deleted file mode 100644
index b043f2e1..00000000
--- a/src/eval/evaluate_common.h
+++ /dev/null
@@ -1,82 +0,0 @@
-﻿#ifndef _EVALUATE_COMMON_H_
-#define _EVALUATE_COMMON_H_
-
-// A common header-like function for modern evaluation functions (EVAL_KPPT and EVAL_KPP_KKPT).
-
-#if defined(EVAL_NNUE) || defined(EVAL_LEARN)
-#include <functional>
-
-// KK file name
-#define KK_BIN "KK_synthesized.bin"
-
-// KKP file name
-#define KKP_BIN "KKP_synthesized.bin"
-
-// KPP file name
-#define KPP_BIN "KPP_synthesized.bin"
-
-namespace Eval
-{
-
-#if defined(USE_EVAL_HASH)
-	// prefetch function
-	void prefetch_evalhash(const Key key);
-#endif
-
-	// An operator that applies the function f to each parameter of the evaluation function.
-	// Used for parameter analysis etc.
-	// type indicates the survey target.
-	// type = -1 :KK,KKP,KPP all
-	// type = 0: KK only
-	// type = 1: KKP only
-	// type = 2: KPP only
-	void foreach_eval_param(std::function<void(int32_t, int32_t)>f, int type = -1);
-
-	// --------------------------
-	// for learning
-	// --------------------------
-
-#if defined(EVAL_LEARN)
-	// Initialize the gradient array during learning
-	// Pass the learning rate as an argument. If 0.0, the default value is used.
-	// The epoch of update_weights() gradually changes from eta to eta2 until eta_epoch.
-	// After eta2_epoch, gradually change from eta2 to eta3.
-	void init_grad(double eta1, uint64_t eta_epoch, double eta2, uint64_t eta2_epoch, double eta3);
-
-	// Add the gradient difference value to the gradient array for all features that appear in the current phase.
-	// freeze[0]: Flag that kk does not learn
-	// freeze[1]: Flag that kkp does not learn
-	// freeze[2]: Flag that kpp does not learn
-	// freeze[3]: Flag that kppp does not learn
-	void add_grad(Position& pos, Color rootColor, double delt_grad, const std::array<bool, 4>& freeze);
-
-	// Do SGD or AdaGrad or something based on the current gradient.
-	// epoch: Generation counter (starting from 0)
-	// freeze[0]: Flag that kk does not learn
-	// freeze[1]: Flag that kkp does not learn
-	// freeze[2]: Flag that kpp does not learn
-	// freeze[3]: Flag that kppp does not learn
-	void update_weights(uint64_t epoch, const std::array<bool, 4>& freeze);
-
-	// Save the evaluation function parameters to a file.
-	// You can specify the extension added to the end of the file.
-	void save_eval(std::string suffix);
-
-	// Get the current eta.
-	double get_eta();
-
-	// --learning related commands
-
-	// A function that normalizes KK. Note that it is not completely equivalent to the original evaluation function.
-	// By making the values ​​of kkp and kpp as close to zero as possible, the value of the feature factor (which is zero) that did not appear during learning
-	// The idea of ​​ensuring it is valid.
-	void regularize_kk();
-
-#endif
-
-
-}
-
-#endif  // defined(EVAL_NNUE) || defined(EVAL_LEARN)
-
-#endif // _EVALUATE_KPPT_COMMON_H_
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 8edc9bb8..dd204a52 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -20,61 +20,25 @@
 #include <cassert>
 #include <cstdlib>
 #include <cstring>   // For std::memset
+#include <fstream>
 #include <iomanip>
 #include <sstream>
 #include <iostream>
-#include <set>
+#include <streambuf>
+#include <vector>
+
+#include "nnue/evaluate_nnue.h"
 
 #include "bitboard.h"
 #include "evaluate.h"
 #include "material.h"
+#include "misc.h"
 #include "pawns.h"
 #include "thread.h"
 #include "uci.h"
+#include "incbin/incbin.h"
 
-#ifdef EVAL_LEARN
-namespace Learner
-{
-    extern bool use_raw_nnue_eval;
-}
-#endif
-
-namespace Eval {
-
-  bool useNNUE;
-  std::string eval_file_loaded="None";
-
-  void init_NNUE() {
-
-    useNNUE = Options["Use NNUE"];
-    std::string eval_file = std::string(Options["EvalFile"]);
-    if (useNNUE && eval_file_loaded != eval_file)
-        if (Eval::NNUE::load_eval_file(eval_file))
-            eval_file_loaded = eval_file;
-  }
-
-  void verify_NNUE() {
-
-    std::string eval_file = std::string(Options["EvalFile"]);
-    if (useNNUE && eval_file_loaded != eval_file)
-    {
-        UCI::OptionsMap defaults;
-        UCI::init(defaults);
-
-        sync_cout << "info string ERROR: NNUE evaluation used, but the network file " << eval_file << " was not loaded successfully." << sync_endl;
-        sync_cout << "info string ERROR: The UCI option EvalFile might need to specify the full path, including the directory/folder name, to the file." << sync_endl;
-        sync_cout << "info string ERROR: The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/"+std::string(defaults["EvalFile"]) << sync_endl;
-        sync_cout << "info string ERROR: If the UCI option Use NNUE is set to true, network evaluation parameters compatible with the program must be available." << sync_endl;
-        sync_cout << "info string ERROR: The engine will be terminated now." << sync_endl;
-        std::exit(EXIT_FAILURE);
-    }
-
-    if (useNNUE)
-        sync_cout << "info string NNUE evaluation using " << eval_file << " enabled." << sync_endl;
-    else
-        sync_cout << "info string classical evaluation enabled." << sync_endl;
-  }
-}
+using namespace std;
 
 namespace Trace {
 
@@ -120,11 +84,11 @@ using namespace Trace;
 namespace {
 
   // Threshold for lazy and space evaluation
-  constexpr Value LazyThreshold1 =  Value(1400);
-  constexpr Value LazyThreshold2 =  Value(1300);
-  constexpr Value SpaceThreshold = Value(12222);
-  constexpr Value NNUEThreshold1 =   Value(550);
-  constexpr Value NNUEThreshold2 =   Value(150);
+  constexpr Value LazyThreshold1 =  Value(1565);
+  constexpr Value LazyThreshold2 =  Value(1102);
+  constexpr Value SpaceThreshold = Value(11551);
+  constexpr Value NNUEThreshold1 =   Value(682);
+  constexpr Value NNUEThreshold2 =   Value(176);
 
   // KingAttackWeights[PieceType] contains king attack weights by piece type
   constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 };
@@ -132,7 +96,7 @@ namespace {
   // SafeCheck[PieceType][single/multiple] contains safe check bonus by piece type,
   // higher if multiple safe checks are possible for that piece type.
   constexpr int SafeCheck[][2] = {
-      {}, {}, {792, 1283}, {645, 967}, {1084, 1897}, {772, 1119}
+      {}, {}, {803, 1292}, {639, 974}, {1087, 1878}, {759, 1132}
   };
 
 #define S(mg, eg) make_score(mg, eg)
@@ -140,19 +104,25 @@ namespace {
   // MobilityBonus[PieceType-2][attacked] contains bonuses for middle and end game,
   // indexed by piece type and number of attacked squares in the mobility area.
   constexpr Score MobilityBonus[][32] = {
-    { S(-62,-81), S(-53,-56), S(-12,-31), S( -4,-16), S(  3,  5), S( 13, 11), // Knight
-      S( 22, 17), S( 28, 20), S( 33, 25) },
-    { S(-48,-59), S(-20,-23), S( 16, -3), S( 26, 13), S( 38, 24), S( 51, 42), // Bishop
-      S( 55, 54), S( 63, 57), S( 63, 65), S( 68, 73), S( 81, 78), S( 81, 86),
-      S( 91, 88), S( 98, 97) },
-    { S(-60,-78), S(-20,-17), S(  2, 23), S(  3, 39), S(  3, 70), S( 11, 99), // Rook
-      S( 22,103), S( 31,121), S( 40,134), S( 40,139), S( 41,158), S( 48,164),
-      S( 57,168), S( 57,169), S( 62,172) },
-    { S(-30,-48), S(-12,-30), S( -8, -7), S( -9, 19), S( 20, 40), S( 23, 55), // Queen
-      S( 23, 59), S( 35, 75), S( 38, 78), S( 53, 96), S( 64, 96), S( 65,100),
-      S( 65,121), S( 66,127), S( 67,131), S( 67,133), S( 72,136), S( 72,141),
-      S( 77,147), S( 79,150), S( 93,151), S(108,168), S(108,168), S(108,171),
-      S(110,182), S(114,182), S(114,192), S(116,219) }
+    { S(-62,-79), S(-53,-57), S(-12,-31), S( -3,-17), S(  3,  7), S( 12, 13), // Knight
+      S( 21, 16), S( 28, 21), S( 37, 26) },
+    { S(-47,-59), S(-20,-25), S( 14, -8), S( 29, 12), S( 39, 21), S( 53, 40), // Bishop
+      S( 53, 56), S( 60, 58), S( 62, 65), S( 69, 72), S( 78, 78), S( 83, 87),
+      S( 91, 88), S( 96, 98) },
+    { S(-60,-82), S(-24,-15), S(  0, 17) ,S(  3, 43), S(  4, 72), S( 14,100), // Rook
+      S( 20,102), S( 30,122), S( 41,133), S(41 ,139), S( 41,153), S( 45,160),
+      S( 57,165), S( 58,170), S( 67,175) },
+    { S(-29,-49), S(-16,-29), S( -8, -8), S( -8, 17), S( 18, 39), S( 25, 54), // Queen
+      S( 23, 59), S( 37, 73), S( 41, 76), S( 54, 95), S( 65, 95) ,S( 68,101),
+      S( 69,124), S( 70,128), S( 70,132), S( 70,133) ,S( 71,136), S( 72,140),
+      S( 74,147), S( 76,149), S( 90,153), S(104,169), S(105,171), S(106,171),
+      S(112,178), S(114,185), S(114,187), S(119,221) }
+  };
+
+  // BishopPawns[distance from edge] contains a file-dependent penalty for pawns on
+  // squares of the same color as our bishop.
+  constexpr Score BishopPawns[int(FILE_NB) / 2] = {
+    S(3, 8), S(3, 9), S(1, 8), S(3, 7)
   };
 
   // KingProtector[knight/bishop] contains penalty for each distance unit to own king
@@ -160,32 +130,31 @@ namespace {
 
   // Outpost[knight/bishop] contains bonuses for each knight or bishop occupying a
   // pawn protected square on rank 4 to 6 which is also safe from a pawn attack.
-  constexpr Score Outpost[] = { S(56, 36), S(30, 23) };
+  constexpr Score Outpost[] = { S(56, 34), S(31, 23) };
 
   // PassedRank[Rank] contains a bonus according to the rank of a passed pawn
   constexpr Score PassedRank[RANK_NB] = {
-    S(0, 0), S(10, 28), S(17, 33), S(15, 41), S(62, 72), S(168, 177), S(276, 260)
+    S(0, 0), S(9, 28), S(15, 31), S(17, 39), S(64, 70), S(171, 177), S(277, 260)
   };
 
   // RookOnFile[semiopen/open] contains bonuses for each rook when there is
   // no (friendly) pawn on the rook file.
-  constexpr Score RookOnFile[] = { S(19, 7), S(48, 29) };
+  constexpr Score RookOnFile[] = { S(19, 7), S(48, 27) };
 
   // ThreatByMinor/ByRook[attacked PieceType] contains bonuses according to
   // which piece type attacks which one. Attacks on lesser pieces which are
   // pawn-defended are not considered.
   constexpr Score ThreatByMinor[PIECE_TYPE_NB] = {
-    S(0, 0), S(5, 32), S(57, 41), S(77, 56), S(88, 119), S(79, 161)
+    S(0, 0), S(5, 32), S(55, 41), S(77, 56), S(89, 119), S(79, 162)
   };
 
   constexpr Score ThreatByRook[PIECE_TYPE_NB] = {
-    S(0, 0), S(3, 46), S(37, 68), S(42, 60), S(0, 38), S(58, 41)
+    S(0, 0), S(3, 44), S(37, 68), S(42, 60), S(0, 39), S(58, 43)
   };
 
   // Assorted bonuses and penalties
   constexpr Score BadOutpost          = S( -7, 36);
   constexpr Score BishopOnKingRing    = S( 24,  0);
-  constexpr Score BishopPawns         = S(  3,  7);
   constexpr Score BishopXRayPawns     = S(  4,  5);
   constexpr Score CorneredBishop      = S( 50, 50);
   constexpr Score FlankAttacks        = S(  8,  0);
@@ -198,7 +167,6 @@ namespace {
   constexpr Score ReachableOutpost    = S( 31, 22);
   constexpr Score RestrictedPiece     = S(  7,  7);
   constexpr Score RookOnKingRing      = S( 16,  0);
-  constexpr Score RookOnQueenFile     = S(  6, 11);
   constexpr Score SliderOnQueen       = S( 60, 18);
   constexpr Score ThreatByKing        = S( 24, 89);
   constexpr Score ThreatByPawnPush    = S( 48, 39);
@@ -387,7 +355,7 @@ namespace {
                 // when the bishop is outside the pawn chain.
                 Bitboard blocked = pos.pieces(Us, PAWN) & shift<Down>(pos.pieces());
 
-                score -= BishopPawns * pos.pawns_on_same_color_squares(Us, s)
+                score -= BishopPawns[edge_distance(file_of(s))] * pos.pawns_on_same_color_squares(Us, s)
                                      * (!(attackedBy[Us][PAWN] & s) + popcount(blocked & CenterFiles));
 
                 // Penalty for all enemy pawns x-rayed
@@ -414,10 +382,6 @@ namespace {
 
         if (Pt == ROOK)
         {
-            // Bonus for rook on the same file as a queen
-            if (file_bb(s) & pos.pieces(QUEEN))
-                score += RookOnQueenFile;
-
             // Bonus for rook on an open or semi-open file
             if (pos.is_on_semiopen_file(Us, s))
                 score += RookOnFile[pos.is_on_semiopen_file(Them, s)];
@@ -515,18 +479,18 @@ namespace {
     int kingFlankAttack  = popcount(b1) + popcount(b2);
     int kingFlankDefense = popcount(b3);
 
-    kingDanger +=        kingAttackersCount[Them] * kingAttackersWeight[Them]
-                 + 185 * popcount(kingRing[Us] & weak)
-                 + 148 * popcount(unsafeChecks)
-                 +  98 * popcount(pos.blockers_for_king(Us))
-                 +  69 * kingAttacksCount[Them]
-                 +   3 * kingFlankAttack * kingFlankAttack / 8
-                 +       mg_value(mobility[Them] - mobility[Us])
-                 - 873 * !pos.count<QUEEN>(Them)
-                 - 100 * bool(attackedBy[Us][KNIGHT] & attackedBy[Us][KING])
-                 -   6 * mg_value(score) / 8
-                 -   4 * kingFlankDefense
-                 +  37;
+    kingDanger +=        kingAttackersCount[Them] * kingAttackersWeight[Them] // (~10 Elo)
+                 + 185 * popcount(kingRing[Us] & weak)                        // (~15 Elo)
+                 + 148 * popcount(unsafeChecks)                               // (~4 Elo)
+                 +  98 * popcount(pos.blockers_for_king(Us))                  // (~2 Elo)
+                 +  69 * kingAttacksCount[Them]                               // (~0.5 Elo)
+                 +   3 * kingFlankAttack * kingFlankAttack / 8                // (~0.5 Elo)
+                 +       mg_value(mobility[Them] - mobility[Us])              // (~0.5 Elo)
+                 - 873 * !pos.count<QUEEN>(Them)                              // (~24 Elo)
+                 - 100 * bool(attackedBy[Us][KNIGHT] & attackedBy[Us][KING])  // (~5 Elo)
+                 -   6 * mg_value(score) / 8                                  // (~8 Elo)
+                 -   4 * kingFlankDefense                                     // (~5 Elo)
+                 +  37;                                                       // (~0.5 Elo)
 
     // Transform the kingDanger units into a Score, and subtract it from the evaluation
     if (kingDanger > 100)
@@ -843,7 +807,9 @@ namespace {
             sf = 37 + 3 * (pos.count<QUEEN>(WHITE) == 1 ? pos.count<BISHOP>(BLACK) + pos.count<KNIGHT>(BLACK)
                                                         : pos.count<BISHOP>(WHITE) + pos.count<KNIGHT>(WHITE));
         else
-            sf = std::min(sf, 36 + 7 * pos.count<PAWN>(strongSide));
+            sf = std::min(sf, 36 + 7 * pos.count<PAWN>(strongSide)) - 4 * !pawnsOnBothFlanks;
+
+        sf -= 4 * !pawnsOnBothFlanks;
     }
 
     // Interpolate between the middlegame and (scaled by 'sf') endgame score
@@ -947,19 +913,47 @@ make_v:
 /// evaluation of the position from the point of view of the side to move.
 
 Value Eval::evaluate(const Position& pos) {
-#ifdef EVAL_LEARN
-  if (Learner::use_raw_nnue_eval) {
-      return NNUE::evaluate(pos);
+
+  Value v;
+
+  if (NNUE::useNNUE == NNUE::UseNNUEMode::Pure) {
+      v = NNUE::evaluate(pos);
+
+      // Guarantee evaluation does not hit the tablebase range
+      v = std::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1);
+
+      return v;
   }
-#endif
+  else if (NNUE::useNNUE == NNUE::UseNNUEMode::False)
+      v = Evaluation<NO_TRACE>(pos).value();
+  else
+  {
+      // Scale and shift NNUE for compatibility with search and classical evaluation
+      auto  adjusted_NNUE = [&](){
+         int mat = pos.non_pawn_material() + PawnValueMg * pos.count<PAWN>();
+         return NNUE::evaluate(pos) * (720 + mat / 32) / 1024 + Tempo;
+      };
 
-  bool classical = !Eval::useNNUE
-                ||  abs(eg_value(pos.psq_score())) * 16 > NNUEThreshold1 * (16 + pos.rule50_count());
-  Value v = classical ? Evaluation<NO_TRACE>(pos).value()
-                      : NNUE::evaluate(pos) * 5 / 4 + Tempo;
+      // If there is PSQ imbalance use classical eval, with small probability if it is small
+      Value psq = Value(abs(eg_value(pos.psq_score())));
+      int   r50 = 16 + pos.rule50_count();
+      bool  largePsq = psq * 16 > (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50;
+      bool  classical = largePsq || (psq > PawnValueMg / 4 && !(pos.this_thread()->nodes & 0xB));
 
-  if (classical && Eval::useNNUE && abs(v) * 16 < NNUEThreshold2 * (16 + pos.rule50_count()))
-      v = NNUE::evaluate(pos) * 5 / 4 + Tempo;
+      bool strongClassical = pos.non_pawn_material() < 2 * RookValueMg && pos.count<PAWN>() < 2;
+
+      v = classical || strongClassical ? Evaluation<NO_TRACE>(pos).value() : adjusted_NNUE();
+
+      // If the classical eval is small and imbalance large, use NNUE nevertheless.
+      // For the case of opposite colored bishops, switch to NNUE eval with
+      // small probability if the classical eval is less than the threshold.
+      if (   largePsq && !strongClassical
+          && (   abs(v) * 16 < NNUEThreshold2 * r50
+              || (   pos.opposite_bishops()
+                  && abs(v) * 16 < (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50
+                  && !(pos.this_thread()->nodes & 0xB))))
+          v = adjusted_NNUE();
+  }
 
   // Damp down the evaluation linearly when shuffling
   v = v * (100 - pos.rule50_count()) / 100;
@@ -1015,7 +1009,7 @@ std::string Eval::trace(const Position& pos) {
 
   ss << "\nClassical evaluation: " << to_cp(v) << " (white side)\n";
 
-  if (Eval::useNNUE)
+  if (NNUE::useNNUE != NNUE::UseNNUEMode::False)
   {
       v = NNUE::evaluate(pos);
       v = pos.side_to_move() == WHITE ? v : -v;
diff --git a/src/evaluate.h b/src/evaluate.h
index e808068d..f5d3efa7 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -26,23 +26,13 @@
 class Position;
 
 namespace Eval {
-
   std::string trace(const Position& pos);
   Value evaluate(const Position& pos);
 
-  extern bool useNNUE;
-  extern std::string eval_file_loaded;
-  void init_NNUE();
-  void verify_NNUE();
-
-  namespace NNUE {
-
-    Value evaluate(const Position& pos);
-    Value compute_eval(const Position& pos);
-    void  update_eval(const Position& pos);
-    bool  load_eval_file(const std::string& evalFile);
-
-  } // namespace NNUE
+  // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
+  // for the build process (profile-build and fishtest) to work. Do not change the
+  // name of the macro, as it is used in the Makefile.
+  #define EvalFileDefaultName   "nn-c3ca321c51c9.nnue"
 
 } // namespace Eval
 
diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
new file mode 100644
index 00000000..31c6f7bb
--- /dev/null
+++ b/src/extra/nnue_data_binpack_format.h
@@ -0,0 +1,7842 @@
+/*
+
+Copyright 2020 Tomasz Sobczyk
+
+Permission is hereby granted, free of charge,
+to any person obtaining a copy of this software
+and associated documentation files (the "Software"),
+to deal in the Software without restriction,
+including without limitation the rights to use, copy,
+modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall
+be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH
+THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+*/
+
+#pragma once
+
+#include <cstdio>
+#include <cassert>
+#include <string>
+#include <string_view>
+#include <vector>
+#include <memory>
+#include <fstream>
+#include <cstring>
+#include <iostream>
+#include <set>
+#include <cstdint>
+#include <type_traits>
+#include <array>
+#include <limits>
+#include <climits>
+#include <optional>
+
+#if (defined(_MSC_VER) || defined(__INTEL_COMPILER)) && !defined(__clang__)
+#include <intrin.h>
+#endif
+
+namespace chess
+{
+    #if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) // FORCEINLINE: per-compiler marker requesting that a function always be inlined
+
+    #define FORCEINLINE __attribute__((always_inline)) // NOTE(review): attribute only, no `inline` keyword; confirm the use sites supply it
+
+    #elif defined(_MSC_VER)
+
+    // NOTE: __forceinline interferes with the profiler, so it is left disabled here;
+    //       re-enable the commented-out definition only for builds that are not profiled.
+    //#define FORCEINLINE __forceinline
+    #define FORCEINLINE // expands to nothing on MSVC
+
+    #else
+
+    #define FORCEINLINE inline // unknown compiler: plain inline hint
+
+    #endif
+
+    #if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) // NOINLINE: per-compiler marker forbidding inlining of a function
+
+    #define NOINLINE __attribute__((noinline))
+
+    #elif defined(_MSC_VER)
+
+    #define NOINLINE __declspec(noinline)
+
+    #else
+
+    #define NOINLINE // unknown compiler: no way to express it, expands to nothing
+
+    #endif
+
+    namespace intrin
+    {
+        [[nodiscard]] constexpr int popcount_constexpr(std::uint64_t value)
+        {   // Returns the number of set bits in value; plain loop so it is usable in constant expressions.
+            int r = 0;
+            while (value) // one iteration per set bit
+            {
+                value &= value - 1; // clears the lowest set bit
+                ++r;
+            }
+            return r;
+        }
+
+        [[nodiscard]] constexpr int lsb_constexpr(std::uint64_t value)
+        {   // Returns the index (0 = least significant) of the lowest set bit, narrowing its position with bit masks.
+            int c = 0;
+            value &= ~value + 1; // leave only the lsb
+            if ((value & 0x00000000FFFFFFFFull) == 0) c += 32; // bit is not in the low 32 bits
+            if ((value & 0x0000FFFF0000FFFFull) == 0) c += 16; // bit is in the upper 16 bits of its 32-bit half
+            if ((value & 0x00FF00FF00FF00FFull) == 0) c += 8;  // bit is in the upper byte of its 16-bit word
+            if ((value & 0x0F0F0F0F0F0F0F0Full) == 0) c += 4;  // bit is in the upper nibble of its byte
+            if ((value & 0x3333333333333333ull) == 0) c += 2;  // bit is in the upper pair of its nibble
+            if ((value & 0x5555555555555555ull) == 0) c += 1;  // bit is at an odd position
+            return c; // NOTE: value == 0 passes every test above and therefore yields 63
+        }
+
+        [[nodiscard]] constexpr int msb_constexpr(std::uint64_t value)
+        {   // Returns the index of the highest set bit: start at 63 and shift the value up whenever its top part is empty.
+            int c = 63;
+            if ((value & 0xFFFFFFFF00000000ull) == 0) { c -= 32; value <<= 32; } // top 32 bits empty
+            if ((value & 0xFFFF000000000000ull) == 0) { c -= 16; value <<= 16; } // top 16 bits empty
+            if ((value & 0xFF00000000000000ull) == 0) { c -= 8; value <<= 8; }   // top 8 bits empty
+            if ((value & 0xF000000000000000ull) == 0) { c -= 4; value <<= 4; }   // top 4 bits empty
+            if ((value & 0xC000000000000000ull) == 0) { c -= 2; value <<= 2; }   // top 2 bits empty
+            if ((value & 0x8000000000000000ull) == 0) { c -= 1; }                // top bit empty
+            return c; // NOTE: value == 0 yields 0, the same result as value == 1
+        }
+    }
+
+    namespace intrin
+    {
+        [[nodiscard]] inline int popcount(std::uint64_t b)
+        {   // Runtime count of the set bits in b, delegated to a compiler intrinsic.
+    #if (defined(_MSC_VER) || defined(__INTEL_COMPILER)) && !defined(__clang__) // same condition that guards the <intrin.h> include above
+
+            return static_cast<int>(_mm_popcnt_u64(b));
+
+    #else // all other compilers: GCC-style builtin
+
+            return static_cast<int>(__builtin_popcountll(b));
+
+    #endif
+        }
+
+    #if defined(_MSC_VER) && !defined(__clang__) // MSVC: bit scans via _BitScanForward64 / _BitScanReverse64
+
+        [[nodiscard]] inline int lsb(std::uint64_t value)
+        {   // Index of the lowest set bit; value must be non-zero.
+            assert(value != 0); // precondition, checked only when asserts are enabled
+
+            unsigned long idx;
+            _BitScanForward64(&idx, value); // stores the bit index in idx; the intrinsic's return value is ignored
+            return static_cast<int>(idx);
+        }
+
+        [[nodiscard]] inline int msb(std::uint64_t value)
+        {   // Index of the highest set bit; value must be non-zero.
+            assert(value != 0); // precondition, checked only when asserts are enabled
+
+            unsigned long idx;
+            _BitScanReverse64(&idx, value); // stores the bit index in idx; the intrinsic's return value is ignored
+            return static_cast<int>(idx);
+        }
+
+    #else // all other compilers: GCC-style builtins
+
+        [[nodiscard]] inline int lsb(std::uint64_t value)
+        {   // Index of the lowest set bit; value must be non-zero.
+            assert(value != 0); // __builtin_ctzll has an undefined result for 0
+
+            return __builtin_ctzll(value); // count of trailing zero bits == index of the lowest set bit
+        }
+
+        [[nodiscard]] inline int msb(std::uint64_t value)
+        {   // Index of the highest set bit; value must be non-zero.
+            assert(value != 0); // __builtin_clzll has an undefined result for 0
+
+            return 63 ^ __builtin_clzll(value); // for a count in 0..63, 63 ^ count equals 63 - count
+        }
+
+    #endif
+    }
+
+    template <typename IntT>
+    [[nodiscard]] constexpr IntT floorLog2(IntT value)
+    {   // Returns the index of the highest set bit of value, i.e. floor(log2(value)) for value > 0.
+        return intrin::msb_constexpr(value); // value converts to the callee's std::uint64_t parameter; its int result converts back to IntT
+    }
+
+    template <typename IntT>
+    constexpr auto computeMasks()
+    {   // Builds a table in which entry i has exactly the low i bits set, for i = 0 .. number of bits in IntT.
+        static_assert(std::is_unsigned_v<IntT>);
+
+        constexpr std::size_t numBits = sizeof(IntT) * CHAR_BIT;
+        std::array<IntT, numBits + 1u> nbitmasks{}; // one extra slot for the all-bits-set mask
+
+        for (std::size_t i = 0; i < numBits; ++i)
+        {
+            nbitmasks[i] = (static_cast<IntT>(1u) << i) - 1u; // 2^i - 1: the low i bits
+        }
+        nbitmasks[numBits] = ~static_cast<IntT>(0u); // written directly: the loop stops before a shift by the full width
+
+        return nbitmasks;
+    }
+
+    template <typename IntT>
+    constexpr auto nbitmask = computeMasks<IntT>(); // nbitmask<IntT>[n] has the low n bits set (table built at compile time)
+
+    // Interprets the N lowest bits of `value` as an N-bit two's complement
+    // number and returns it widened to ToT (a signed type of the same size
+    // as FromT). Bits above the N-th are ignored.
+    template <std::size_t N, typename FromT, typename ToT = std::make_signed_t<FromT>>
+    inline ToT signExtend(FromT value)
+    {
+        static_assert(std::is_unsigned_v<FromT>);
+        static_assert(std::is_signed_v<ToT>);
+        static_assert(sizeof(FromT) == sizeof(ToT));
+
+        constexpr std::size_t width = sizeof(FromT) * CHAR_BIT;
+
+        static_assert(N > 0 && N <= width);
+
+        constexpr std::size_t padding = width - N;
+
+        // If shifting an all-ones signed value right keeps it all-ones, the
+        // platform's right shift replicates the sign bit and can be used directly.
+        constexpr ToT allOnes = ToT(~FromT(0));
+        constexpr bool shiftKeepsSign = (allOnes >> 1) == allOnes;
+
+        if constexpr (shiftKeepsSign)
+        {
+            // Move the N-bit field to the top, then shift it back down.
+            const ToT atTop = ToT(value << padding);
+            return atTop >> ToT(padding);
+        }
+        else
+        {
+            // Portable fallback: fill the upper bits by hand when the field's sign bit is set.
+            constexpr FromT lowMask = (~FromT(0)) >> padding;
+            const FromT payload = static_cast<FromT>(value & lowMask);
+            const bool isNegative = (payload & (FromT(1) << (N - 1))) != 0;
+            const FromT extended = isNegative ? static_cast<FromT>(payload | ~lowMask) : payload;
+            return static_cast<ToT>(extended);
+        }
+    }
+
+    namespace lookup
+    {
+        // Clears the `n` lowest set bits of `value` one by one, then returns
+        // intrin::lsb_constexpr of what remains.
+        // NOTE(review): the result when fewer than n + 1 bits are set depends on
+        // how lsb_constexpr treats 0, which is not visible here.
+        constexpr int nthSetBitIndexNaive(std::uint64_t value, int n)
+        {
+            int remaining = n;
+            while (remaining > 0)
+            {
+                value &= value - 1; // drop the lowest set bit
+                --remaining;
+            }
+            return intrin::lsb_constexpr(value);
+        }
+
+        // Compile-time table: nthSetBitIndex[b][n] == nthSetBitIndexNaive(b, n)
+        // for every byte value b and every n in [0, 8).
+        constexpr std::array<std::array<std::uint8_t, 8>, 256> nthSetBitIndex = []()
+        {
+            std::array<std::array<std::uint8_t, 8>, 256> table{};
+
+            for (int byteValue = 0; byteValue < 256; ++byteValue)
+            {
+                auto& row = table[byteValue];
+                for (int n = 0; n < 8; ++n)
+                {
+                    row[n] = nthSetBitIndexNaive(byteValue, n);
+                }
+            }
+
+            return table;
+        }();
+    }
+
+    // Finds the bit index of the n-th set bit of `v` (n counted from 0, starting
+    // at the least significant bit) without branching: the search window is
+    // halved from 64 to 8 bits and the last byte is resolved through
+    // lookup::nthSetBitIndex.
+    // NOTE(review): callers presumably guarantee that `v` has more than `n`
+    // set bits; otherwise the final table index can exceed 7.
+    inline int nthSetBitIndex(std::uint64_t v, std::uint64_t n)
+    {
+        std::uint64_t shift = 0;
+
+        // One halving step: if the low `width` bits hold the wanted bit, leave
+        // everything untouched; otherwise discard them, remember how far we
+        // shifted and reduce `n` by the number of set bits skipped.
+        const auto skipLowBitsIfNeeded = [&](std::uint64_t width, std::uint64_t lowMask)
+        {
+            const std::uint64_t lowCount = intrin::popcount(v & lowMask);
+            // All ones when the wanted bit is NOT in the low part, zero when it is.
+            const std::uint64_t skip = static_cast<std::uint64_t>(lowCount > n) - 1ull;
+            v >>= width & skip;
+            shift += width & skip;
+            n -= lowCount & skip;
+        };
+
+        skipLowBitsIfNeeded(32, 0xFFFFFFFFull);
+        skipLowBitsIfNeeded(16, 0xFFFFull);
+        skipLowBitsIfNeeded(8, 0xFFull);
+
+        return static_cast<int>(lookup::nthSetBitIndex[v & 0xFFull][n] + shift);
+    }
+
+    namespace util
+    {
+        // Number of bits occupied by `value`: 0 for 0, otherwise one more than
+        // the index reported by intrin::msb.
+        inline std::size_t usedBits(std::size_t value)
+        {
+            return value == 0 ? 0 : intrin::msb(value) + 1;
+        }
+    }
+
+    // Traits describing an enum-like type. Only declared here; a type takes
+    // part in the utilities below by providing an explicit specialization.
+    template <typename EnumT>
+    struct EnumTraits;
+
+    // Detection helper, preferred overload: viable only when
+    // EnumTraits<EnumT>::cardinality is a valid expression.
+    template <typename EnumT>
+    [[nodiscard]] constexpr auto hasEnumTraitsImpl(int) -> decltype(EnumTraits<EnumT>::cardinality, bool{})
+    {
+        return true;
+    }
+
+    // Detection helper, fallback overload: chosen when the one above is
+    // removed by SFINAE (an int argument prefers `int` over `...`).
+    template <typename EnumT>
+    [[nodiscard]] constexpr bool hasEnumTraitsImpl(...)
+    {
+        return false;
+    }
+
+    // True when EnumTraits<EnumT> is specialized and exposes `cardinality`.
+    //
+    // The check used to name a misspelled member (`cardinaliy`), so it was
+    // false for every type. The dispatch goes through a dummy int argument
+    // because a `()` overload and a `(...)` overload are equally good matches
+    // for a call without arguments, which would make the call ambiguous once
+    // the spelling is corrected.
+    template <typename EnumT>
+    [[nodiscard]] constexpr bool hasEnumTraits()
+    {
+        return hasEnumTraitsImpl<EnumT>(0);
+    }
+
+    // Reports EnumTraits<EnumT>::isNaturalIndex.
+    template <typename EnumT>
+    [[nodiscard]] constexpr bool isNaturalIndex() noexcept
+    {
+        using Traits = EnumTraits<EnumT>;
+        return Traits::isNaturalIndex;
+    }
+
+    // Reports EnumTraits<EnumT>::cardinality, the number of values of EnumT.
+    template <typename EnumT>
+    [[nodiscard]] constexpr int cardinality() noexcept
+    {
+        using Traits = EnumTraits<EnumT>;
+        return Traits::cardinality;
+    }
+
+    // Reference to EnumTraits<EnumT>::values, the array listing every value of EnumT.
+    template <typename EnumT>
+    [[nodiscard]] constexpr const std::array<EnumT, cardinality<EnumT>()>& values() noexcept
+    {
+        using Traits = EnumTraits<EnumT>;
+        return Traits::values;
+    }
+
+    // Converts an ordinal back to EnumT via EnumTraits<EnumT>::fromOrdinal.
+    // For naturally indexed enums the ordinal is asserted to lie in [0, cardinality).
+    template <typename EnumT>
+    [[nodiscard]] constexpr EnumT fromOrdinal(int id) noexcept
+    {
+        using Traits = EnumTraits<EnumT>;
+
+        assert(!EnumTraits<EnumT>::isNaturalIndex || (id >= 0 && id < EnumTraits<EnumT>::cardinality));
+
+        return Traits::fromOrdinal(id);
+    }
+
+    // Converts `v` to its ordinal via EnumTraits<EnumT>::ordinal.
+    template <typename EnumT>
+    [[nodiscard]] constexpr typename EnumTraits<EnumT>::IdType ordinal(EnumT v) noexcept
+    {
+        using Traits = EnumTraits<EnumT>;
+        return Traits::ordinal(v);
+    }
+
+    // Forwards to EnumTraits<EnumT>::toString(v, args...). Takes part in
+    // overload resolution only when hasEnumTraits<EnumT>() is true.
+    template <typename EnumT, typename... ArgsTs, typename SFINAE = std::enable_if_t<hasEnumTraits<EnumT>()>>
+    [[nodiscard]] constexpr decltype(auto) toString(EnumT v, ArgsTs&&... args)
+    {
+        using Traits = EnumTraits<EnumT>;
+        return Traits::toString(v, std::forward<ArgsTs>(args)...);
+    }
+
+    // Forwards to EnumTraits<EnumT>::toString(v).
+    template <typename EnumT>
+    [[nodiscard]] constexpr decltype(auto) toString(EnumT v)
+    {
+        using Traits = EnumTraits<EnumT>;
+        return Traits::toString(v);
+    }
+
+    // Format-first form: forwards to EnumTraits<EnumT>::toString(f, v). Takes
+    // part in overload resolution only when hasEnumTraits<FormatT>() is false.
+    template <typename EnumT, typename FormatT, typename SFINAE = std::enable_if_t<!hasEnumTraits<FormatT>()>>
+    [[nodiscard]] constexpr decltype(auto) toString(FormatT&& f, EnumT v)
+    {
+        using Traits = EnumTraits<EnumT>;
+        return Traits::toString(std::forward<FormatT>(f), v);
+    }
+
+    // Forwards to EnumTraits<EnumT>::toChar(v).
+    template <typename EnumT>
+    [[nodiscard]] constexpr decltype(auto) toChar(EnumT v)
+    {
+        using Traits = EnumTraits<EnumT>;
+        return Traits::toChar(v);
+    }
+
+    // Format-first form: forwards to EnumTraits<EnumT>::toChar(f, v).
+    template <typename EnumT, typename FormatT>
+    [[nodiscard]] constexpr decltype(auto) toChar(FormatT&& f, EnumT v)
+    {
+        using Traits = EnumTraits<EnumT>;
+        return Traits::toChar(std::forward<FormatT>(f), v);
+    }
+
+    // Forwards all arguments to EnumTraits<EnumT>::fromString.
+    // EnumT cannot be deduced and has to be given explicitly.
+    template <typename EnumT, typename... ArgsTs>
+    [[nodiscard]] constexpr decltype(auto) fromString(ArgsTs&& ... args)
+    {
+        using Traits = EnumTraits<EnumT>;
+        return Traits::fromString(std::forward<ArgsTs>(args)...);
+    }
+
+    // Forwards all arguments to EnumTraits<EnumT>::fromChar.
+    // EnumT cannot be deduced and has to be given explicitly.
+    template <typename EnumT, typename... ArgsTs>
+    [[nodiscard]] constexpr decltype(auto) fromChar(ArgsTs&& ... args)
+    {
+        using Traits = EnumTraits<EnumT>;
+        return Traits::fromChar(std::forward<ArgsTs>(args)...);
+    }
+
+    // Traits that let `bool` be used as a two-valued enum (false = 0, true = 1).
+    template <>
+    struct EnumTraits<bool>
+    {
+        using EnumType = bool;
+        using IdType = int;
+
+        static constexpr bool isNaturalIndex = true;
+        static constexpr int cardinality = 2;
+
+        static constexpr std::array<EnumType, cardinality> values{ false, true };
+
+        // false -> 0, true -> 1.
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return c ? 1 : 0;
+        }
+
+        // 0 -> false, any other id -> true.
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            return id != 0;
+        }
+    };
+
+    // Fixed-size array of ValueT indexed by values of EnumT (through ordinal())
+    // instead of by integers. Offers the usual container typedefs, element
+    // access and iterators over a plain C array.
+    template <typename EnumT, typename ValueT, std::size_t SizeV = cardinality<EnumT>()>
+    struct EnumArray
+    {
+        static_assert(isNaturalIndex<EnumT>(), "Enum must start with 0 and end with cardinality-1.");
+
+        using KeyType = EnumT;
+        using ValueType = ValueT;
+
+        using value_type      = ValueT;
+        using size_type       = std::size_t;
+        using difference_type = std::ptrdiff_t;
+        using reference       = ValueT &;
+        using const_reference = const ValueT &;
+        using pointer         = ValueT *;
+        using const_pointer   = const ValueT*;
+
+        using iterator       = pointer;
+        using const_iterator = const_pointer;
+
+        using reverse_iterator       = std::reverse_iterator<iterator>;
+        using const_reverse_iterator = std::reverse_iterator<const_iterator>;
+
+        // Assigns a copy of `init` to every element.
+        constexpr void fill(const ValueType& init)
+        {
+            for (size_type i = 0; i < SizeV; ++i)
+            {
+                elements[i] = init;
+            }
+        }
+
+        // Element stored for key `dir`; the key's ordinal is asserted to be below SizeV.
+        [[nodiscard]] constexpr ValueType& operator[](const KeyType& dir)
+        {
+            assert(static_cast<int>(ordinal(dir)) < static_cast<int>(SizeV));
+
+            const auto index = ordinal(dir);
+            return elements[index];
+        }
+
+        // Read-only element stored for key `dir`; same check as above.
+        [[nodiscard]] constexpr const ValueType& operator[](const KeyType& dir) const
+        {
+            assert(static_cast<int>(ordinal(dir)) < static_cast<int>(SizeV));
+
+            const auto index = ordinal(dir);
+            return elements[index];
+        }
+
+        // First element.
+        [[nodiscard]] constexpr ValueType& front()
+        {
+            return *elements;
+        }
+
+        [[nodiscard]] constexpr const ValueType& front() const
+        {
+            return *elements;
+        }
+
+        // Last element.
+        [[nodiscard]] constexpr ValueType& back()
+        {
+            return *(elements + (SizeV - 1));
+        }
+
+        [[nodiscard]] constexpr const ValueType& back() const
+        {
+            return *(elements + (SizeV - 1));
+        }
+
+        // Pointer to the underlying storage.
+        [[nodiscard]] constexpr pointer data()
+        {
+            return &elements[0];
+        }
+
+        [[nodiscard]] constexpr const_pointer data() const
+        {
+            return &elements[0];
+        }
+
+        // Forward iteration: [begin(), end()) covers all SizeV elements.
+        [[nodiscard]] constexpr iterator begin() noexcept
+        {
+            return &elements[0];
+        }
+
+        [[nodiscard]] constexpr const_iterator begin() const noexcept
+        {
+            return &elements[0];
+        }
+
+        [[nodiscard]] constexpr iterator end() noexcept
+        {
+            return begin() + SizeV;
+        }
+
+        [[nodiscard]] constexpr const_iterator end() const noexcept
+        {
+            return begin() + SizeV;
+        }
+
+        // Reverse iteration: starts at the last element and ends before the first.
+        [[nodiscard]] constexpr reverse_iterator rbegin() noexcept
+        {
+            return reverse_iterator{ end() };
+        }
+
+        [[nodiscard]] constexpr const_reverse_iterator rbegin() const noexcept
+        {
+            return const_reverse_iterator{ end() };
+        }
+
+        [[nodiscard]] constexpr reverse_iterator rend() noexcept
+        {
+            return reverse_iterator{ begin() };
+        }
+
+        [[nodiscard]] constexpr const_reverse_iterator rend() const noexcept
+        {
+            return const_reverse_iterator{ begin() };
+        }
+
+        // Explicitly const variants of the iterator accessors.
+        [[nodiscard]] constexpr const_iterator cbegin() const noexcept
+        {
+            return &elements[0];
+        }
+
+        [[nodiscard]] constexpr const_iterator cend() const noexcept
+        {
+            return cbegin() + SizeV;
+        }
+
+        [[nodiscard]] constexpr const_reverse_iterator crbegin() const noexcept
+        {
+            return const_reverse_iterator{ cend() };
+        }
+
+        [[nodiscard]] constexpr const_reverse_iterator crend() const noexcept
+        {
+            return const_reverse_iterator{ cbegin() };
+        }
+
+        // Number of elements; always SizeV.
+        [[nodiscard]] constexpr size_type size() const noexcept
+        {
+            return SizeV;
+        }
+
+        // Public storage, element i belongs to the key with ordinal i.
+        ValueT elements[SizeV];
+    };
+
+    // Two-dimensional variant: an EnumArray keyed by Enum1T whose elements are
+    // EnumArrays keyed by Enum2T.
+    template <typename Enum1T, typename Enum2T, typename ValueT, std::size_t Size1V = cardinality<Enum1T>(), std::size_t Size2V = cardinality<Enum2T>()>
+    using EnumArray2 = EnumArray<Enum1T, EnumArray<Enum2T, ValueT, Size2V>, Size1V>;
+
+    // Side to move / piece color. White has ordinal 0, Black has ordinal 1.
+    enum struct Color : std::uint8_t
+    {
+        White,
+        Black
+    };
+
+    // Enum traits for Color, including conversion to and from 'w' / 'b'.
+    template <>
+    struct EnumTraits<Color>
+    {
+        using EnumType = Color;
+        using IdType = int;
+
+        static constexpr bool isNaturalIndex = true;
+        static constexpr int cardinality = 2;
+
+        static constexpr std::array<EnumType, cardinality> values{
+            Color::White,
+            Color::Black
+        };
+
+        // Underlying value of `c` as an int.
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return static_cast<IdType>(c);
+        }
+
+        // Color with ordinal `id`; `id` is asserted to be 0 or 1.
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            assert(id >= 0 && id < cardinality);
+
+            return EnumType(id);
+        }
+
+        // One-character view, "w" or "b", into a string literal.
+        [[nodiscard]] static constexpr std::string_view toString(EnumType c) noexcept
+        {
+            const int index = ordinal(c);
+            return std::string_view(&"wb"[index], 1);
+        }
+
+        // 'w' for White, 'b' for Black.
+        [[nodiscard]] static constexpr char toChar(EnumType c) noexcept
+        {
+            const int index = ordinal(c);
+            return "wb"[index];
+        }
+
+        // Parses 'w' / 'b'; any other character gives an empty optional.
+        [[nodiscard]] static constexpr std::optional<Color> fromChar(char c) noexcept
+        {
+            switch (c)
+            {
+            case 'w':
+                return Color::White;
+            case 'b':
+                return Color::Black;
+            default:
+                return std::nullopt;
+            }
+        }
+
+        // Parses a string that must consist of exactly one character.
+        [[nodiscard]] static constexpr std::optional<Color> fromString(std::string_view sv) noexcept
+        {
+            if (sv.size() == 1)
+            {
+                return fromChar(sv.front());
+            }
+
+            return std::nullopt;
+        }
+    };
+
+    // Opposite color: flips the lowest bit of the ordinal, so White <-> Black.
+    constexpr Color operator!(Color c)
+    {
+        const auto flipped = ordinal(c) ^ 1;
+        return fromOrdinal<Color>(flipped);
+    }
+
+    // Kind of a chess piece, without color. None (ordinal 6) marks "no piece".
+    enum struct PieceType : std::uint8_t
+    {
+        Pawn,
+        Knight,
+        Bishop,
+        Rook,
+        Queen,
+        King,
+
+        None
+    };
+
+    // Enum traits for PieceType. The text conversions use the symbol string
+    // "PpNnBbRrQqKk ": two characters per piece type, the first for
+    // Color::White (ordinal 0), the second for Color::Black, then a space
+    // for None.
+    template <>
+    struct EnumTraits<PieceType>
+    {
+        using EnumType = PieceType;
+        using IdType = int;
+
+        static constexpr bool isNaturalIndex = true;
+        static constexpr int cardinality = 7;
+
+        static constexpr std::array<EnumType, cardinality> values{
+            PieceType::Pawn,
+            PieceType::Knight,
+            PieceType::Bishop,
+            PieceType::Rook,
+            PieceType::Queen,
+            PieceType::King,
+            PieceType::None
+        };
+
+        // Underlying value of `c` as an int.
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return static_cast<IdType>(c);
+        }
+
+        // Piece type with ordinal `id`; `id` is asserted to be in [0, cardinality).
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            assert(id >= 0 && id < cardinality);
+
+            return EnumType(id);
+        }
+
+        // One-character view of the symbol for piece type `p` of color `c`.
+        [[nodiscard]] static constexpr std::string_view toString(EnumType p, Color c) noexcept
+        {
+            const int symbolIndex = chess::ordinal(p) * 2 + chess::ordinal(c);
+            return std::string_view(&"PpNnBbRrQqKk "[symbolIndex], 1);
+        }
+
+        // Symbol character for piece type `p` of color `c`.
+        [[nodiscard]] static constexpr char toChar(EnumType p, Color c) noexcept
+        {
+            const int symbolIndex = chess::ordinal(p) * 2 + chess::ordinal(c);
+            return "PpNnBbRrQqKk "[symbolIndex];
+        }
+
+        // Piece type for a symbol of either case (the color is dropped);
+        // empty optional when `c` is not in the symbol string.
+        [[nodiscard]] static constexpr std::optional<PieceType> fromChar(char c) noexcept
+        {
+            const auto position = std::string_view("PpNnBbRrQqKk ").find(c);
+            if (position != std::string::npos)
+            {
+                // Two symbols per piece type, hence the division.
+                return static_cast<PieceType>(position / 2);
+            }
+
+            return std::nullopt;
+        }
+
+        // Parses a string that must consist of exactly one character.
+        [[nodiscard]] static constexpr std::optional<PieceType> fromString(std::string_view sv) noexcept
+        {
+            if (sv.size() == 1)
+            {
+                return fromChar(sv.front());
+            }
+
+            return std::nullopt;
+        }
+    };
+
+    // A colored piece packed into one byte: id = (piece type ordinal << 1) | color ordinal.
+    struct Piece
+    {
+        // Rebuilds a piece from a packed id; the id is stored without validation.
+        [[nodiscard]] static constexpr Piece fromId(int id)
+        {
+            return Piece(id);
+        }
+
+        // The "no piece" value: PieceType::None combined with Color::White.
+        [[nodiscard]] static constexpr Piece none()
+        {
+            return Piece();
+        }
+
+        // Default-constructs the "no piece" value.
+        constexpr Piece() noexcept :
+            Piece(PieceType::None, Color::White)
+        {
+
+        }
+
+        // Packs `type` and `color`. PieceType::None may only be paired with
+        // Color::White (asserted).
+        constexpr Piece(PieceType type, Color color) noexcept :
+            m_id(static_cast<std::uint8_t>((ordinal(type) << 1) | ordinal(color)))
+        {
+            assert(type != PieceType::None || color == Color::White);
+        }
+
+        constexpr Piece& operator=(const Piece& other) = default;
+
+        // Pieces are equal when their packed ids are equal.
+        [[nodiscard]] constexpr friend bool operator==(Piece lhs, Piece rhs) noexcept
+        {
+            return lhs.m_id == rhs.m_id;
+        }
+
+        [[nodiscard]] constexpr friend bool operator!=(Piece lhs, Piece rhs) noexcept
+        {
+            return lhs.m_id != rhs.m_id;
+        }
+
+        // Piece type, taken from all bits above the lowest one.
+        [[nodiscard]] constexpr PieceType type() const
+        {
+            const int typeOrdinal = m_id >> 1;
+            return fromOrdinal<PieceType>(typeOrdinal);
+        }
+
+        // Color, taken from the lowest bit.
+        [[nodiscard]] constexpr Color color() const
+        {
+            const int colorOrdinal = m_id & 1;
+            return fromOrdinal<Color>(colorOrdinal);
+        }
+
+        // Both components at once as (type, color).
+        [[nodiscard]] constexpr std::pair<PieceType, Color> parts() const
+        {
+            return { type(), color() };
+        }
+
+        // The packed id as an int.
+        [[nodiscard]] constexpr explicit operator int() const
+        {
+            return static_cast<int>(m_id);
+        }
+
+    private:
+        // Used by fromId(); stores `id` as is.
+        constexpr Piece(int id) :
+            m_id(id)
+        {
+        }
+
+        std::uint8_t m_id; // lowest bit is a color, 7 highest bits are a piece type
+    };
+
+    // Combines a piece type and a color into a Piece, e.g. PieceType::Pawn | Color::White.
+    [[nodiscard]] constexpr Piece operator|(PieceType type, Color color) noexcept
+    {
+        return { type, color };
+    }
+
+    // Same as above with the operands swapped.
+    [[nodiscard]] constexpr Piece operator|(Color color, PieceType type) noexcept
+    {
+        return { type, color };
+    }
+
+    // Named constants for the six white pieces.
+    constexpr Piece whitePawn(PieceType::Pawn, Color::White);
+    constexpr Piece whiteKnight(PieceType::Knight, Color::White);
+    constexpr Piece whiteBishop(PieceType::Bishop, Color::White);
+    constexpr Piece whiteRook(PieceType::Rook, Color::White);
+    constexpr Piece whiteQueen(PieceType::Queen, Color::White);
+    constexpr Piece whiteKing(PieceType::King, Color::White);
+
+    // Named constants for the six black pieces.
+    constexpr Piece blackPawn(PieceType::Pawn, Color::Black);
+    constexpr Piece blackKnight(PieceType::Knight, Color::Black);
+    constexpr Piece blackBishop(PieceType::Bishop, Color::Black);
+    constexpr Piece blackRook(PieceType::Rook, Color::Black);
+    constexpr Piece blackQueen(PieceType::Queen, Color::Black);
+    constexpr Piece blackKing(PieceType::King, Color::Black);
+
+    // Sanity check of the packing: the "no piece" value decodes back to PieceType::None.
+    static_assert(Piece::none().type() == PieceType::None);
+
+    // Enum traits for Piece. The ordinal is the packed piece id, which is also
+    // the position of the piece's symbol in "PpNnBbRrQqKk " (white symbol
+    // first for each piece type, trailing space for "no piece").
+    template <>
+    struct EnumTraits<Piece>
+    {
+        using EnumType = Piece;
+        using IdType = int;
+
+        static constexpr bool isNaturalIndex = true;
+        static constexpr int cardinality = 13;
+
+        static constexpr std::array<EnumType, cardinality> values{
+            whitePawn,
+            blackPawn,
+            whiteKnight,
+            blackKnight,
+            whiteBishop,
+            blackBishop,
+            whiteRook,
+            blackRook,
+            whiteQueen,
+            blackQueen,
+            whiteKing,
+            blackKing,
+            Piece::none()
+        };
+
+        // Packed id of the piece, obtained through Piece's explicit int conversion.
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return static_cast<IdType>(c);
+        }
+
+        // Piece with packed id `id`; `id` is asserted to be in [0, cardinality).
+        [[nodiscard]] static constexpr EnumType fromOrdinal(int id) noexcept
+        {
+            assert(id >= 0 && id < cardinality);
+
+            return Piece::fromId(id);
+        }
+
+        // One-character view of the piece's symbol.
+        [[nodiscard]] static constexpr std::string_view toString(EnumType p) noexcept
+        {
+            const int symbolIndex = ordinal(p);
+            return std::string_view(&"PpNnBbRrQqKk "[symbolIndex], 1);
+        }
+
+        // The piece's symbol character.
+        [[nodiscard]] static constexpr char toChar(EnumType p) noexcept
+        {
+            const int symbolIndex = ordinal(p);
+            return "PpNnBbRrQqKk "[symbolIndex];
+        }
+
+        // Piece for a symbol character; empty optional when `c` is not in the
+        // symbol string.
+        [[nodiscard]] static constexpr std::optional<Piece> fromChar(char c) noexcept
+        {
+            const auto position = std::string_view("PpNnBbRrQqKk ").find(c);
+            if (position != std::string::npos)
+            {
+                return Piece::fromId(static_cast<int>(position));
+            }
+
+            return std::nullopt;
+        }
+
+        // Parses a string that must consist of exactly one character.
+        [[nodiscard]] static constexpr std::optional<Piece> fromString(std::string_view sv) noexcept
+        {
+            if (sv.size() == 1)
+            {
+                return fromChar(sv.front());
+            }
+
+            return std::nullopt;
+        }
+    };
+
+    // Strongly typed one-dimensional board coordinate stored in a signed byte.
+    // TagT only serves to make different instantiations distinct,
+    // incompatible types.
+    template <typename TagT>
+    struct Coord
+    {
+        // Coordinate 0.
+        constexpr Coord() noexcept :
+            m_i(0)
+        {
+        }
+
+        // Coordinate with value `i` (stored in an int8).
+        constexpr explicit Coord(int i) noexcept :
+            m_i(i)
+        {
+        }
+
+        // The stored value as an int.
+        [[nodiscard]] constexpr explicit operator int() const
+        {
+            return static_cast<int>(m_i);
+        }
+
+        // Pre-increment / pre-decrement.
+        constexpr friend Coord& operator++(Coord& c)
+        {
+            c.m_i += 1;
+            return c;
+        }
+
+        constexpr friend Coord& operator--(Coord& c)
+        {
+            c.m_i -= 1;
+            return c;
+        }
+
+        // In-place displacement by `d` steps.
+        constexpr friend Coord& operator+=(Coord& c, int d)
+        {
+            c.m_i += d;
+            return c;
+        }
+
+        constexpr friend Coord& operator-=(Coord& c, int d)
+        {
+            c.m_i -= d;
+            return c;
+        }
+
+        // Displaced copies; the operand is left untouched.
+        constexpr friend Coord operator+(const Coord& c, int d)
+        {
+            Coord moved = c;
+            moved.m_i += d;
+            return moved;
+        }
+
+        constexpr friend Coord operator-(const Coord& c, int d)
+        {
+            Coord moved = c;
+            moved.m_i -= d;
+            return moved;
+        }
+
+        // Signed distance from c2 to c1.
+        constexpr friend int operator-(const Coord& c1, const Coord& c2)
+        {
+            return static_cast<int>(c1) - static_cast<int>(c2);
+        }
+
+        // Comparisons order coordinates by their stored value.
+        [[nodiscard]] constexpr friend bool operator==(const Coord& c1, const Coord& c2) noexcept
+        {
+            return c1.m_i == c2.m_i;
+        }
+
+        [[nodiscard]] constexpr friend bool operator!=(const Coord& c1, const Coord& c2) noexcept
+        {
+            return !(c1.m_i == c2.m_i);
+        }
+
+        [[nodiscard]] constexpr friend bool operator<(const Coord& c1, const Coord& c2) noexcept
+        {
+            return c1.m_i < c2.m_i;
+        }
+
+        [[nodiscard]] constexpr friend bool operator<=(const Coord& c1, const Coord& c2) noexcept
+        {
+            return !(c2.m_i < c1.m_i);
+        }
+
+        [[nodiscard]] constexpr friend bool operator>(const Coord& c1, const Coord& c2) noexcept
+        {
+            return c2.m_i < c1.m_i;
+        }
+
+        [[nodiscard]] constexpr friend bool operator>=(const Coord& c1, const Coord& c2) noexcept
+        {
+            return !(c1.m_i < c2.m_i);
+        }
+
+    private:
+        std::int8_t m_i;
+    };
+
+    // Tag types; they are only declared because they are used purely to tell
+    // the two Coord instantiations apart.
+    struct FileTag;
+    struct RankTag;
+    using File = Coord<FileTag>; // board column
+    using Rank = Coord<RankTag>; // board row
+
+    // Files a..h map to the values 0..7.
+    constexpr File fileA(0);
+    constexpr File fileB(1);
+    constexpr File fileC(2);
+    constexpr File fileD(3);
+    constexpr File fileE(4);
+    constexpr File fileF(5);
+    constexpr File fileG(6);
+    constexpr File fileH(7);
+
+    // Ranks 1..8 map to the values 0..7.
+    constexpr Rank rank1(0);
+    constexpr Rank rank2(1);
+    constexpr Rank rank3(2);
+    constexpr Rank rank4(3);
+    constexpr Rank rank5(4);
+    constexpr Rank rank6(5);
+    constexpr Rank rank7(6);
+    constexpr Rank rank8(7);
+
+    // Enum traits for File, including conversion to and from the letters 'a'..'h'.
+    template <>
+    struct EnumTraits<File>
+    {
+        using EnumType = File;
+        using IdType = int;
+
+        static constexpr bool isNaturalIndex = true;
+        static constexpr int cardinality = 8;
+
+        // Stored value of the file as an int.
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return static_cast<IdType>(c);
+        }
+
+        // File with ordinal `id`; `id` is asserted to be in [0, cardinality).
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            assert(id >= 0 && id < cardinality);
+
+            return EnumType(id);
+        }
+
+        // One-character view, "a".."h"; the ordinal is asserted to be in [0, 8).
+        [[nodiscard]] static constexpr std::string_view toString(EnumType c) noexcept
+        {
+            assert(ordinal(c) >= 0 && ordinal(c) < 8);
+
+            const int index = ordinal(c);
+            return std::string_view(&"abcdefgh"[index], 1);
+        }
+
+        // Parses 'a'..'h'; any other character gives an empty optional.
+        [[nodiscard]] static constexpr std::optional<File> fromChar(char c) noexcept
+        {
+            const bool isFileLetter = c >= 'a' && c <= 'h';
+            if (!isFileLetter)
+            {
+                return std::nullopt;
+            }
+
+            return File(c - 'a');
+        }
+
+        // Parses a string that must consist of exactly one character.
+        [[nodiscard]] static constexpr std::optional<File> fromString(std::string_view sv) noexcept
+        {
+            if (sv.size() == 1)
+            {
+                return fromChar(sv.front());
+            }
+
+            return std::nullopt;
+        }
+    };
+
+    // Enum traits for Rank, including conversion to and from the digits '1'..'8'.
+    template <>
+    struct EnumTraits<Rank>
+    {
+        using EnumType = Rank;
+        using IdType = int;
+
+        static constexpr bool isNaturalIndex = true;
+        static constexpr int cardinality = 8;
+
+        // Stored value of the rank as an int.
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return static_cast<IdType>(c);
+        }
+
+        // Rank with ordinal `id`; `id` is asserted to be in [0, cardinality).
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            assert(id >= 0 && id < cardinality);
+
+            return EnumType(id);
+        }
+
+        // One-character view, "1".."8"; the ordinal is asserted to be in [0, 8).
+        [[nodiscard]] static constexpr std::string_view toString(EnumType c) noexcept
+        {
+            assert(ordinal(c) >= 0 && ordinal(c) < 8);
+
+            const int index = ordinal(c);
+            return std::string_view(&"12345678"[index], 1);
+        }
+
+        // Parses '1'..'8'; any other character gives an empty optional.
+        [[nodiscard]] static constexpr std::optional<Rank> fromChar(char c) noexcept
+        {
+            const bool isRankDigit = c >= '1' && c <= '8';
+            if (!isRankDigit)
+            {
+                return std::nullopt;
+            }
+
+            return Rank(c - '1');
+        }
+
+        // Parses a string that must consist of exactly one character.
+        [[nodiscard]] static constexpr std::optional<Rank> fromString(std::string_view sv) noexcept
+        {
+            if (sv.size() == 1)
+            {
+                return fromChar(sv.front());
+            }
+
+            return std::nullopt;
+        }
+    };
+
+    // Displacement between squares expressed as a single signed number:
+    // files + ranks * cardinality<File>().
+    // Positive files point east, positive ranks point north.
+    struct FlatSquareOffset
+    {
+        std::int8_t value;
+
+        // Zero displacement.
+        constexpr FlatSquareOffset() noexcept :
+            FlatSquareOffset(0)
+        {
+        }
+
+        // Displacement of `files` files and `ranks` ranks; the combined value
+        // is asserted to fit into an int8.
+        constexpr FlatSquareOffset(int files, int ranks) noexcept :
+            FlatSquareOffset(files + ranks * cardinality<File>())
+        {
+            assert(files + ranks * cardinality<File>() >= std::numeric_limits<std::int8_t>::min());
+            assert(files + ranks * cardinality<File>() <= std::numeric_limits<std::int8_t>::max());
+        }
+
+        // Displacement in the opposite direction.
+        constexpr FlatSquareOffset operator-() const noexcept
+        {
+            const int negated = -value;
+            return FlatSquareOffset(negated);
+        }
+
+    private:
+        // Stores an already flattened displacement.
+        constexpr FlatSquareOffset(int v) noexcept :
+            value(v)
+        {
+        }
+    };
+
+    // Two-dimensional displacement on the board, one signed byte per axis.
+    struct Offset
+    {
+        std::int8_t files;
+        std::int8_t ranks;
+
+        // Zero displacement.
+        constexpr Offset() :
+            Offset(0, 0)
+        {
+        }
+
+        // Displacement of `files_` files and `ranks_` ranks.
+        constexpr Offset(int files_, int ranks_) :
+            files(files_),
+            ranks(ranks_)
+        {
+        }
+
+        // The same displacement as a single flattened value.
+        [[nodiscard]] constexpr FlatSquareOffset flat() const
+        {
+            return FlatSquareOffset(files, ranks);
+        }
+
+        // Displacement in the opposite direction.
+        [[nodiscard]] constexpr Offset operator-() const
+        {
+            return Offset(-files, -ranks);
+        }
+    };
+
+    // A (file, rank) pair. Unlike Square it may hold off-board coordinates;
+    // isOk() tells whether both components are on the board.
+    struct SquareCoords
+    {
+        File file;
+        Rank rank;
+
+        // Both components value-initialized (file 0, rank 0).
+        constexpr SquareCoords() noexcept :
+            file(),
+            rank()
+        {
+        }
+
+        constexpr SquareCoords(File f, Rank r) noexcept :
+            file(f),
+            rank(r)
+        {
+        }
+
+        // Moves `c` by `offset` in place, component by component.
+        constexpr friend SquareCoords& operator+=(SquareCoords& c, Offset offset)
+        {
+            c.file += offset.files;
+            c.rank += offset.ranks;
+            return c;
+        }
+
+        // Copy of `c` moved by `offset`.
+        [[nodiscard]] constexpr friend SquareCoords operator+(const SquareCoords& c, Offset offset)
+        {
+            SquareCoords moved = c;
+            moved += offset;
+            return moved;
+        }
+
+        // True when the file is within fileA..fileH and the rank within rank1..rank8.
+        [[nodiscard]] constexpr bool isOk() const
+        {
+            return !(file < fileA || file > fileH || rank < rank1 || rank > rank8);
+        }
+    };
+
+    // A board square stored as one signed byte: id = file + rank * cardinality<File>().
+    // The id one past the last real square (m_noneId) stands for "no square".
+    struct Square
+    {
+    private:
+        static constexpr std::int8_t m_noneId = cardinality<Rank>() * cardinality<File>();
+
+        // Layout of a valid id: the three low bits hold the file, the next three the rank.
+        static constexpr std::uint8_t fileMask = 0b111;
+        static constexpr std::uint8_t rankMask = 0b111000;
+        static constexpr std::uint8_t rankShift = 3;
+
+    public:
+        // The "no square" value.
+        [[nodiscard]] static constexpr Square none()
+        {
+            return Square(m_noneId);
+        }
+
+        // Square with id 0.
+        constexpr Square() noexcept :
+            m_id(0)
+        {
+        }
+
+        // Square with the given id; it must be a valid square or the none id (asserted).
+        constexpr explicit Square(int idx) noexcept :
+            m_id(idx)
+        {
+            assert(isOk() || m_id == m_noneId);
+        }
+
+        // Square at the given file and rank; the result must be on the board (asserted).
+        constexpr Square(File file, Rank rank) noexcept :
+            m_id(ordinal(file) + ordinal(rank) * cardinality<File>())
+        {
+            assert(isOk());
+        }
+
+        constexpr explicit Square(SquareCoords coords) noexcept :
+            Square(coords.file, coords.rank)
+        {
+        }
+
+        // Comparisons order squares by their id.
+        [[nodiscard]] constexpr friend bool operator<(Square lhs, Square rhs) noexcept
+        {
+            return lhs.m_id < rhs.m_id;
+        }
+
+        [[nodiscard]] constexpr friend bool operator>(Square lhs, Square rhs) noexcept
+        {
+            return rhs.m_id < lhs.m_id;
+        }
+
+        [[nodiscard]] constexpr friend bool operator<=(Square lhs, Square rhs) noexcept
+        {
+            return !(rhs.m_id < lhs.m_id);
+        }
+
+        [[nodiscard]] constexpr friend bool operator>=(Square lhs, Square rhs) noexcept
+        {
+            return !(lhs.m_id < rhs.m_id);
+        }
+
+        [[nodiscard]] constexpr friend bool operator==(Square lhs, Square rhs) noexcept
+        {
+            return lhs.m_id == rhs.m_id;
+        }
+
+        [[nodiscard]] constexpr friend bool operator!=(Square lhs, Square rhs) noexcept
+        {
+            return lhs.m_id != rhs.m_id;
+        }
+
+        // Step to the next / previous id; no range check is performed here.
+        constexpr friend Square& operator++(Square& sq)
+        {
+            sq.m_id += 1;
+            return sq;
+        }
+
+        constexpr friend Square& operator--(Square& sq)
+        {
+            sq.m_id -= 1;
+            return sq;
+        }
+
+        // Copy of `sq` moved by a flattened offset.
+        [[nodiscard]] constexpr friend Square operator+(Square sq, FlatSquareOffset offset)
+        {
+            sq += offset;
+            return sq;
+        }
+
+        // Moves `sq` by a flattened offset; the resulting id must be a valid square (asserted).
+        constexpr friend Square& operator+=(Square& sq, FlatSquareOffset offset)
+        {
+            assert(sq.m_id + offset.value >= 0 && sq.m_id + offset.value < Square::m_noneId);
+            sq.m_id += offset.value;
+            return sq;
+        }
+
+        // Copy of `sq` moved by a file/rank offset; asserts that neither the
+        // file nor the rank leaves the board.
+        [[nodiscard]] constexpr friend Square operator+(Square sq, Offset offset)
+        {
+            assert(sq.file() + offset.files >= fileA);
+            assert(sq.file() + offset.files <= fileH);
+            assert(sq.rank() + offset.ranks >= rank1);
+            assert(sq.rank() + offset.ranks <= rank8);
+            return sq + offset.flat();
+        }
+
+        // Moves `sq` by a file/rank offset through its flattened form.
+        constexpr friend Square& operator+=(Square& sq, Offset offset)
+        {
+            return sq += offset.flat();
+        }
+
+        // The id as an int.
+        [[nodiscard]] constexpr explicit operator int() const
+        {
+            return m_id;
+        }
+
+        // File of a valid square (low three bits of the id).
+        [[nodiscard]] constexpr File file() const
+        {
+            assert(isOk());
+            const unsigned id = static_cast<unsigned>(m_id);
+            return File(id & fileMask);
+        }
+
+        // Rank of a valid square (id shifted right by three bits).
+        [[nodiscard]] constexpr Rank rank() const
+        {
+            assert(isOk());
+            const unsigned id = static_cast<unsigned>(m_id);
+            return Rank(id >> rankShift);
+        }
+
+        [[nodiscard]] constexpr SquareCoords coords() const
+        {
+            return SquareCoords(file(), rank());
+        }
+
+        // Color of the square itself: Black when file + rank ordinals are
+        // even (e.g. a1), White when odd.
+        [[nodiscard]] constexpr Color color() const
+        {
+            assert(isOk());
+            const int parity = (ordinal(rank()) + ordinal(file())) & 1;
+            return !fromOrdinal<Color>(parity);
+        }
+
+        // Mirrors the rank in place (rank1 <-> rank8, ...) by toggling the rank bits.
+        constexpr void flipVertically()
+        {
+            m_id = static_cast<std::int8_t>(m_id ^ rankMask);
+        }
+
+        // Mirrors the file in place (fileA <-> fileH, ...) by toggling the file bits.
+        constexpr void flipHorizontally()
+        {
+            m_id = static_cast<std::int8_t>(m_id ^ fileMask);
+        }
+
+        // Non-mutating variants of the two flips above.
+        constexpr Square flippedVertically() const
+        {
+            const int flippedId = m_id ^ rankMask;
+            return Square(flippedId);
+        }
+
+        constexpr Square flippedHorizontally() const
+        {
+            const int flippedId = m_id ^ fileMask;
+            return Square(flippedId);
+        }
+
+        // True for ids of real squares, false for the none id and anything out of range.
+        [[nodiscard]] constexpr bool isOk() const
+        {
+            return !(m_id < 0 || m_id >= m_noneId);
+        }
+
+    private:
+        std::int8_t m_id;
+    };
+
+    // Named constants for all 64 board squares, grouped by file (a..h),
+    // each built from its file and rank constants.
+    constexpr Square a1(fileA, rank1);
+    constexpr Square a2(fileA, rank2);
+    constexpr Square a3(fileA, rank3);
+    constexpr Square a4(fileA, rank4);
+    constexpr Square a5(fileA, rank5);
+    constexpr Square a6(fileA, rank6);
+    constexpr Square a7(fileA, rank7);
+    constexpr Square a8(fileA, rank8);
+
+    constexpr Square b1(fileB, rank1);
+    constexpr Square b2(fileB, rank2);
+    constexpr Square b3(fileB, rank3);
+    constexpr Square b4(fileB, rank4);
+    constexpr Square b5(fileB, rank5);
+    constexpr Square b6(fileB, rank6);
+    constexpr Square b7(fileB, rank7);
+    constexpr Square b8(fileB, rank8);
+
+    constexpr Square c1(fileC, rank1);
+    constexpr Square c2(fileC, rank2);
+    constexpr Square c3(fileC, rank3);
+    constexpr Square c4(fileC, rank4);
+    constexpr Square c5(fileC, rank5);
+    constexpr Square c6(fileC, rank6);
+    constexpr Square c7(fileC, rank7);
+    constexpr Square c8(fileC, rank8);
+
+    constexpr Square d1(fileD, rank1);
+    constexpr Square d2(fileD, rank2);
+    constexpr Square d3(fileD, rank3);
+    constexpr Square d4(fileD, rank4);
+    constexpr Square d5(fileD, rank5);
+    constexpr Square d6(fileD, rank6);
+    constexpr Square d7(fileD, rank7);
+    constexpr Square d8(fileD, rank8);
+
+    constexpr Square e1(fileE, rank1);
+    constexpr Square e2(fileE, rank2);
+    constexpr Square e3(fileE, rank3);
+    constexpr Square e4(fileE, rank4);
+    constexpr Square e5(fileE, rank5);
+    constexpr Square e6(fileE, rank6);
+    constexpr Square e7(fileE, rank7);
+    constexpr Square e8(fileE, rank8);
+
+    constexpr Square f1(fileF, rank1);
+    constexpr Square f2(fileF, rank2);
+    constexpr Square f3(fileF, rank3);
+    constexpr Square f4(fileF, rank4);
+    constexpr Square f5(fileF, rank5);
+    constexpr Square f6(fileF, rank6);
+    constexpr Square f7(fileF, rank7);
+    constexpr Square f8(fileF, rank8);
+
+    constexpr Square g1(fileG, rank1);
+    constexpr Square g2(fileG, rank2);
+    constexpr Square g3(fileG, rank3);
+    constexpr Square g4(fileG, rank4);
+    constexpr Square g5(fileG, rank5);
+    constexpr Square g6(fileG, rank6);
+    constexpr Square g7(fileG, rank7);
+    constexpr Square g8(fileG, rank8);
+
+    constexpr Square h1(fileH, rank1);
+    constexpr Square h2(fileH, rank2);
+    constexpr Square h3(fileH, rank3);
+    constexpr Square h4(fileH, rank4);
+    constexpr Square h5(fileH, rank5);
+    constexpr Square h6(fileH, rank6);
+    constexpr Square h7(fileH, rank7);
+    constexpr Square h8(fileH, rank8);
+
+    // Compile-time sanity checks for Square: square colour, file/rank
+    // accessors and the horizontal/vertical flips.
+    static_assert(e1.color() == Color::Black);
+    static_assert(e8.color() == Color::White);
+
+    static_assert(e1.file() == fileE);
+    static_assert(e1.rank() == rank1);
+
+    static_assert(e1.flippedHorizontally() == d1);
+    static_assert(e1.flippedVertically() == e8);
+
+    // EnumTraits specialization for Square: enumerates the 64 board squares
+    // rank by rank (a1, b1, ..., h1, a2, ..., h8) and converts squares to and
+    // from ordinals and two-character strings such as "e4".
+    template <>
+    struct EnumTraits<Square>
+    {
+        using IdType = int;
+        using EnumType = Square;
+
+        static constexpr int cardinality = chess::cardinality<Rank>() * chess::cardinality<File>();
+        static constexpr bool isNaturalIndex = true;
+
+        // All squares, listed one rank per row starting from rank 1.
+        static constexpr std::array<EnumType, cardinality> values{
+            a1, b1, c1, d1, e1, f1, g1, h1,
+            a2, b2, c2, d2, e2, f2, g2, h2,
+            a3, b3, c3, d3, e3, f3, g3, h3,
+            a4, b4, c4, d4, e4, f4, g4, h4,
+            a5, b5, c5, d5, e5, f5, g5, h5,
+            a6, b6, c6, d6, e6, f6, g6, h6,
+            a7, b7, c7, d7, e7, f7, g7, h7,
+            a8, b8, c8, d8, e8, f8, g8, h8
+        };
+
+        // Converts a square to its integer id.
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            const IdType id = static_cast<IdType>(c);
+            return id;
+        }
+
+        // Converts an integer id back to a square. The accepted range is
+        // [0, cardinality], i.e. one id past the last real square is allowed.
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            assert(id >= 0 && id < cardinality + 1);
+
+            const EnumType square = static_cast<EnumType>(id);
+            return square;
+        }
+
+        // Returns the two-character name of a valid square, e.g. "d1".
+        // The returned view points into a static string literal.
+        [[nodiscard]] static constexpr std::string_view toString(Square sq)
+        {
+            assert(sq.isOk());
+
+            // Names are laid out in ordinal order, two characters each.
+            const char* const allNames =
+                "a1b1c1d1e1f1g1h1"
+                "a2b2c2d2e2f2g2h2"
+                "a3b3c3d3e3f3g3h3"
+                "a4b4c4d4e4f4g4h4"
+                "a5b5c5d5e5f5g5h5"
+                "a6b6c6d6e6f6g6h6"
+                "a7b7c7d7e7f7g7h7"
+                "a8b8c8d8e8f8g8h8";
+
+            const int offset = ordinal(sq) * 2;
+            return std::string_view(allNames + offset, 2);
+        }
+
+        // Parses a square name of the form <file 'a'..'h'><rank '1'..'8'>.
+        // Returns an empty optional for any other input.
+        [[nodiscard]] static constexpr std::optional<Square> fromString(std::string_view sv) noexcept
+        {
+            if (sv.size() != 2)
+            {
+                return std::nullopt;
+            }
+
+            const char fileChar = sv[0];
+            const char rankChar = sv[1];
+
+            const bool fileValid = fileChar >= 'a' && fileChar <= 'h';
+            const bool rankValid = rankChar >= '1' && rankChar <= '8';
+            if (!fileValid || !rankValid)
+            {
+                return std::nullopt;
+            }
+
+            return Square(static_cast<File>(fileChar - 'a'), static_cast<Rank>(rankChar - '1'));
+        }
+    };
+
+    // Compile-time checks of the Square traits: name lookup, and that index 29
+    // (rank index 3, file index 5) of the value list is f4.
+    static_assert(toString(d1) == std::string_view("d1"));
+    static_assert(values<Square>()[29] == f4);
+
+    // Kind of a move. The four values fit in 2 bits, which is how
+    // CompressedMove stores them.
+    enum struct MoveType : std::uint8_t
+    {
+        Normal,
+        Promotion,
+        Castle,
+        EnPassant
+    };
+
+    // EnumTraits specialization for MoveType: lists the enumerators and
+    // converts between them and their integer ordinals.
+    template <>
+    struct EnumTraits<MoveType>
+    {
+        using IdType = int;
+        using EnumType = MoveType;
+
+        static constexpr int cardinality = 4;
+        static constexpr bool isNaturalIndex = true;
+
+        // Enumerators in ordinal order.
+        static constexpr std::array<EnumType, cardinality> values{
+            EnumType::Normal,
+            EnumType::Promotion,
+            EnumType::Castle,
+            EnumType::EnPassant
+        };
+
+        // Underlying value of the enumerator, as an int.
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            const IdType id = static_cast<IdType>(c);
+            return id;
+        }
+
+        // Inverse of ordinal(); id must be in [0, cardinality).
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            assert(id >= 0 && id < cardinality);
+
+            const EnumType value = static_cast<EnumType>(id);
+            return value;
+        }
+    };
+
+    // Which of the two castling moves is meant. The values are 0 and 1, so
+    // operator! can switch between them by flipping the lowest bit.
+    enum struct CastleType : std::uint8_t
+    {
+        Short,
+        Long
+    };
+
+    // Returns the other castle type (Short <-> Long) by flipping the lowest
+    // bit of the underlying value.
+    [[nodiscard]] constexpr CastleType operator!(CastleType ct)
+    {
+        const auto raw = static_cast<std::uint8_t>(ct);
+        return static_cast<CastleType>(raw ^ 1);
+    }
+
+    // EnumTraits specialization for CastleType: lists the enumerators and
+    // converts between them and their integer ordinals.
+    template <>
+    struct EnumTraits<CastleType>
+    {
+        using IdType = int;
+        using EnumType = CastleType;
+
+        static constexpr int cardinality = 2;
+        static constexpr bool isNaturalIndex = true;
+
+        // Enumerators in ordinal order.
+        static constexpr std::array<EnumType, cardinality> values{
+            EnumType::Short,
+            EnumType::Long
+        };
+
+        // Underlying value of the enumerator, as an int.
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            const IdType id = static_cast<IdType>(c);
+            return id;
+        }
+
+        // Inverse of ordinal(); id must be in [0, cardinality).
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            assert(id >= 0 && id < cardinality);
+
+            const EnumType value = static_cast<EnumType>(id);
+            return value;
+        }
+    };
+
+    struct CompressedMove;
+
+    // A chess move in unpacked form (one byte per field).
+    // Castling is encoded as a king capturing rook.
+    // En passant is encoded as a normal pawn capture (move.to is empty on the board).
+    struct Move
+    {
+        Square from;
+        Square to;
+        MoveType type = MoveType::Normal;
+        Piece promotedPiece = Piece::none();
+
+        // Field-by-field equality.
+        [[nodiscard]] constexpr friend bool operator==(const Move& lhs, const Move& rhs) noexcept
+        {
+            if (!(lhs.from == rhs.from)) return false;
+            if (!(lhs.to == rhs.to)) return false;
+            if (!(lhs.type == rhs.type)) return false;
+
+            return lhs.promotedPiece == rhs.promotedPiece;
+        }
+
+        [[nodiscard]] constexpr friend bool operator!=(const Move& lhs, const Move& rhs) noexcept
+        {
+            const bool same = (lhs == rhs);
+            return !same;
+        }
+
+        // Packs this move into a CompressedMove (defined after CompressedMove).
+        [[nodiscard]] constexpr CompressedMove compress() const noexcept;
+
+        // The null move: both squares are Square::none().
+        [[nodiscard]] constexpr static Move null()
+        {
+            const Square nowhere = Square::none();
+            return Move{ nowhere, nowhere };
+        }
+
+        // The castling move of the given type for the given side
+        // (defined after the lookup table it reads).
+        [[nodiscard]] constexpr static Move castle(CastleType ct, Color c);
+
+        // An ordinary move; type and promotedPiece keep their defaults
+        // (MoveType::Normal and Piece::none()).
+        [[nodiscard]] constexpr static Move normal(Square from, Square to)
+        {
+            return Move{ from, to };
+        }
+
+        // An en passant capture from `from` to `to`.
+        [[nodiscard]] constexpr static Move enPassant(Square from, Square to)
+        {
+            Move result{ from, to };
+            result.type = MoveType::EnPassant;
+            return result;
+        }
+
+        // A promotion to `piece`.
+        [[nodiscard]] constexpr static Move promotion(Square from, Square to, Piece piece)
+        {
+            Move result{ from, to };
+            result.type = MoveType::Promotion;
+            result.promotedPiece = piece;
+            return result;
+        }
+    };
+
+    // Lookup table behind Move::castle(), indexed as moves[castleType][color].
+    // Each entry goes from the king's square to the rook's square, matching the
+    // "king captures rook" encoding of castling.
+    // NOTE(review): rows are presumably Short then Long and columns White then
+    // Black, following the enumerator order - confirm against Color.
+    namespace detail::castle
+    {
+        constexpr EnumArray2<CastleType, Color, Move> moves = { {
+            {{ { e1, h1, MoveType::Castle }, { e8, h8, MoveType::Castle } }},
+            {{ { e1, a1, MoveType::Castle }, { e8, a8, MoveType::Castle } }}
+        } };
+    }
+
+    // Returns the castling move of type `ct` for side `c`, read from the
+    // detail::castle::moves table.
+    [[nodiscard]] constexpr Move Move::castle(CastleType ct, Color c)
+    {
+        const auto& movesForType = detail::castle::moves[ct];
+        return movesForType[c];
+    }
+
+    static_assert(sizeof(Move) == 4);
+
+    // A Move packed into 16 bits. The null move is encoded as 0.
+    struct CompressedMove
+    {
+    private:
+        // Bit layout, starting from the most significant bit:
+        //   2 bits - move type
+        //   6 bits - from square
+        //   6 bits - to square
+        //   2 bits - promoted piece type, stored relative to PieceType::Knight
+        //            (0 if not a promotion)
+        static constexpr std::uint16_t squareMask = 0b111111u;
+        static constexpr std::uint16_t promotedPieceTypeMask = 0b11u;
+        static constexpr std::uint16_t moveTypeMask = 0b11u;
+
+        // Positions of the fields inside m_packed.
+        static constexpr int typeShift = 16 - 2;
+        static constexpr int fromShift = 16 - 2 - 6;
+        static constexpr int toShift = 16 - 2 - 6 - 6;
+
+    public:
+        // Reads a move from two bytes, most significant byte first.
+        [[nodiscard]] constexpr static CompressedMove readFromBigEndian(const unsigned char* data)
+        {
+            const unsigned highByte = data[0];
+            const unsigned lowByte = data[1];
+
+            CompressedMove result{};
+            result.m_packed = static_cast<std::uint16_t>((highByte << 8) | lowByte);
+            return result;
+        }
+
+        // Creates the null move.
+        constexpr CompressedMove() noexcept :
+            m_packed(0)
+        {
+        }
+
+        // move must be either valid or a null move
+        constexpr CompressedMove(Move move) noexcept :
+            m_packed(0)
+        {
+            // A null move (from == to) keeps the all-zero encoding.
+            if (move.from == move.to)
+            {
+                return;
+            }
+
+            assert(move.from != Square::none());
+            assert(move.to != Square::none());
+
+            const auto typeBits = static_cast<std::uint16_t>(ordinal(move.type)) << typeShift;
+            const auto fromBits = static_cast<std::uint16_t>(ordinal(move.from)) << fromShift;
+            const auto toBits = static_cast<std::uint16_t>(ordinal(move.to)) << toShift;
+            m_packed = static_cast<std::uint16_t>(typeBits | fromBits | toBits);
+
+            if (move.type != MoveType::Promotion)
+            {
+                assert(move.promotedPiece == Piece::none());
+                return;
+            }
+
+            assert(move.promotedPiece != Piece::none());
+
+            // Only the piece type is stored, as an offset from Knight.
+            m_packed |= ordinal(move.promotedPiece.type()) - ordinal(PieceType::Knight);
+        }
+
+        // Writes the move as two bytes, most significant byte first.
+        void writeToBigEndian(unsigned char* data) const
+        {
+            data[0] = static_cast<unsigned char>(m_packed >> 8);
+            data[1] = static_cast<unsigned char>(m_packed & 0xFF);
+        }
+
+        // The raw 16-bit encoding.
+        [[nodiscard]] constexpr std::uint16_t packed() const
+        {
+            return m_packed;
+        }
+
+        [[nodiscard]] constexpr MoveType type() const
+        {
+            return fromOrdinal<MoveType>((m_packed >> typeShift) & moveTypeMask);
+        }
+
+        [[nodiscard]] constexpr Square from() const
+        {
+            return fromOrdinal<Square>((m_packed >> fromShift) & squareMask);
+        }
+
+        [[nodiscard]] constexpr Square to() const
+        {
+            return fromOrdinal<Square>((m_packed >> toShift) & squareMask);
+        }
+
+        // The promoted piece, or Piece::none() when this is not a promotion.
+        // The colour is not stored; it is derived from the destination rank
+        // (rank 1 means Black promoted, anything else means White).
+        [[nodiscard]] constexpr Piece promotedPiece() const
+        {
+            if (type() != MoveType::Promotion)
+            {
+                return Piece::none();
+            }
+
+            const Color color =
+                (to().rank() == rank1)
+                ? Color::Black
+                : Color::White;
+
+            const PieceType pt = fromOrdinal<PieceType>((m_packed & promotedPieceTypeMask) + ordinal(PieceType::Knight));
+            return color | pt;
+        }
+
+        // Expands back into a Move; an all-zero encoding yields Move::null().
+        [[nodiscard]] constexpr Move decompress() const noexcept
+        {
+            if (m_packed == 0)
+            {
+                return Move::null();
+            }
+
+            return Move{ from(), to(), type(), promotedPiece() };
+        }
+
+    private:
+        std::uint16_t m_packed;
+    };
+
+    static_assert(sizeof(CompressedMove) == 2);
+
+    // Packs this move into its 16-bit CompressedMove representation.
+    [[nodiscard]] constexpr CompressedMove Move::compress() const noexcept
+    {
+        const CompressedMove packedMove(*this);
+        return packedMove;
+    }
+
+    // Compile-time checks of Square + Offset. The second Offset component
+    // moves along the file (changes the rank)...
+    static_assert(a4 + Offset{ 0, 1 } == a5);
+    static_assert(a4 + Offset{ 0, 2 } == a6);
+    static_assert(a4 + Offset{ 0, -2 } == a2);
+    static_assert(a4 + Offset{ 0, -1 } == a3);
+
+    // ...and the first component moves along the rank (changes the file).
+    static_assert(e4 + Offset{ 1, 0 } == f4);
+    static_assert(e4 + Offset{ 2, 0 } == g4);
+    static_assert(e4 + Offset{ -1, 0 } == d4);
+    static_assert(e4 + Offset{ -2, 0 } == c4);
+
+    // Set of castling rights stored as bit flags: one bit per (colour, side)
+    // pair. White, Black and All are unions of the single-bit values.
+    enum struct CastlingRights : std::uint8_t
+    {
+        None = 0x0,
+        WhiteKingSide = 0x1,
+        WhiteQueenSide = 0x2,
+        BlackKingSide = 0x4,
+        BlackQueenSide = 0x8,
+        White = WhiteKingSide | WhiteQueenSide,
+        Black = BlackKingSide | BlackQueenSide,
+        All = WhiteKingSide | WhiteQueenSide | BlackKingSide | BlackQueenSide
+    };
+
+    // Union of two castling-rights sets.
+    [[nodiscard]] constexpr CastlingRights operator|(CastlingRights lhs, CastlingRights rhs)
+    {
+        const auto lhsBits = static_cast<std::uint8_t>(lhs);
+        const auto rhsBits = static_cast<std::uint8_t>(rhs);
+        return static_cast<CastlingRights>(lhsBits | rhsBits);
+    }
+
+    // Intersection of two castling-rights sets.
+    [[nodiscard]] constexpr CastlingRights operator&(CastlingRights lhs, CastlingRights rhs)
+    {
+        const auto lhsBits = static_cast<std::uint8_t>(lhs);
+        const auto rhsBits = static_cast<std::uint8_t>(rhs);
+        return static_cast<CastlingRights>(lhsBits & rhsBits);
+    }
+
+    // Complement within CastlingRights::All: every right not present in lhs.
+    [[nodiscard]] constexpr CastlingRights operator~(CastlingRights lhs)
+    {
+        const unsigned bits = static_cast<std::uint8_t>(lhs);
+        const unsigned allBits = static_cast<std::uint8_t>(CastlingRights::All);
+
+        // Masking with All discards the bits above the four flag bits.
+        return static_cast<CastlingRights>(~bits & allBits);
+    }
+
+    // Adds the rights in rhs to lhs and returns lhs.
+    constexpr CastlingRights& operator|=(CastlingRights& lhs, CastlingRights rhs)
+    {
+        const auto merged = static_cast<std::uint8_t>(lhs) | static_cast<std::uint8_t>(rhs);
+        lhs = static_cast<CastlingRights>(merged);
+        return lhs;
+    }
+
+    // Keeps in lhs only the rights also present in rhs and returns lhs.
+    constexpr CastlingRights& operator&=(CastlingRights& lhs, CastlingRights rhs)
+    {
+        const auto common = static_cast<std::uint8_t>(lhs) & static_cast<std::uint8_t>(rhs);
+        lhs = static_cast<CastlingRights>(common);
+        return lhs;
+    }
+    // Checks whether lhs contains rhs, i.e. every right set in rhs is also
+    // set in lhs.
+    [[nodiscard]] constexpr bool contains(CastlingRights lhs, CastlingRights rhs)
+    {
+        const CastlingRights common = lhs & rhs;
+        return common == rhs;
+    }
+
+    // EnumTraits specialization for CastlingRights. Only the four single-bit
+    // rights are listed in `values`; because the enumerators are bit flags,
+    // ordinals are not consecutive (isNaturalIndex is false).
+    template <>
+    struct EnumTraits<CastlingRights>
+    {
+        using IdType = int;
+        using EnumType = CastlingRights;
+
+        static constexpr int cardinality = 4;
+        static constexpr bool isNaturalIndex = false;
+
+        // The individual (single-bit) rights.
+        static constexpr std::array<EnumType, cardinality> values{
+            EnumType::WhiteKingSide,
+            EnumType::WhiteQueenSide,
+            EnumType::BlackKingSide,
+            EnumType::BlackQueenSide
+        };
+
+        // Raw flag bits of the rights set, as an int.
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            const IdType bits = static_cast<IdType>(c);
+            return bits;
+        }
+
+        // Reinterprets raw flag bits as a rights set; no range check is made.
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            const EnumType rights = static_cast<EnumType>(id);
+            return rights;
+        }
+    };
+
+    struct CompressedReverseMove;
+
+    // A move bundled with the pre-move state stored alongside it: the captured
+    // piece, the previous en passant square and the previous castling rights.
+    // NOTE(review): presumably this is what is needed to take the move back -
+    // confirm against the code that consumes it.
+    struct ReverseMove
+    {
+        Move move;
+        Piece capturedPiece;
+        Square oldEpSquare;
+        CastlingRights oldCastlingRights;
+
+        // We need a well defined case for the starting position.
+        // It is a null move with nothing captured, no en passant square and
+        // all castling rights.
+        constexpr ReverseMove() :
+            move(Move::null()),
+            capturedPiece(Piece::none()),
+            oldEpSquare(Square::none()),
+            oldCastlingRights(CastlingRights::All)
+        {
+        }
+
+        constexpr ReverseMove(const Move& move_, Piece capturedPiece_, Square oldEpSquare_, CastlingRights oldCastlingRights_) :
+            move(move_),
+            capturedPiece(capturedPiece_),
+            oldEpSquare(oldEpSquare_),
+            oldCastlingRights(oldCastlingRights_)
+        {
+        }
+
+        // True when the contained move is a null move (from == to).
+        // [[nodiscard]] for consistency with the other pure queries here.
+        [[nodiscard]] constexpr bool isNull() const
+        {
+            return move.from == move.to;
+        }
+
+        // Packs this reverse move into 32 bits (defined after
+        // CompressedReverseMove).
+        [[nodiscard]] constexpr CompressedReverseMove compress() const noexcept;
+
+        // Field-by-field equality.
+        [[nodiscard]] constexpr friend bool operator==(const ReverseMove& lhs, const ReverseMove& rhs) noexcept
+        {
+            return lhs.move == rhs.move
+                && lhs.capturedPiece == rhs.capturedPiece
+                && lhs.oldEpSquare == rhs.oldEpSquare
+                && lhs.oldCastlingRights == rhs.oldCastlingRights;
+        }
+
+        [[nodiscard]] constexpr friend bool operator!=(const ReverseMove& lhs, const ReverseMove& rhs) noexcept
+        {
+            return !(lhs == rhs);
+        }
+    };
+
+    static_assert(sizeof(ReverseMove) == 7);
+
+    // A ReverseMove packed into 32 bits: a CompressedMove plus 16 bits of
+    // pre-move state.
+    //
+    // Layout of m_oldState, from the most significant bit:
+    //   1 bit  - unused
+    //   4 bits - captured piece
+    //   4 bits - old castling rights
+    //   7 bits - old en passant square
+    struct CompressedReverseMove
+    {
+    private:
+        // we use 7 bits because it can be Square::none()
+        static constexpr std::uint32_t squareMask = 0b1111111u;
+        static constexpr std::uint32_t pieceMask = 0b1111u;
+        static constexpr std::uint32_t castlingRightsMask = 0b1111;
+    public:
+
+        // Zero-initializes both halves; the move part is then the null move.
+        constexpr CompressedReverseMove() noexcept :
+            m_move{},
+            m_oldState{}
+        {
+        }
+
+        // Packs rm. The cast uses std::uint16_t because <cstdint> only
+        // guarantees the std::-qualified name.
+        constexpr CompressedReverseMove(const ReverseMove& rm) noexcept :
+            m_move(rm.move.compress()),
+            m_oldState{ static_cast<std::uint16_t>(
+                ((ordinal(rm.capturedPiece) & pieceMask) << 11)
+                | ((ordinal(rm.oldCastlingRights) & castlingRightsMask) << 7)
+                | (ordinal(rm.oldEpSquare) & squareMask)
+                )
+            }
+        {
+        }
+
+        // The move, expanded back to its unpacked form.
+        [[nodiscard]] constexpr Move move() const
+        {
+            return m_move.decompress();
+        }
+
+        // The move in its packed form.
+        [[nodiscard]] constexpr const CompressedMove& compressedMove() const
+        {
+            return m_move;
+        }
+
+        [[nodiscard]] constexpr Piece capturedPiece() const
+        {
+            return fromOrdinal<Piece>(m_oldState >> 11);
+        }
+
+        [[nodiscard]] constexpr CastlingRights oldCastlingRights() const
+        {
+            return fromOrdinal<CastlingRights>((m_oldState >> 7) & castlingRightsMask);
+        }
+
+        // We could pack the ep square more, but don't have to, because
+        // can't save another byte anyway.
+        [[nodiscard]] constexpr Square oldEpSquare() const
+        {
+            return fromOrdinal<Square>(m_oldState & squareMask);
+        }
+
+        // Expands back into a ReverseMove using the accessors above.
+        [[nodiscard]] constexpr ReverseMove decompress() const noexcept
+        {
+            return ReverseMove(m_move.decompress(), capturedPiece(), oldEpSquare(), oldCastlingRights());
+        }
+
+    private:
+        CompressedMove m_move;
+        std::uint16_t m_oldState;
+    };
+
+    static_assert(sizeof(CompressedReverseMove) == 4);
+
+    // Packs this reverse move into its 32-bit CompressedReverseMove form.
+    [[nodiscard]] constexpr CompressedReverseMove ReverseMove::compress() const noexcept
+    {
+        const CompressedReverseMove packedMove(*this);
+        return packedMove;
+    }
+
+    // A ReverseMove squeezed into the 27 lowest bits of a 32-bit integer.
+    // This can be regarded as a perfect hash. Going back is hard: the move
+    // type and the colour of the promoted piece are not stored, so unpack()
+    // has to be told which side moved and reconstructs the rest.
+    struct PackedReverseMove
+    {
+        static constexpr std::uint32_t mask = 0x7FFFFFFu;
+        static constexpr std::size_t numBits = 27;
+
+    private:
+        static constexpr std::uint32_t squareMask = 0b111111u;
+        static constexpr std::uint32_t pieceMask = 0b1111u;
+        static constexpr std::uint32_t pieceTypeMask = 0b111u;
+        static constexpr std::uint32_t castlingRightsMask = 0b1111;
+        static constexpr std::uint32_t fileMask = 0b111;
+
+    public:
+        // Wraps an already packed value; no validation is performed.
+        constexpr PackedReverseMove(const std::uint32_t packed) :
+            m_packed(packed)
+        {
+
+        }
+
+        // Packs reverseMove according to the layout documented at m_packed.
+        constexpr PackedReverseMove(const ReverseMove& reverseMove) :
+            m_packed(
+                0u
+                // The only move when square is none() is null move and
+                // then both squares are none(). No other move is like that
+                // so we don't lose any information by storing only
+                // the 6 bits of each square.
+                | ((ordinal(reverseMove.move.from) & squareMask) << 21)
+                | ((ordinal(reverseMove.move.to) & squareMask) << 15)
+                // Other masks are just for code clarity, they should
+                // never change the values.
+                | ((ordinal(reverseMove.capturedPiece) & pieceMask) << 11)
+                | ((ordinal(reverseMove.oldCastlingRights) & castlingRightsMask) << 7)
+                | ((ordinal(reverseMove.move.promotedPiece.type()) & pieceTypeMask) << 4)
+                | (((reverseMove.oldEpSquare != Square::none()) & 1) << 3)
+                // We probably could omit the squareMask here but for clarity it's left.
+                | (ordinal(Square(ordinal(reverseMove.oldEpSquare) & squareMask).file()) & fileMask)
+            )
+        {
+        }
+
+        // The raw packed value.
+        constexpr std::uint32_t packed() const
+        {
+            return m_packed;
+        }
+
+        // Reconstructs the ReverseMove. sideThatMoved is the colour that made
+        // the move; it supplies the promoted piece colour and the rank of the
+        // old en passant square.
+        // Note that a packed null move comes back with from == to (both are
+        // the 6-bit remainder of Square::none()), so isNull() still holds, but
+        // the squares are not Square::none() any more.
+        constexpr ReverseMove unpack(Color sideThatMoved) const
+        {
+            ReverseMove rmove{};
+
+            rmove.move.from = fromOrdinal<Square>((m_packed >> 21) & squareMask);
+            rmove.move.to = fromOrdinal<Square>((m_packed >> 15) & squareMask);
+            rmove.capturedPiece = fromOrdinal<Piece>((m_packed >> 11) & pieceMask);
+            rmove.oldCastlingRights = fromOrdinal<CastlingRights>((m_packed >> 7) & castlingRightsMask);
+            const PieceType promotedPieceType = fromOrdinal<PieceType>((m_packed >> 4) & pieceTypeMask);
+            if (promotedPieceType != PieceType::None)
+            {
+                rmove.move.promotedPiece = Piece(promotedPieceType, sideThatMoved);
+                rmove.move.type = MoveType::Promotion;
+            }
+            const bool hasEpSquare = static_cast<bool>((m_packed >> 3) & 1);
+            if (hasEpSquare)
+            {
+                // ep square is always where the opponent moved
+                const Rank rank =
+                    sideThatMoved == Color::White
+                    ? rank6
+                    : rank3;
+                const File file = fromOrdinal<File>(m_packed & fileMask);
+                rmove.oldEpSquare = Square(file, rank);
+                if (rmove.oldEpSquare == rmove.move.to)
+                {
+                    rmove.move.type = MoveType::EnPassant;
+                }
+            }
+            else
+            {
+                rmove.oldEpSquare = Square::none();
+            }
+
+            if (rmove.move.type == MoveType::Normal)
+            {
+                // Castling is stored as the king moving onto its rook. Such a
+                // move is a castle only if the matching castling right still
+                // existed before it: that right guarantees the king stood on
+                // e1/e8 and the rook on the target corner. Checking merely for
+                // "any right left" would misread e.g. a white rook move
+                // e1 -> h1 as a castle while only Black could still castle.
+                const Square from = rmove.move.from;
+                const Square to = rmove.move.to;
+                const CastlingRights rights = rmove.oldCastlingRights;
+
+                const bool isCastle =
+                    (from == e1 && to == h1 && contains(rights, CastlingRights::WhiteKingSide))
+                    || (from == e1 && to == a1 && contains(rights, CastlingRights::WhiteQueenSide))
+                    || (from == e8 && to == h8 && contains(rights, CastlingRights::BlackKingSide))
+                    || (from == e8 && to == a8 && contains(rights, CastlingRights::BlackQueenSide));
+
+                if (isCastle)
+                {
+                    rmove.move.type = MoveType::Castle;
+                }
+            }
+
+            return rmove;
+        }
+
+    private:
+        // Uses only 27 lowest bits.
+        // Bit meaning from highest to lowest.
+        // - 6 bits from
+        // - 6 bits to
+        // - 4 bits for the captured piece
+        // - 4 bits for prev castling rights
+        // - 3 bits promoted piece type
+        // - 1 bit  to specify if the ep square was valid (false if none())
+        // - 3 bits for prev ep square file
+        std::uint32_t m_packed;
+    };
+
+    // Strict weak ordering of moves: lexicographic comparison of the ordinals
+    // of (from, to, type, promotedPiece).
+    struct MoveCompareLess
+    {
+        [[nodiscard]] bool operator()(const Move& lhs, const Move& rhs) const noexcept
+        {
+            const auto lhsFrom = ordinal(lhs.from);
+            const auto rhsFrom = ordinal(rhs.from);
+            if (lhsFrom != rhsFrom) return lhsFrom < rhsFrom;
+
+            const auto lhsTo = ordinal(lhs.to);
+            const auto rhsTo = ordinal(rhs.to);
+            if (lhsTo != rhsTo) return lhsTo < rhsTo;
+
+            const auto lhsType = ordinal(lhs.type);
+            const auto rhsType = ordinal(rhs.type);
+            if (lhsType != rhsType) return lhsType < rhsType;
+
+            // Last key: equal pieces compare as "not less".
+            return ordinal(lhs.promotedPiece) < ordinal(rhs.promotedPiece);
+        }
+    };
+
+    // Strict weak ordering of reverse moves: the move (by MoveCompareLess),
+    // then captured piece, old castling rights and old en passant square.
+    struct ReverseMoveCompareLess
+    {
+        [[nodiscard]] bool operator()(const ReverseMove& lhs, const ReverseMove& rhs) const noexcept
+        {
+            const MoveCompareLess moveLess{};
+            if (moveLess(lhs.move, rhs.move)) return true;
+            if (moveLess(rhs.move, lhs.move)) return false;
+
+            const auto lhsCaptured = ordinal(lhs.capturedPiece);
+            const auto rhsCaptured = ordinal(rhs.capturedPiece);
+            if (lhsCaptured != rhsCaptured) return lhsCaptured < rhsCaptured;
+
+            const unsigned lhsRights = static_cast<unsigned>(lhs.oldCastlingRights);
+            const unsigned rhsRights = static_cast<unsigned>(rhs.oldCastlingRights);
+            if (lhsRights != rhsRights) return lhsRights < rhsRights;
+
+            // Last key: equal squares compare as "not less".
+            return ordinal(lhs.oldEpSquare) < ordinal(rhs.oldEpSquare);
+        }
+    };
+
+    // Input iterator over the squares whose bits are set in a 64-bit mask.
+    // Dereferencing yields the square of the current first bit (as reported by
+    // intrin::lsb); incrementing clears that bit. An iterator holding 0 acts
+    // as the end iterator.
+    struct BitboardIterator
+    {
+        using value_type = Square;
+        using difference_type = std::ptrdiff_t;
+        using reference = Square;
+        using iterator_category = std::input_iterator_tag;
+        using pointer = const Square*;
+
+        // Creates the end iterator (no squares left).
+        constexpr BitboardIterator() noexcept :
+            m_squares(0)
+        {
+        }
+
+        // Creates an iterator over the set bits of v.
+        constexpr BitboardIterator(std::uint64_t v) noexcept :
+            m_squares(v)
+        {
+        }
+
+        constexpr BitboardIterator(const BitboardIterator&) = default;
+        constexpr BitboardIterator(BitboardIterator&&) = default;
+        constexpr BitboardIterator& operator=(const BitboardIterator&) = default;
+        constexpr BitboardIterator& operator=(BitboardIterator&&) = default;
+
+        // Iterators are equal when the same set of squares remains.
+        [[nodiscard]] constexpr bool friend operator==(BitboardIterator lhs, BitboardIterator rhs) noexcept
+        {
+            return lhs.m_squares == rhs.m_squares;
+        }
+
+        [[nodiscard]] constexpr bool friend operator!=(BitboardIterator lhs, BitboardIterator rhs) noexcept
+        {
+            return lhs.m_squares != rhs.m_squares;
+        }
+
+        // The current square; must not be called on the end iterator.
+        [[nodiscard]] inline Square operator*() const
+        {
+            return first();
+        }
+
+        // Advances to the next set bit.
+        constexpr BitboardIterator& operator++() noexcept
+        {
+            popFirst();
+            return *this;
+        }
+
+        // Post-increment, required of input iterators (the declared
+        // iterator_category): advances and returns the state before advancing.
+        constexpr BitboardIterator operator++(int) noexcept
+        {
+            const BitboardIterator previous = *this;
+            popFirst();
+            return previous;
+        }
+
+    private:
+        std::uint64_t m_squares;
+
+        // Clears the lowest set bit.
+        constexpr void popFirst() noexcept
+        {
+            m_squares &= m_squares - 1;
+        }
+
+        // Square for the bit index returned by intrin::lsb.
+        [[nodiscard]] inline Square first() const
+        {
+            assert(m_squares != 0);
+
+            return fromOrdinal<Square>(intrin::lsb(m_squares));
+        }
+    };
+
+    struct Bitboard
+    {
+        // bits counted from the LSB
+        // order is A1 B1 ... G8 H8
+        // just like in Square
+
+    public:
+        constexpr Bitboard() noexcept :
+            m_squares(0)
+        {
+        }
+
+    private:
+        constexpr explicit Bitboard(Square sq) noexcept :
+            m_squares(static_cast<std::uint64_t>(1ULL) << ordinal(sq))
+        {
+            assert(sq.isOk());
+        }
+
+        constexpr explicit Bitboard(Rank r) noexcept :
+            m_squares(static_cast<std::uint64_t>(0xFFULL) << (ordinal(r) * 8))
+        {
+        }
+
+        constexpr explicit Bitboard(File f) noexcept :
+            m_squares(static_cast<std::uint64_t>(0x0101010101010101ULL) << ordinal(f))
+        {
+        }
+
+        // All squares of the given square colour, consistent with
+        // Square::color(): bit 0 is a1, which (like e1) is a Black square, so
+        // the Black mask is the one with bit 0 set (0xAA55AA55AA55AA55) and
+        // the White mask is its complement. The two constants used to be
+        // swapped, which made Bitboard::color(c) disagree with Square::color().
+        constexpr explicit Bitboard(Color c) noexcept :
+            m_squares(c == Color::White ? 0x55AA55AA55AA55AAULL : 0xAA55AA55AA55AA55ULL)
+        {
+        }
+
+        constexpr explicit Bitboard(std::uint64_t bb) noexcept :
+            m_squares(bb)
+        {
+        }
+
+        // files A..file inclusive
+        static constexpr EnumArray<File, std::uint64_t> m_filesUpToBB{
+            0x0101010101010101ULL,
+            0x0303030303030303ULL,
+            0x0707070707070707ULL,
+            0x0F0F0F0F0F0F0F0FULL,
+            0x1F1F1F1F1F1F1F1FULL,
+            0x3F3F3F3F3F3F3F3FULL,
+            0x7F7F7F7F7F7F7F7FULL,
+            0xFFFFFFFFFFFFFFFFULL
+        };
+
+    public:
+
+        [[nodiscard]] static constexpr Bitboard none()
+        {
+            return Bitboard{};
+        }
+
+        [[nodiscard]] static constexpr Bitboard all()
+        {
+            return ~none();
+        }
+
+        [[nodiscard]] static constexpr Bitboard square(Square sq)
+        {
+            return Bitboard(sq);
+        }
+
+        [[nodiscard]] static constexpr Bitboard file(File f)
+        {
+            return Bitboard(f);
+        }
+
+        [[nodiscard]] static constexpr Bitboard rank(Rank r)
+        {
+            return Bitboard(r);
+        }
+
+        [[nodiscard]] static constexpr Bitboard color(Color c)
+        {
+            return Bitboard(c);
+        }
+
+        [[nodiscard]] static constexpr Bitboard fromBits(std::uint64_t bits)
+        {
+            return Bitboard(bits);
+        }
+
+        // All squares on files left..right, both ends inclusive.
+        // Requires left <= right.
+        [[nodiscard]] static constexpr Bitboard betweenFiles(File left, File right)
+        {
+            assert(left <= right);
+
+            // Files A..right, minus files A..(left - 1) when there are any.
+            const std::uint64_t upToRight = m_filesUpToBB[right];
+            const std::uint64_t beforeLeft =
+                (left == fileA)
+                ? 0
+                : m_filesUpToBB[left - 1];
+
+            return Bitboard::fromBits(upToRight ^ beforeLeft);
+        }
+
+        [[nodiscard]] constexpr bool isEmpty() const
+        {
+            return m_squares == 0;
+        }
+
+        // True when the bit belonging to sq is set.
+        [[nodiscard]] constexpr bool isSet(Square sq) const
+        {
+            const std::uint64_t bit = (m_squares >> ordinal(sq)) & 1ull;
+            return bit != 0;
+        }
+
+        constexpr void set(Square sq)
+        {
+            *this |= Bitboard(sq);
+        }
+
+        constexpr void unset(Square sq)
+        {
+            *this &= ~(Bitboard(sq));
+        }
+
+        constexpr void toggle(Square sq)
+        {
+            *this ^= Bitboard(sq);
+        }
+
+        [[nodiscard]] constexpr BitboardIterator begin() const
+        {
+            return BitboardIterator(m_squares);
+        }
+
+        [[nodiscard]] constexpr BitboardIterator end() const
+        {
+            return BitboardIterator{};
+        }
+
+        [[nodiscard]] constexpr BitboardIterator cbegin() const
+        {
+            return BitboardIterator(m_squares);
+        }
+
+        [[nodiscard]] constexpr BitboardIterator cend() const
+        {
+            return BitboardIterator{};
+        }
+
+        [[nodiscard]] constexpr bool friend operator==(Bitboard lhs, Bitboard rhs) noexcept
+        {
+            return lhs.m_squares == rhs.m_squares;
+        }
+
+        [[nodiscard]] constexpr bool friend operator!=(Bitboard lhs, Bitboard rhs) noexcept
+        {
+            return lhs.m_squares != rhs.m_squares;
+        }
+
+        // Returns a copy shifted by `ranks` ranks: the bits move left by
+        // 8 * ranks for non-negative values and right by 8 * |ranks| otherwise.
+        // Bits shifted past either end are lost.
+        constexpr Bitboard shiftedVertically(int ranks) const
+        {
+            const int bitCount = 8 * ranks;
+
+            return ranks >= 0
+                ? fromBits(m_squares << bitCount)
+                : fromBits(m_squares >> -bitCount);
+        }
+
+        // Moves every set square by `files` files and `ranks` ranks, in place,
+        // with both amounts fixed at compile time (each within [-7, 7]).
+        //
+        // Squares that would cross the left/right board edge are masked out
+        // before shifting so they cannot wrap onto another rank. Squares pushed
+        // off the top or bottom simply fall out of the 64-bit word.
+        template <int files, int ranks>
+        constexpr void shift()
+        {
+            static_assert(files >= -7 && files <= 7);
+            static_assert(ranks >= -7 && ranks <= 7);
+
+            if constexpr (files > 0)
+            {
+                // Moving towards file H: only files A..(H - files) survive.
+                constexpr Bitboard keep = Bitboard::betweenFiles(fileA, fileH - files);
+                m_squares &= keep.m_squares;
+            }
+            else if constexpr (files < 0)
+            {
+                // Moving towards file A: only files (A - files)..H survive.
+                constexpr Bitboard keep = Bitboard::betweenFiles(fileA - files, fileH);
+                m_squares &= keep.m_squares;
+            }
+
+            // One file is one bit, one rank is eight bits.
+            constexpr int delta = files + ranks * 8;
+            if constexpr (delta < 0)
+            {
+                m_squares >>= -delta;
+            }
+            else if constexpr (delta > 0)
+            {
+                m_squares <<= delta;
+            }
+        }
+
+        // Non-mutating counterpart of shift<files, ranks>(): returns a shifted
+        // copy and leaves this bitboard untouched. Marked [[nodiscard]] like
+        // shifted(Offset), since ignoring the result makes the call pointless.
+        template <int files, int ranks>
+        [[nodiscard]] constexpr Bitboard shifted() const
+        {
+            Bitboard result(*this);
+            result.shift<files, ranks>();
+            return result;
+        }
+
+        // Runtime version of shift<files, ranks>(): moves every set square by
+        // offset.files files and offset.ranks ranks, in place. Both components
+        // must be within [-7, 7].
+        constexpr void shift(Offset offset)
+        {
+            assert(offset.files >= -7);
+            assert(offset.files <= 7);
+            assert(offset.ranks >= -7);
+            assert(offset.ranks <= 7);
+
+            // Clear the squares that would cross the left/right edge so they
+            // cannot wrap around onto another rank.
+            if (offset.files > 0)
+            {
+                m_squares &= Bitboard::betweenFiles(fileA, fileH - offset.files).m_squares;
+            }
+            else if (offset.files < 0)
+            {
+                m_squares &= Bitboard::betweenFiles(fileA - offset.files, fileH).m_squares;
+            }
+
+            // One file is one bit, one rank is eight bits.
+            const int delta = offset.files + offset.ranks * 8;
+            if (delta >= 0)
+            {
+                m_squares <<= delta;
+            }
+            else
+            {
+                m_squares >>= -delta;
+            }
+        }
+
+        // Returns a copy of this bitboard shifted by `offset`; see shift(Offset).
+        [[nodiscard]] constexpr Bitboard shifted(Offset offset) const
+        {
+            Bitboard result = *this;
+            result.shift(offset);
+            return result;
+        }
+
+        // Complement: every set square becomes clear and vice versa.
+        [[nodiscard]] constexpr Bitboard operator~() const
+        {
+            Bitboard inverted(*this);
+            inverted.m_squares = ~inverted.m_squares;
+            return inverted;
+        }
+
+        // XORs this bitboard with the bitboard constructed from `c`.
+        constexpr Bitboard& operator^=(Color c)
+        {
+            const Bitboard mask(c);
+            m_squares ^= mask.m_squares;
+            return *this;
+        }
+
+        // ANDs this bitboard with the bitboard constructed from `c`.
+        constexpr Bitboard& operator&=(Color c)
+        {
+            const Bitboard mask(c);
+            m_squares &= mask.m_squares;
+            return *this;
+        }
+
+        // ORs this bitboard with the bitboard constructed from `c`.
+        constexpr Bitboard& operator|=(Color c)
+        {
+            const Bitboard mask(c);
+            m_squares |= mask.m_squares;
+            return *this;
+        }
+
+        // Non-mutating XOR with the bitboard constructed from `c`.
+        [[nodiscard]] constexpr Bitboard operator^(Color c) const
+        {
+            return fromBits(m_squares ^ Bitboard(c).m_squares);
+        }
+
+        // Non-mutating AND with the bitboard constructed from `c`.
+        [[nodiscard]] constexpr Bitboard operator&(Color c) const
+        {
+            return fromBits(m_squares & Bitboard(c).m_squares);
+        }
+
+        // Non-mutating OR with the bitboard constructed from `c`.
+        [[nodiscard]] constexpr Bitboard operator|(Color c) const
+        {
+            return fromBits(m_squares | Bitboard(c).m_squares);
+        }
+
+        // XORs this bitboard with the bitboard constructed from `sq`.
+        constexpr Bitboard& operator^=(Square sq)
+        {
+            const Bitboard single(sq);
+            m_squares ^= single.m_squares;
+            return *this;
+        }
+
+        // ANDs this bitboard with the bitboard constructed from `sq`.
+        constexpr Bitboard& operator&=(Square sq)
+        {
+            const Bitboard single(sq);
+            m_squares &= single.m_squares;
+            return *this;
+        }
+
+        // ORs this bitboard with the bitboard constructed from `sq`.
+        constexpr Bitboard& operator|=(Square sq)
+        {
+            const Bitboard single(sq);
+            m_squares |= single.m_squares;
+            return *this;
+        }
+
+        // Non-mutating XOR with the bitboard constructed from `sq`.
+        [[nodiscard]] constexpr Bitboard operator^(Square sq) const
+        {
+            return fromBits(m_squares ^ Bitboard(sq).m_squares);
+        }
+
+        // Non-mutating AND with the bitboard constructed from `sq`.
+        [[nodiscard]] constexpr Bitboard operator&(Square sq) const
+        {
+            return fromBits(m_squares & Bitboard(sq).m_squares);
+        }
+
+        // Non-mutating OR with the bitboard constructed from `sq`.
+        [[nodiscard]] constexpr Bitboard operator|(Square sq) const
+        {
+            return fromBits(m_squares | Bitboard(sq).m_squares);
+        }
+
+        // Square-on-the-left form of Bitboard ^ Square.
+        [[nodiscard]] constexpr friend Bitboard operator^(Square sq, Bitboard bb)
+        {
+            bb ^= sq;
+            return bb;
+        }
+
+        // Square-on-the-left form of Bitboard & Square.
+        [[nodiscard]] constexpr friend Bitboard operator&(Square sq, Bitboard bb)
+        {
+            bb &= sq;
+            return bb;
+        }
+
+        // Square-on-the-left form of Bitboard | Square.
+        [[nodiscard]] constexpr friend Bitboard operator|(Square sq, Bitboard bb)
+        {
+            bb |= sq;
+            return bb;
+        }
+
+        // In-place symmetric difference with `rhs`.
+        constexpr Bitboard& operator^=(Bitboard rhs)
+        {
+            m_squares = m_squares ^ rhs.m_squares;
+            return *this;
+        }
+
+        // In-place intersection with `rhs`.
+        constexpr Bitboard& operator&=(Bitboard rhs)
+        {
+            m_squares = m_squares & rhs.m_squares;
+            return *this;
+        }
+
+        // In-place union with `rhs`.
+        constexpr Bitboard& operator|=(Bitboard rhs)
+        {
+            m_squares = m_squares | rhs.m_squares;
+            return *this;
+        }
+
+        // Symmetric difference of two bitboards.
+        [[nodiscard]] constexpr Bitboard operator^(Bitboard rhs) const
+        {
+            return fromBits(m_squares ^ rhs.m_squares);
+        }
+
+        // Intersection of two bitboards.
+        [[nodiscard]] constexpr Bitboard operator&(Bitboard rhs) const
+        {
+            return fromBits(m_squares & rhs.m_squares);
+        }
+
+        // Union of two bitboards.
+        [[nodiscard]] constexpr Bitboard operator|(Bitboard rhs) const
+        {
+            return fromBits(m_squares | rhs.m_squares);
+        }
+
+        // Number of set squares, as reported by intrin::popcount.
+        [[nodiscard]] inline int count() const
+        {
+            const auto setBits = intrin::popcount(m_squares);
+            return static_cast<int>(setBits);
+        }
+
+        // True when at least two squares are set.
+        [[nodiscard]] constexpr bool moreThanOne() const
+        {
+            // Clearing the lowest set bit leaves something only if another bit exists.
+            return (m_squares & (m_squares - 1)) != 0;
+        }
+
+        // True when precisely one square is set.
+        [[nodiscard]] constexpr bool exactlyOne() const
+        {
+            if (m_squares == 0)
+            {
+                return false;
+            }
+
+            return !moreThanOne();
+        }
+
+        // True when at least one square is set.
+        [[nodiscard]] constexpr bool any() const
+        {
+            return m_squares != 0;
+        }
+
+        // Square at the index returned by intrin::lsb for the raw bits.
+        // The bitboard must not be empty.
+        [[nodiscard]] inline Square first() const
+        {
+            assert(m_squares != 0);
+
+            const auto lowestIdx = intrin::lsb(m_squares);
+            return fromOrdinal<Square>(lowestIdx);
+        }
+
+        // Returns the n-th set square in first()/popFirst() order; n == 0 is
+        // the same square as first(). Requires 0 <= n < count().
+        [[nodiscard]] inline Square nth(int n) const
+        {
+            // A negative n satisfies `count() > n` but would keep `while (n--)`
+            // popping long after the copy is empty, so reject it explicitly.
+            assert(n >= 0);
+            assert(count() > n);
+
+            Bitboard cpy = *this;
+            while (n--) cpy.popFirst();
+            return cpy.first();
+        }
+
+        // Square at the index returned by intrin::msb for the raw bits.
+        // The bitboard must not be empty.
+        [[nodiscard]] inline Square last() const
+        {
+            assert(m_squares != 0);
+
+            const auto highestIdx = intrin::msb(m_squares);
+            return fromOrdinal<Square>(highestIdx);
+        }
+
+        // Raw 64-bit representation of the set squares.
+        [[nodiscard]] constexpr std::uint64_t bits() const
+        {
+            return this->m_squares;
+        }
+
+        // Clears the lowest set bit. The bitboard must not be empty.
+        constexpr void popFirst()
+        {
+            assert(m_squares != 0);
+
+            // x & (x - 1) drops exactly the lowest set bit of x.
+            m_squares = m_squares & (m_squares - 1);
+        }
+
+        // Copy assignment is the compiler-generated member-wise copy.
+        constexpr Bitboard& operator=(const Bitboard& other) = default;
+
+    private:
+        // One bit per square; bit `ordinal(sq)` is set when `sq` is in the set
+        // (see isSet).
+        std::uint64_t m_squares;
+    };
+
+    // XOR of the bitboards of two squares.
+    [[nodiscard]] constexpr Bitboard operator^(Square sq0, Square sq1)
+    {
+        Bitboard result = Bitboard::square(sq0);
+        result ^= sq1;
+        return result;
+    }
+
+    // AND of the bitboards of two squares.
+    [[nodiscard]] constexpr Bitboard operator&(Square sq0, Square sq1)
+    {
+        Bitboard result = Bitboard::square(sq0);
+        result &= sq1;
+        return result;
+    }
+
+    // OR of the bitboards of two squares.
+    [[nodiscard]] constexpr Bitboard operator|(Square sq0, Square sq1)
+    {
+        Bitboard result = Bitboard::square(sq0);
+        result |= sq1;
+        return result;
+    }
+
+    // User-defined literal: `0x...ull_bb` yields the Bitboard with those raw bits.
+    [[nodiscard]] constexpr Bitboard operator""_bb(unsigned long long bits)
+    {
+        const Bitboard fromLiteral = Bitboard::fromBits(bits);
+        return fromLiteral;
+    }
+
+    namespace bb
+    {
+        namespace fancy_magics
+        {
+            // Implementation based on https://github.com/syzygy1/Cfish
+
+            // Per-square 64-bit multipliers used by rookAttacks(): the masked
+            // occupancy is multiplied by the square's entry and shifted right by
+            // g_rookShifts[s] to obtain the index into g_rookAttacks[s].
+            // NOTE(review): some values repeat on adjacent entries (zero-based
+            // entries 56/57 and 60-62 in listing order) - confirm against the
+            // upstream table that this is intended.
+            alignas(64) constexpr EnumArray<Square, std::uint64_t> g_rookMagics{ {
+                0x0A80004000801220ull,
+                0x8040004010002008ull,
+                0x2080200010008008ull,
+                0x1100100008210004ull,
+                0xC200209084020008ull,
+                0x2100010004000208ull,
+                0x0400081000822421ull,
+                0x0200010422048844ull,
+                0x0800800080400024ull,
+                0x0001402000401000ull,
+                0x3000801000802001ull,
+                0x4400800800100083ull,
+                0x0904802402480080ull,
+                0x4040800400020080ull,
+                0x0018808042000100ull,
+                0x4040800080004100ull,
+                0x0040048001458024ull,
+                0x00A0004000205000ull,
+                0x3100808010002000ull,
+                0x4825010010000820ull,
+                0x5004808008000401ull,
+                0x2024818004000A00ull,
+                0x0005808002000100ull,
+                0x2100060004806104ull,
+                0x0080400880008421ull,
+                0x4062220600410280ull,
+                0x010A004A00108022ull,
+                0x0000100080080080ull,
+                0x0021000500080010ull,
+                0x0044000202001008ull,
+                0x0000100400080102ull,
+                0xC020128200040545ull,
+                0x0080002000400040ull,
+                0x0000804000802004ull,
+                0x0000120022004080ull,
+                0x010A386103001001ull,
+                0x9010080080800400ull,
+                0x8440020080800400ull,
+                0x0004228824001001ull,
+                0x000000490A000084ull,
+                0x0080002000504000ull,
+                0x200020005000C000ull,
+                0x0012088020420010ull,
+                0x0010010080080800ull,
+                0x0085001008010004ull,
+                0x0002000204008080ull,
+                0x0040413002040008ull,
+                0x0000304081020004ull,
+                0x0080204000800080ull,
+                0x3008804000290100ull,
+                0x1010100080200080ull,
+                0x2008100208028080ull,
+                0x5000850800910100ull,
+                0x8402019004680200ull,
+                0x0120911028020400ull,
+                0x0000008044010200ull,
+                0x0020850200244012ull,
+                0x0020850200244012ull,
+                0x0000102001040841ull,
+                0x140900040A100021ull,
+                0x000200282410A102ull,
+                0x000200282410A102ull,
+                0x000200282410A102ull,
+                0x4048240043802106ull
+                    } };
+
+            // Per-square 64-bit multipliers used by bishopAttacks(): the masked
+            // occupancy is multiplied by the square's entry and shifted right by
+            // g_bishopShifts[s] to obtain the index into g_bishopAttacks[s].
+            // NOTE(review): some values repeat (zero-based entries 32/33 and
+            // 45/47 in listing order) - confirm against the upstream table that
+            // this is intended.
+            alignas(64) constexpr EnumArray<Square, std::uint64_t> g_bishopMagics{ {
+                0x40106000A1160020ull,
+                0x0020010250810120ull,
+                0x2010010220280081ull,
+                0x002806004050C040ull,
+                0x0002021018000000ull,
+                0x2001112010000400ull,
+                0x0881010120218080ull,
+                0x1030820110010500ull,
+                0x0000120222042400ull,
+                0x2000020404040044ull,
+                0x8000480094208000ull,
+                0x0003422A02000001ull,
+                0x000A220210100040ull,
+                0x8004820202226000ull,
+                0x0018234854100800ull,
+                0x0100004042101040ull,
+                0x0004001004082820ull,
+                0x0010000810010048ull,
+                0x1014004208081300ull,
+                0x2080818802044202ull,
+                0x0040880C00A00100ull,
+                0x0080400200522010ull,
+                0x0001000188180B04ull,
+                0x0080249202020204ull,
+                0x1004400004100410ull,
+                0x00013100A0022206ull,
+                0x2148500001040080ull,
+                0x4241080011004300ull,
+                0x4020848004002000ull,
+                0x10101380D1004100ull,
+                0x0008004422020284ull,
+                0x01010A1041008080ull,
+                0x0808080400082121ull,
+                0x0808080400082121ull,
+                0x0091128200100C00ull,
+                0x0202200802010104ull,
+                0x8C0A020200440085ull,
+                0x01A0008080B10040ull,
+                0x0889520080122800ull,
+                0x100902022202010Aull,
+                0x04081A0816002000ull,
+                0x0000681208005000ull,
+                0x8170840041008802ull,
+                0x0A00004200810805ull,
+                0x0830404408210100ull,
+                0x2602208106006102ull,
+                0x1048300680802628ull,
+                0x2602208106006102ull,
+                0x0602010120110040ull,
+                0x0941010801043000ull,
+                0x000040440A210428ull,
+                0x0008240020880021ull,
+                0x0400002012048200ull,
+                0x00AC102001210220ull,
+                0x0220021002009900ull,
+                0x84440C080A013080ull,
+                0x0001008044200440ull,
+                0x0004C04410841000ull,
+                0x2000500104011130ull,
+                0x1A0C010011C20229ull,
+                0x0044800112202200ull,
+                0x0434804908100424ull,
+                0x0300404822C08200ull,
+                0x48081010008A2A80ull
+            } };
+
+            // Per-square lookup data read by rookAttacks(): occupancy mask,
+            // right-shift amount, and pointer to the square's attack entries.
+            // These are not initialized in this section; they must be filled
+            // before the first lookup.
+            alignas(64) static EnumArray<Square, Bitboard> g_rookMasks;
+            alignas(64) static EnumArray<Square, std::uint8_t> g_rookShifts;
+            alignas(64) static EnumArray<Square, const Bitboard*> g_rookAttacks;
+
+            // Same three tables for bishops, read by bishopAttacks().
+            alignas(64) static EnumArray<Square, Bitboard> g_bishopMasks;
+            alignas(64) static EnumArray<Square, std::uint8_t> g_bishopShifts;
+            alignas(64) static EnumArray<Square, const Bitboard*> g_bishopAttacks;
+
+            // Flat storage for attack sets.
+            // NOTE(review): presumably g_rookAttacks[s] / g_bishopAttacks[s]
+            // point into these arrays - confirm where they are initialized.
+            alignas(64) static std::array<Bitboard, 102400> g_allRookAttacks;
+            alignas(64) static std::array<Bitboard, 5248> g_allBishopAttacks;
+
+            // Looks up the bishop attack set for square `s` given `occupied`:
+            // mask the occupancy, multiply by the square's magic, shift, and
+            // use the result as an index into the square's attack entries.
+            inline Bitboard bishopAttacks(Square s, Bitboard occupied)
+            {
+                const std::uint64_t relevant = (occupied & g_bishopMasks[s]).bits();
+                const std::size_t idx = (relevant * g_bishopMagics[s]) >> g_bishopShifts[s];
+
+                return g_bishopAttacks[s][idx];
+            }
+
+            // Looks up the rook attack set for square `s` given `occupied`:
+            // mask the occupancy, multiply by the square's magic, shift, and
+            // use the result as an index into the square's attack entries.
+            inline Bitboard rookAttacks(Square s, Bitboard occupied)
+            {
+                const std::uint64_t relevant = (occupied & g_rookMasks[s]).bits();
+                const std::size_t idx = (relevant * g_rookMagics[s]) >> g_rookShifts[s];
+
+                return g_rookAttacks[s][idx];
+            }
+        }
+
+        // Namespace-level shorthand for Bitboard::square.
+        [[nodiscard]] constexpr Bitboard square(Square sq)
+        {
+            const Bitboard result = Bitboard::square(sq);
+            return result;
+        }
+
+        // Namespace-level shorthand for Bitboard::rank.
+        [[nodiscard]] constexpr Bitboard rank(Rank rank)
+        {
+            const Bitboard result = Bitboard::rank(rank);
+            return result;
+        }
+
+        // Namespace-level shorthand for Bitboard::file.
+        [[nodiscard]] constexpr Bitboard file(File file)
+        {
+            const Bitboard result = Bitboard::file(file);
+            return result;
+        }
+
+        // Namespace-level shorthand for Bitboard::color.
+        [[nodiscard]] constexpr Bitboard color(Color c)
+        {
+            const Bitboard result = Bitboard::color(c);
+            return result;
+        }
+
+        // Bitboard built from nbitmask<std::uint64_t>[ordinal(sq)].
+        // NOTE(review): presumably the mask of all squares with a lower ordinal
+        // than `sq`; nbitmask is defined outside this section - confirm.
+        [[nodiscard]] constexpr Bitboard before(Square sq)
+        {
+            const auto idx = ordinal(sq);
+            return Bitboard::fromBits(nbitmask<std::uint64_t>[idx]);
+        }
+
+        // Bitboards for the two square colors, as produced by bb::color.
+        constexpr Bitboard lightSquares = bb::color(Color::White);
+        constexpr Bitboard darkSquares = bb::color(Color::Black);
+
+        // One bitboard per file. These deliberately reuse the names of the
+        // chess::fileX constants; inside namespace bb the unqualified name
+        // refers to the Bitboard.
+        constexpr Bitboard fileA = bb::file(chess::fileA);
+        constexpr Bitboard fileB = bb::file(chess::fileB);
+        constexpr Bitboard fileC = bb::file(chess::fileC);
+        constexpr Bitboard fileD = bb::file(chess::fileD);
+        constexpr Bitboard fileE = bb::file(chess::fileE);
+        constexpr Bitboard fileF = bb::file(chess::fileF);
+        constexpr Bitboard fileG = bb::file(chess::fileG);
+        constexpr Bitboard fileH = bb::file(chess::fileH);
+
+        // One bitboard per rank.
+        constexpr Bitboard rank1 = bb::rank(chess::rank1);
+        constexpr Bitboard rank2 = bb::rank(chess::rank2);
+        constexpr Bitboard rank3 = bb::rank(chess::rank3);
+        constexpr Bitboard rank4 = bb::rank(chess::rank4);
+        constexpr Bitboard rank5 = bb::rank(chess::rank5);
+        constexpr Bitboard rank6 = bb::rank(chess::rank6);
+        constexpr Bitboard rank7 = bb::rank(chess::rank7);
+        constexpr Bitboard rank8 = bb::rank(chess::rank8);
+
+        // One single-square bitboard per board square, grouped by file.
+        // File A.
+        constexpr Bitboard a1 = bb::square(chess::a1);
+        constexpr Bitboard a2 = bb::square(chess::a2);
+        constexpr Bitboard a3 = bb::square(chess::a3);
+        constexpr Bitboard a4 = bb::square(chess::a4);
+        constexpr Bitboard a5 = bb::square(chess::a5);
+        constexpr Bitboard a6 = bb::square(chess::a6);
+        constexpr Bitboard a7 = bb::square(chess::a7);
+        constexpr Bitboard a8 = bb::square(chess::a8);
+
+        // File B.
+        constexpr Bitboard b1 = bb::square(chess::b1);
+        constexpr Bitboard b2 = bb::square(chess::b2);
+        constexpr Bitboard b3 = bb::square(chess::b3);
+        constexpr Bitboard b4 = bb::square(chess::b4);
+        constexpr Bitboard b5 = bb::square(chess::b5);
+        constexpr Bitboard b6 = bb::square(chess::b6);
+        constexpr Bitboard b7 = bb::square(chess::b7);
+        constexpr Bitboard b8 = bb::square(chess::b8);
+
+        // File C.
+        constexpr Bitboard c1 = bb::square(chess::c1);
+        constexpr Bitboard c2 = bb::square(chess::c2);
+        constexpr Bitboard c3 = bb::square(chess::c3);
+        constexpr Bitboard c4 = bb::square(chess::c4);
+        constexpr Bitboard c5 = bb::square(chess::c5);
+        constexpr Bitboard c6 = bb::square(chess::c6);
+        constexpr Bitboard c7 = bb::square(chess::c7);
+        constexpr Bitboard c8 = bb::square(chess::c8);
+
+        // File D.
+        constexpr Bitboard d1 = bb::square(chess::d1);
+        constexpr Bitboard d2 = bb::square(chess::d2);
+        constexpr Bitboard d3 = bb::square(chess::d3);
+        constexpr Bitboard d4 = bb::square(chess::d4);
+        constexpr Bitboard d5 = bb::square(chess::d5);
+        constexpr Bitboard d6 = bb::square(chess::d6);
+        constexpr Bitboard d7 = bb::square(chess::d7);
+        constexpr Bitboard d8 = bb::square(chess::d8);
+
+        // File E.
+        constexpr Bitboard e1 = bb::square(chess::e1);
+        constexpr Bitboard e2 = bb::square(chess::e2);
+        constexpr Bitboard e3 = bb::square(chess::e3);
+        constexpr Bitboard e4 = bb::square(chess::e4);
+        constexpr Bitboard e5 = bb::square(chess::e5);
+        constexpr Bitboard e6 = bb::square(chess::e6);
+        constexpr Bitboard e7 = bb::square(chess::e7);
+        constexpr Bitboard e8 = bb::square(chess::e8);
+
+        // File F.
+        constexpr Bitboard f1 = bb::square(chess::f1);
+        constexpr Bitboard f2 = bb::square(chess::f2);
+        constexpr Bitboard f3 = bb::square(chess::f3);
+        constexpr Bitboard f4 = bb::square(chess::f4);
+        constexpr Bitboard f5 = bb::square(chess::f5);
+        constexpr Bitboard f6 = bb::square(chess::f6);
+        constexpr Bitboard f7 = bb::square(chess::f7);
+        constexpr Bitboard f8 = bb::square(chess::f8);
+
+        // File G.
+        constexpr Bitboard g1 = bb::square(chess::g1);
+        constexpr Bitboard g2 = bb::square(chess::g2);
+        constexpr Bitboard g3 = bb::square(chess::g3);
+        constexpr Bitboard g4 = bb::square(chess::g4);
+        constexpr Bitboard g5 = bb::square(chess::g5);
+        constexpr Bitboard g6 = bb::square(chess::g6);
+        constexpr Bitboard g7 = bb::square(chess::g7);
+        constexpr Bitboard g8 = bb::square(chess::g8);
+
+        // File H.
+        constexpr Bitboard h1 = bb::square(chess::h1);
+        constexpr Bitboard h2 = bb::square(chess::h2);
+        constexpr Bitboard h3 = bb::square(chess::h3);
+        constexpr Bitboard h4 = bb::square(chess::h4);
+        constexpr Bitboard h5 = bb::square(chess::h5);
+        constexpr Bitboard h6 = bb::square(chess::h6);
+        constexpr Bitboard h7 = bb::square(chess::h7);
+        constexpr Bitboard h8 = bb::square(chess::h8);
+
+        // Forward declarations; the definitions are not in this section.
+
+        // NOTE(review): presumably the squares lying between s1 and s2 when
+        // they share a line - confirm against the definition.
+        [[nodiscard]] Bitboard between(Square s1, Square s2);
+
+        // NOTE(review): presumably the full line through s1 and s2 - confirm
+        // against the definition.
+        [[nodiscard]] Bitboard line(Square s1, Square s2);
+
+        // Occupancy-independent attack set for a piece type chosen at compile
+        // time; used by attacks<PieceTypeV>() for non-sliding pieces.
+        template <PieceType PieceTypeV>
+        [[nodiscard]] Bitboard pseudoAttacks(Square sq);
+
+        // Runtime-dispatched counterpart; used as the fallback of
+        // attacks(PieceType, Square, Bitboard).
+        [[nodiscard]] Bitboard pseudoAttacks(PieceType pt, Square sq);
+
+        // Attack set of a `PieceTypeV` standing on `sq` with the board occupancy
+        // `occupied`. Bishops, rooks and queens go through the magic lookup
+        // tables; every other allowed piece type ignores `occupied` and returns
+        // its pseudo attacks. Pawns and PieceType::None are rejected at compile
+        // time. Marked [[nodiscard]] to match the runtime-dispatched overload.
+        template <PieceType PieceTypeV>
+        [[nodiscard]] Bitboard attacks(Square sq, Bitboard occupied)
+        {
+            static_assert(PieceTypeV != PieceType::None && PieceTypeV != PieceType::Pawn);
+
+            assert(sq.isOk());
+
+            if constexpr (PieceTypeV == PieceType::Bishop)
+            {
+                return fancy_magics::bishopAttacks(sq, occupied);
+            }
+            else if constexpr (PieceTypeV == PieceType::Rook)
+            {
+                return fancy_magics::rookAttacks(sq, occupied);
+            }
+            else if constexpr (PieceTypeV == PieceType::Queen)
+            {
+                // A queen attacks as a bishop and a rook combined.
+                return
+                    fancy_magics::bishopAttacks(sq, occupied)
+                    | fancy_magics::rookAttacks(sq, occupied);
+            }
+            else
+            {
+                return pseudoAttacks<PieceTypeV>(sq);
+            }
+        }
+
+        // Runtime-dispatched attack lookup: sliders are forwarded to the
+        // matching attacks<PieceTypeV>() instantiation, everything else falls
+        // back to pseudoAttacks(pt, sq), which ignores `occupied`.
+        [[nodiscard]] inline Bitboard attacks(PieceType pt, Square sq, Bitboard occupied)
+        {
+            assert(sq.isOk());
+
+            if (pt == PieceType::Bishop)
+            {
+                return attacks<PieceType::Bishop>(sq, occupied);
+            }
+
+            if (pt == PieceType::Rook)
+            {
+                return attacks<PieceType::Rook>(sq, occupied);
+            }
+
+            if (pt == PieceType::Queen)
+            {
+                return attacks<PieceType::Queen>(sq, occupied);
+            }
+
+            return pseudoAttacks(pt, sq);
+        }
+
+        // Forward declarations; the definitions are not in this section.
+        // NOTE(review): presumably the set of squares attacked by all `pawns`
+        // of the given color - confirm against the definition.
+        [[nodiscard]] inline Bitboard pawnAttacks(Bitboard pawns, Color color);
+
+        // NOTE(review): presumably only the attacks towards the west side -
+        // confirm against the definition.
+        [[nodiscard]] inline Bitboard westPawnAttacks(Bitboard pawns, Color color);
+
+        // NOTE(review): presumably only the attacks towards the east side -
+        // confirm against the definition.
+        [[nodiscard]] inline Bitboard eastPawnAttacks(Bitboard pawns, Color color);
+
+        // NOTE(review): presumably true when `sq` is attacked by any of the
+        // given sliders with `occupied` as blockers - confirm against the
+        // definition.
+        [[nodiscard]] inline bool isAttackedBySlider(
+            Square sq,
+            Bitboard bishops,
+            Bitboard rooks,
+            Bitboard queens,
+            Bitboard occupied
+        );
+
+        namespace detail
+        {
+            // The eight knight jumps and the eight king steps, as Offset values
+            // added to a square's coordinates by the generators below.
+            static constexpr std::array<Offset, 8> knightOffsets{ { {-1, -2}, {-1, 2}, {1, -2}, {1, 2}, {-2, -1}, {-2, 1}, {2, -1}, {2, 1} } };
+            static constexpr std::array<Offset, 8> kingOffsets{ { {-1, -1}, {-1, 0}, {-1, 1}, {0, -1}, {0, 1}, {1, -1}, {1, 0}, {1, 1} } };
+
+            // Compass directions, numbered clockwise from North. The values are
+            // used as indices into `offsets` and into the ray tables, so the
+            // order must stay in sync with them.
+            enum Direction
+            {
+                North = 0,
+                NorthEast,
+                East,
+                SouthEast,
+                South,
+                SouthWest,
+                West,
+                NorthWest
+            };
+
+            // Single-step Offset for each Direction, indexed by the enum value
+            // (same order as the Direction enumerators).
+            constexpr std::array<Offset, 8> offsets = { {
+                { 0, 1 },
+                { 1, 1 },
+                { 1, 0 },
+                { 1, -1 },
+                { 0, -1 },
+                { -1, -1 },
+                { -1, 0 },
+                { -1, 1 }
+            } };
+
+            // The four diagonal steps, used for bishop rays.
+            static constexpr std::array<Offset, 4> bishopOffsets{
+                offsets[NorthEast],
+                offsets[SouthEast],
+                offsets[SouthWest],
+                offsets[NorthWest]
+            };
+            // The four orthogonal steps, used for rook rays.
+            static constexpr std::array<Offset, 4> rookOffsets{
+                offsets[North],
+                offsets[East],
+                offsets[South],
+                offsets[West]
+            };
+
+            // Placeholder table for pawns: pseudo attacks don't make sense for
+            // them, so a value-initialized table is returned.
+            [[nodiscard]] static EnumArray<Square, Bitboard> generatePseudoAttacks_Pawn()
+            {
+                return EnumArray<Square, Bitboard>{};
+            }
+
+            // Builds the knight table: for every square, the set of on-board
+            // squares reachable by one of the knightOffsets jumps.
+            [[nodiscard]] static EnumArray<Square, Bitboard> generatePseudoAttacks_Knight()
+            {
+                EnumArray<Square, Bitboard> table{};
+
+                for (Square from = chess::a1; from != Square::none(); ++from)
+                {
+                    Bitboard targets{};
+
+                    for (const Offset& jump : knightOffsets)
+                    {
+                        const SquareCoords dest = from.coords() + jump;
+                        if (!dest.isOk())
+                        {
+                            // Jump leaves the board.
+                            continue;
+                        }
+
+                        targets |= Square(dest);
+                    }
+
+                    table[from] = targets;
+                }
+
+                return table;
+            }
+
+            // Collects every square reachable from `fromSq` by repeatedly
+            // stepping along each of the four given offsets until the board
+            // edge. `fromSq` itself is not included.
+            [[nodiscard]] static Bitboard generateSliderPseudoAttacks(const std::array<Offset, 4> & offsets_, Square fromSq)
+            {
+                assert(fromSq.isOk());
+
+                Bitboard reachable{};
+
+                for (const Offset& step : offsets_)
+                {
+                    SquareCoords cursor = fromSq.coords();
+
+                    // Walk outwards one step at a time while still on the board.
+                    for (cursor += step; cursor.isOk(); cursor += step)
+                    {
+                        reachable |= Square(cursor);
+                    }
+                }
+
+                return reachable;
+            }
+
+            // Builds the bishop table: full diagonal rays from every square.
+            [[nodiscard]] static EnumArray<Square, Bitboard> generatePseudoAttacks_Bishop()
+            {
+                EnumArray<Square, Bitboard> table{};
+
+                Square from = chess::a1;
+                while (from != Square::none())
+                {
+                    table[from] = generateSliderPseudoAttacks(bishopOffsets, from);
+                    ++from;
+                }
+
+                return table;
+            }
+
+            // Builds the rook table: full orthogonal rays from every square.
+            [[nodiscard]] static EnumArray<Square, Bitboard> generatePseudoAttacks_Rook()
+            {
+                EnumArray<Square, Bitboard> table{};
+
+                Square from = chess::a1;
+                while (from != Square::none())
+                {
+                    table[from] = generateSliderPseudoAttacks(rookOffsets, from);
+                    ++from;
+                }
+
+                return table;
+            }
+
+            // Builds the queen table: union of the diagonal and orthogonal rays
+            // from every square.
+            [[nodiscard]] static EnumArray<Square, Bitboard> generatePseudoAttacks_Queen()
+            {
+                EnumArray<Square, Bitboard> table{};
+
+                for (Square from = chess::a1; from != Square::none(); ++from)
+                {
+                    const Bitboard diagonal = generateSliderPseudoAttacks(bishopOffsets, from);
+                    const Bitboard orthogonal = generateSliderPseudoAttacks(rookOffsets, from);
+
+                    table[from] = diagonal | orthogonal;
+                }
+
+                return table;
+            }
+
+            // Builds the king table: for every square, the set of on-board
+            // squares one kingOffsets step away.
+            [[nodiscard]] static EnumArray<Square, Bitboard> generatePseudoAttacks_King()
+            {
+                EnumArray<Square, Bitboard> table{};
+
+                for (Square from = chess::a1; from != Square::none(); ++from)
+                {
+                    Bitboard targets{};
+
+                    for (const Offset& step : kingOffsets)
+                    {
+                        const SquareCoords dest = from.coords() + step;
+                        if (!dest.isOk())
+                        {
+                            // Step leaves the board.
+                            continue;
+                        }
+
+                        targets |= Square(dest);
+                    }
+
+                    table[from] = targets;
+                }
+
+                return table;
+            }
+
+            // Assembles the per-piece-type pseudo attack tables, listed in the
+            // order pawn, knight, bishop, rook, queen, king.
+            [[nodiscard]] static EnumArray2<PieceType, Square, Bitboard> generatePseudoAttacks()
+            {
+                EnumArray2<PieceType, Square, Bitboard> tables{
+                    generatePseudoAttacks_Pawn(),
+                    generatePseudoAttacks_Knight(),
+                    generatePseudoAttacks_Bishop(),
+                    generatePseudoAttacks_Rook(),
+                    generatePseudoAttacks_Queen(),
+                    generatePseudoAttacks_King()
+                };
+
+                return tables;
+            }
+
+            // Accessor for the pseudo attack tables; they are generated once,
+            // on the first call, and kept in a function-local static.
+            static const EnumArray2<PieceType, Square, Bitboard>& pseudoAttacks()
+            {
+                static const EnumArray2<PieceType, Square, Bitboard> s_tables(generatePseudoAttacks());
+                return s_tables;
+            }
+
+            // Full ray from `fromSq` in direction `dir` up to the board edge,
+            // excluding `fromSq` itself.
+            [[nodiscard]] static Bitboard generatePositiveRayAttacks(Direction dir, Square fromSq)
+            {
+                assert(fromSq.isOk());
+
+                const auto step = offsets[dir];
+                Bitboard ray{};
+
+                // Walk outwards one step at a time while still on the board.
+                SquareCoords cursor = fromSq.coords();
+                for (cursor += step; cursor.isOk(); cursor += step)
+                {
+                    ray |= Square(cursor);
+                }
+
+                return ray;
+            }
+
+            // classical slider move generation approach https://www.chessprogramming.org/Classical_Approach
+
+            // Per-square table of full rays in direction `dir`.
+            [[nodiscard]] static EnumArray<Square, Bitboard> generatePositiveRayAttacks(Direction dir)
+            {
+                EnumArray<Square, Bitboard> rays{};
+
+                Square from = chess::a1;
+                while (from != Square::none())
+                {
+                    rays[from] = generatePositiveRayAttacks(dir, from);
+                    ++from;
+                }
+
+                return rays;
+            }
+
+            // Ray tables for all eight directions, indexed by Direction value.
+            [[nodiscard]] static std::array<EnumArray<Square, Bitboard>, 8> generatePositiveRayAttacks()
+            {
+                std::array<EnumArray<Square, Bitboard>, 8> rays{};
+
+                // Direction enumerators are consecutive from North to NorthWest.
+                for (int d = North; d <= NorthWest; ++d)
+                {
+                    rays[d] = generatePositiveRayAttacks(static_cast<Direction>(d));
+                }
+
+                return rays;
+            }
+
+
+            // Accessor for the ray tables; they are generated once, on the
+            // first call, and kept in a function-local static.
+            static const std::array<EnumArray<Square, Bitboard>, 8>& positiveRayAttacks()
+            {
+                static const std::array<EnumArray<Square, Bitboard>, 8> s_rays(generatePositiveRayAttacks());
+                return s_rays;
+            }
+
+            // Attacks of a slider on `sq` along the single direction `DirV`,
+            // stopping at (and including) the first occupied square.
+            //
+            // The full ray is taken, the nearest blocker on it is found, and the
+            // part of the ray beyond that blocker is removed by XOR-ing with the
+            // blocker's own ray in the same direction.
+            template <Direction DirV>
+            [[nodiscard]] static Bitboard slidingAttacks(Square sq, Bitboard occupied)
+            {
+                assert(sq.isOk());
+
+                const auto& rays = positiveRayAttacks()[DirV];
+                const Bitboard ray = rays[sq];
+
+                // These four directions increase the square ordinal, so the
+                // nearest blocker is the lowest set bit.
+                constexpr bool increasing =
+                    DirV == NorthWest || DirV == North || DirV == NorthEast || DirV == East;
+
+                if constexpr (increasing)
+                {
+                    // H8 is added as a sentinel so first() never sees an empty
+                    // bitboard; its ray in these directions is empty, so it
+                    // removes nothing when there is no real blocker.
+                    const Bitboard blockers = (ray & occupied) | h8;
+                    return ray ^ rays[blockers.first()];
+                }
+                else
+                {
+                    // Decreasing ordinal: the nearest blocker is the highest set
+                    // bit, and A1 plays the sentinel role for last().
+                    const Bitboard blockers = (ray & occupied) | a1;
+                    return ray ^ rays[blockers.last()];
+                }
+            }
+
+            // Explicit instantiations of slidingAttacks for all eight directions.
+            template Bitboard slidingAttacks<Direction::North>(Square, Bitboard);
+            template Bitboard slidingAttacks<Direction::NorthEast>(Square, Bitboard);
+            template Bitboard slidingAttacks<Direction::East>(Square, Bitboard);
+            template Bitboard slidingAttacks<Direction::SouthEast>(Square, Bitboard);
+            template Bitboard slidingAttacks<Direction::South>(Square, Bitboard);
+            template Bitboard slidingAttacks<Direction::SouthWest>(Square, Bitboard);
+            template Bitboard slidingAttacks<Direction::West>(Square, Bitboard);
+            template Bitboard slidingAttacks<Direction::NorthWest>(Square, Bitboard);
+
+            // Attacks of a sliding piece of type PieceTypeV standing on sq, for the given occupied squares.
+            // The result is the union of the directional slidingAttacks() rays that piece moves along.
+            // Only Rook, Bishop and Queen are accepted; anything else fails to compile.
+            template <PieceType PieceTypeV>
+            [[nodiscard]] inline Bitboard pieceSlidingAttacks(Square sq, Bitboard occupied)
+            {
+                static_assert(
+                    PieceTypeV == PieceType::Rook
+                    || PieceTypeV == PieceType::Bishop
+                    || PieceTypeV == PieceType::Queen);
+
+                assert(sq.isOk());
+
+                if constexpr (PieceTypeV == PieceType::Queen)
+                {
+                    // A queen covers the rook directions plus the bishop directions.
+                    return
+                        pieceSlidingAttacks<PieceType::Rook>(sq, occupied)
+                        | pieceSlidingAttacks<PieceType::Bishop>(sq, occupied);
+                }
+                else if constexpr (PieceTypeV == PieceType::Bishop)
+                {
+                    // The four diagonal rays.
+                    return
+                        detail::slidingAttacks<detail::NorthEast>(sq, occupied)
+                        | detail::slidingAttacks<detail::SouthEast>(sq, occupied)
+                        | detail::slidingAttacks<detail::SouthWest>(sq, occupied)
+                        | detail::slidingAttacks<detail::NorthWest>(sq, occupied);
+                }
+                else // if constexpr (PieceTypeV == PieceType::Rook)
+                {
+                    // The four rank/file rays.
+                    return
+                        detail::slidingAttacks<detail::North>(sq, occupied)
+                        | detail::slidingAttacks<detail::East>(sq, occupied)
+                        | detail::slidingAttacks<detail::South>(sq, occupied)
+                        | detail::slidingAttacks<detail::West>(sq, occupied);
+                }
+            }
+
+            // Squares strictly between s1 and s2 when they share a file, rank or diagonal; empty otherwise.
+            static Bitboard generateBetween(Square s1, Square s2)
+            {
+                if (s1 == s2)
+                {
+                    return Bitboard::none();
+                }
+
+                const int fileDelta = s2.file() - s1.file();
+                const int rankDelta = s2.rank() - s1.rank();
+                const bool aligned = fileDelta == 0 || rankDelta == 0 || fileDelta == rankDelta || fileDelta == -rankDelta;
+                if (!aligned)
+                {
+                    return Bitboard::none();
+                }
+
+                // One-square step towards s2: the sign of each delta.
+                const int fileStep = (fileDelta > 0) - (fileDelta < 0);
+                const int rankStep = (rankDelta > 0) - (rankDelta < 0);
+                const auto step = FlatSquareOffset(fileStep, rankStep);
+
+                Bitboard squares = Bitboard::none();
+                for (s1 += step; s1 != s2; s1 += step) // starts past s1 and stops before s2
+                {
+                    squares |= s1;
+                }
+                return squares;
+            }
+
+            // Bitboard for the pair (s1, s2): empty unless s2 is in s1's bishop or rook pseudo-attacks.
+            static Bitboard generateLine(Square s1, Square s2)
+            {
+                // Bishop is tried before rook; for the first type that connects the squares, the result is
+                // the overlap of both squares' pseudo-attacks for that type plus s1 and s2 themselves.
+                for (PieceType pt : { PieceType::Bishop, PieceType::Rook })
+                {
+                    const Bitboard fromS1 = pseudoAttacks()[pt][s1];
+                    if (!fromS1.isSet(s2)) continue;
+                    return (fromS1 & pseudoAttacks()[pt][s2]) | s1 | s2;
+                }
+
+                return Bitboard::none();
+            }
+
+            // Lookup table: between[s1][s2] caches generateBetween(s1, s2) for every ordered pair of squares.
+            static const EnumArray2<Square, Square, Bitboard> between = []()
+            {
+                EnumArray2<Square, Square, Bitboard> table;
+                for (Square from : values<Square>())
+                {
+                    for (Square to : values<Square>())
+                    {
+                        table[from][to] = generateBetween(from, to);
+                    }
+                }
+
+                return table;
+            }();
+
+            // Lookup table: line[s1][s2] caches generateLine(s1, s2) for every ordered pair of squares.
+            static const EnumArray2<Square, Square, Bitboard> line = []()
+            {
+                EnumArray2<Square, Square, Bitboard> table;
+                for (Square from : values<Square>())
+                {
+                    for (Square to : values<Square>())
+                    {
+                        table[from][to] = generateLine(from, to);
+                    }
+                }
+
+                return table;
+            }();
+        }
+
+        namespace fancy_magics
+        {
+            enum struct MagicsType // selects which slider the magic tables are built for
+            {
+                Rook,
+                Bishop
+            };
+
+            // Maps a MagicsType onto the matching detail::pieceSlidingAttacks instantiation.
+            template <MagicsType TypeV>
+            [[nodiscard]] inline Bitboard slidingAttacks(Square sq, Bitboard occupied)
+            {
+                switch (TypeV)
+                {
+                case MagicsType::Rook:
+                    return chess::bb::detail::pieceSlidingAttacks<PieceType::Rook>(sq, occupied);
+                case MagicsType::Bishop:
+                    return chess::bb::detail::pieceSlidingAttacks<PieceType::Bishop>(sq, occupied);
+                default:
+                    // Any other value yields no attacks.
+                    return Bitboard::none();
+                }
+            }
+
+            template <MagicsType TypeV, std::size_t SizeV>
+            [[nodiscard]] inline bool initMagics( // fills the magic lookup tables for one slider type
+                const EnumArray<Square, std::uint64_t>& magics, // in: multiplier per square
+                std::array<Bitboard, SizeV>& table, // out: backing storage shared by all squares
+                EnumArray<Square, Bitboard>& masks, // out: occupancy mask per square
+                EnumArray<Square, std::uint8_t>& shifts, // out: right shift per square (64 - mask bit count)
+                EnumArray<Square, const Bitboard*>& attacks // out: where each square's entries begin inside table
+            )
+            {
+                std::size_t size = 0; // entries handed out so far; NOTE(review): never checked against SizeV
+                for (Square sq : values<Square>())
+                {
+                    const Bitboard edges =
+                        ((bb::rank1 | bb::rank8) & ~Bitboard::rank(sq.rank()))
+                        | ((bb::fileA | bb::fileH) & ~Bitboard::file(sq.file()));
+                    // edges: border ranks/files, except the rank and file that sq itself is on
+                    Bitboard* currentAttacks = table.data() + size;
+
+                    attacks[sq] = currentAttacks;
+                    masks[sq] = slidingAttacks<TypeV>(sq, Bitboard::none()) & ~edges;
+                    shifts[sq] = 64 - masks[sq].count();
+                    // Visit every subset of masks[sq], beginning with the empty one.
+                    Bitboard occupied = Bitboard::none();
+                    do
+                    {
+                        const std::size_t idx =
+                            (occupied & masks[sq]).bits()
+                            * magics[sq]
+                            >> shifts[sq];
+
+                        currentAttacks[idx] = slidingAttacks<TypeV>(sq, occupied);
+
+                        ++size;
+                        occupied = Bitboard::fromBits(occupied.bits() - masks[sq].bits()) & masks[sq]; // next subset; returns to empty after the last one
+                    } while (occupied.any());
+                }
+                // The result carries no information; it lets the call serve as a static initializer.
+                return true;
+            }
+
+            static bool g_isRookMagicsInitialized = // dummy flag: its initializer fills the rook tables during static initialization
+                initMagics<MagicsType::Rook>(g_rookMagics, g_allRookAttacks, g_rookMasks, g_rookShifts, g_rookAttacks);
+
+            static bool g_isBishopMagicsInitialized = // same mechanism for the bishop tables
+                initMagics<MagicsType::Bishop>(g_bishopMagics, g_allBishopAttacks, g_bishopMasks, g_bishopShifts, g_bishopAttacks);
+        }
+
+        [[nodiscard]] inline Bitboard between(Square s1, Square s2)
+        {
+            return detail::between[s1][s2]; // table lookup; entries are precomputed by detail::generateBetween
+        }
+
+        [[nodiscard]] inline Bitboard line(Square s1, Square s2)
+        {
+            return detail::line[s1][s2]; // table lookup; entries are precomputed by detail::generateLine
+        }
+
+        template <PieceType PieceTypeV>
+        [[nodiscard]] inline Bitboard pseudoAttacks(Square sq)
+        {
+            static_assert(PieceTypeV != PieceType::None && PieceTypeV != PieceType::Pawn);
+
+            assert(sq.isOk());
+            // Table lookup keyed by piece type and square only; this overload takes no occupancy.
+            return detail::pseudoAttacks()[PieceTypeV][sq];
+        }
+
+        [[nodiscard]] inline Bitboard pseudoAttacks(PieceType pt, Square sq)
+        {
+            assert(sq.isOk());
+            // Same lookup as the template overload, with the piece type chosen at run time (pt is not checked here).
+            return detail::pseudoAttacks()[pt][sq];
+        }
+
+        // Union of the two diagonal shifts of `pawns`, i.e. both capture directions at once.
+        [[nodiscard]] inline Bitboard pawnAttacks(Bitboard pawns, Color color)
+        {
+            // White uses +1 as the second shifted<> argument, every other color uses -1.
+            if (color != Color::White)
+            {
+                return pawns.shifted<1, -1>() | pawns.shifted<-1, -1>();
+            }
+
+            return pawns.shifted<1, 1>() | pawns.shifted<-1, 1>();
+        }
+
+        // Capture targets of `pawns` on the west side only (first shifted<> argument is -1).
+        [[nodiscard]] inline Bitboard westPawnAttacks(Bitboard pawns, Color color)
+        {
+            // White uses +1 as the second shifted<> argument, every other color uses -1.
+            if (color != Color::White)
+            {
+                return pawns.shifted<-1, -1>();
+            }
+
+            return pawns.shifted<-1, 1>();
+        }
+
+        // Capture targets of `pawns` on the east side only (first shifted<> argument is +1).
+        [[nodiscard]] inline Bitboard eastPawnAttacks(Bitboard pawns, Color color)
+        {
+            // White uses +1 as the second shifted<> argument, every other color uses -1.
+            if (color != Color::White)
+            {
+                return pawns.shifted<1, -1>();
+            }
+
+            return pawns.shifted<1, 1>();
+        }
+
+        [[nodiscard]] inline bool isAttackedBySlider(
+            Square sq,
+            Bitboard bishops,
+            Bitboard rooks,
+            Bitboard queens,
+            Bitboard occupied
+        )
+        {
+            // True when a bishop-type attack set from sq contains one of bishops/queens, or a rook-type
+            // attack set from sq contains one of rooks/queens. The rook lookup is skipped on a diagonal hit.
+            const Bitboard diagonalSliders = bishops | queens;
+            if ((bb::attacks<PieceType::Bishop>(sq, occupied) & diagonalSliders).any())
+            {
+                return true;
+            }
+
+            const Bitboard straightSliders = rooks | queens;
+            return (bb::attacks<PieceType::Rook>(sq, occupied) & straightSliders).any();
+        }
+    }
+
+    struct CastlingTraits // constant tables and helpers for castling, indexed as [color][castle type]
+    {
+        static constexpr EnumArray2<Color, CastleType, Square> rookDestination = { { {{ f1, d1 }}, {{ f8, d8 }} } };
+        static constexpr EnumArray2<Color, CastleType, Square> kingDestination = { { {{ g1, c1 }}, {{ g8, c8 }} } };
+        // Squares the castling rooks and the kings start on.
+        static constexpr EnumArray2<Color, CastleType, Square> rookStart = { { {{ h1, a1 }}, {{ h8, a8 }} } };
+
+        static constexpr EnumArray<Color, Square> kingStart = { { e1, e8 } };
+        // Squares lying between the king and the rook for each castling.
+        static constexpr EnumArray2<Color, CastleType, Bitboard> castlingPath = {
+            {
+                {{ Bitboard::square(f1) | g1, Bitboard::square(b1) | c1 | d1 }},
+                {{ Bitboard::square(f8) | g8, Bitboard::square(b8) | c8 | d8 }}
+            }
+        };
+        // The square the king crosses before reaching its destination.
+        static constexpr EnumArray2<Color, CastleType, Square> squarePassedByKing = {
+            {
+                {{ f1, d1 }},
+                {{ f8, d8 }}
+            }
+        };
+        // Castling right associated with each (color, castle type) pair.
+        static constexpr EnumArray2<Color, CastleType, CastlingRights> castlingRights = {
+            {
+                {{ CastlingRights::WhiteKingSide, CastlingRights::WhiteQueenSide }},
+                {{ CastlingRights::BlackKingSide, CastlingRights::BlackQueenSide }}
+            }
+        };
+
+        // Castle type of a castling move; such moves have `to` on the rook's square. Move has to be a legal castling move.
+        static constexpr CastleType moveCastlingType(const Move& move)
+        {
+            return (move.to.file() == fileH) ? CastleType::Short : CastleType::Long;
+        }
+
+        // Castling right used by a castling move, chosen by the rook square in `to`. Move must be a legal castling move.
+        static constexpr CastlingRights moveCastlingRight(Move move)
+        {
+            if (move.to == h1) return CastlingRights::WhiteKingSide;
+            if (move.to == a1) return CastlingRights::WhiteQueenSide;
+            if (move.to == h8) return CastlingRights::BlackKingSide; // rooks on the 8th rank belong to black
+            if (move.to == a8) return CastlingRights::BlackQueenSide;
+            return CastlingRights::None;
+        }
+    };
+
+    namespace parser_bits
+    {
+        [[nodiscard]] constexpr bool isFile(char c)
+        {
+            return 'a' <= c && c <= 'h'; // lowercase file letters only
+        }
+
+        [[nodiscard]] constexpr bool isRank(char c)
+        {
+            return '1' <= c && c <= '8'; // rank digits '1' through '8'
+        }
+
+        [[nodiscard]] constexpr Rank parseRank(char c)
+        {
+            assert(isRank(c));
+            // The offset from '1' is used as the ordinal, so '1' gives ordinal 0.
+            return fromOrdinal<Rank>(c - '1');
+        }
+
+        [[nodiscard]] constexpr File parseFile(char c)
+        {
+            assert(isFile(c));
+            // The offset from 'a' is used as the ordinal, so 'a' gives ordinal 0.
+            return fromOrdinal<File>(c - 'a');
+        }
+
+        [[nodiscard]] constexpr bool isSquare(const char* s)
+        {
+            return isFile(s[0]) && isRank(s[1]); // s[1] is only read when s[0] is a file letter; no length is checked here
+        }
+
+        [[nodiscard]] constexpr Square parseSquare(const char* s)
+        {
+            // No validation beyond the asserts inside parseFile/parseRank; s[0] and s[1] must be readable.
+            const File f = parseFile(s[0]);
+            return Square(f, parseRank(s[1]));
+        }
+
+        [[nodiscard]] constexpr std::optional<Square> tryParseSquare(std::string_view s)
+        {
+            // Exactly two characters, a file letter followed by a rank digit; anything else is rejected.
+            const bool wellFormed = s.size() == 2 && isSquare(s.data());
+            if (!wellFormed) return std::nullopt;
+            return parseSquare(s.data());
+        }
+
+        [[nodiscard]] constexpr std::optional<Square> tryParseEpSquare(std::string_view s)
+        {
+            if (s == std::string_view("-")) return Square::none(); // "-" is accepted and yields Square::none()
+            return tryParseSquare(s); // otherwise the text has to be a regular square
+        }
+
+        [[nodiscard]] constexpr std::optional<CastlingRights> tryParseCastlingRights(std::string_view s)
+        {
+            // "-" is the explicit spelling of "no castling rights".
+            if (s == std::string_view("-")) return CastlingRights::None;
+
+            CastlingRights parsed = CastlingRights::None;
+            for (const char c : s)
+            {
+                // Stays None for any character that is not one of K, Q, k, q.
+                CastlingRights flag = CastlingRights::None;
+                if (c == 'K')
+                {
+                    flag = CastlingRights::WhiteKingSide;
+                }
+                else if (c == 'Q')
+                {
+                    flag = CastlingRights::WhiteQueenSide;
+                }
+                else if (c == 'k')
+                {
+                    flag = CastlingRights::BlackKingSide;
+                }
+                else if (c == 'q')
+                {
+                    flag = CastlingRights::BlackQueenSide;
+                }
+
+                // None is always contained, so an unknown character is rejected here just like a repeated right.
+                if (contains(parsed, flag)) return std::nullopt;
+                parsed |= flag;
+            }
+
+            return parsed;
+        }
+
+        // Accumulates castling rights from the characters at s up to the next space (or the end of
+        // the string), leaving s on that terminator. Characters other than K, Q, k, q are ignored.
+        [[nodiscard]] constexpr CastlingRights readCastlingRights(const char*& s)
+        {
+            CastlingRights rights = CastlingRights::None;
+            // The NUL check keeps a field that is not followed by a space from being read out of bounds.
+            for (; *s != ' ' && *s != '\0'; ++s)
+            {
+                switch (*s)
+                {
+                case 'K':
+                    rights |= CastlingRights::WhiteKingSide;
+                    break;
+                case 'Q':
+                    rights |= CastlingRights::WhiteQueenSide;
+                    break;
+                case 'k':
+                    rights |= CastlingRights::BlackKingSide;
+                    break;
+                case 'q':
+                    rights |= CastlingRights::BlackQueenSide;
+                    break;
+                }
+            }
+
+            return rights;
+        }
+
+        FORCEINLINE inline void appendCastlingRightsToString(CastlingRights rights, std::string& str)
+        {
+            // Appends '-' when no right is set, otherwise the set rights in the fixed order K, Q, k, q.
+            if (rights == CastlingRights::None)
+            {
+                str += '-';
+                return;
+            }
+
+            if (contains(rights, CastlingRights::WhiteKingSide)) str += 'K';
+            if (contains(rights, CastlingRights::WhiteQueenSide)) str += 'Q';
+            if (contains(rights, CastlingRights::BlackKingSide)) str += 'k';
+            if (contains(rights, CastlingRights::BlackQueenSide)) str += 'q';
+        }
+
+        FORCEINLINE inline void appendSquareToString(Square sq, std::string& str)
+        {
+            str += static_cast<char>('a' + ordinal(sq.file())); // file letter first
+            str += static_cast<char>('1' + ordinal(sq.rank())); // then the rank digit
+        }
+
+        // Appends the square in file/rank notation, or '-' when sq is Square::none().
+        FORCEINLINE inline void appendEpSquareToString(Square sq, std::string& str)
+        {
+            if (sq != Square::none())
+            {
+                appendSquareToString(sq, str);
+                return;
+            }
+
+            str += '-';
+        }
+
+        FORCEINLINE inline void appendRankToString(Rank r, std::string& str)
+        {
+            str += static_cast<char>('1' + ordinal(r)); // ordinal 0 is written as '1'
+        }
+
+        FORCEINLINE inline void appendFileToString(File f, std::string& str)
+        {
+            str += static_cast<char>('a' + ordinal(f)); // ordinal 0 is written as 'a'
+        }
+
+        [[nodiscard]] FORCEINLINE inline bool isDigit(char c)
+        {
+            return '0' <= c && c <= '9'; // ASCII decimal digits only
+        }
+
+        [[nodiscard]] inline std::uint16_t parseUInt16(std::string_view sv) // unchecked parse of 1 to 5 decimal digits
+        {
+            assert(sv.size() > 0);
+            assert(sv.size() <= 5);
+            // NOTE: characters are not verified to be digits, and the sum is truncated to 16 bits.
+            std::uint16_t v = 0;
+            // Unrolled by length: enter at the case matching the digit count and fall through to the units digit.
+            std::size_t idx = 0;
+            switch (sv.size())
+            {
+            case 5:
+                v += (sv[idx++] - '0') * 10000; // deliberate fall-through
+            case 4:
+                v += (sv[idx++] - '0') * 1000; // deliberate fall-through
+            case 3:
+                v += (sv[idx++] - '0') * 100; // deliberate fall-through
+            case 2:
+                v += (sv[idx++] - '0') * 10; // deliberate fall-through
+            case 1:
+                v += sv[idx] - '0';
+                break;
+
+            default:
+                assert(false); // unreachable when the size asserts above hold
+            }
+
+            return v;
+        }
+
+        // Checked decimal parser for 16-bit unsigned values.
+        // Returns std::nullopt unless sv consists of 1 to 5 characters, every one of them
+        // a decimal digit, and the resulting value fits in std::uint16_t.
+        // Signs, whitespace and any other non-digit characters are rejected.
+        [[nodiscard]] inline std::optional<std::uint16_t> tryParseUInt16(std::string_view sv)
+        {
+            if (sv.size() == 0 || sv.size() > 5)
+            {
+                return std::nullopt;
+            }
+
+            // At most 5 digits, so the running value never exceeds 99999 and cannot overflow 32 bits.
+            std::uint32_t v = 0;
+            for (const char c : sv)
+            {
+                // A non-digit must fail the parse instead of being folded into the value.
+                if (c < '0' || c > '9')
+                {
+                    return std::nullopt;
+                }
+
+                v = v * 10 + static_cast<std::uint32_t>(c - '0');
+            }
+
+            // Five digits can still exceed 65535.
+            if (v > std::numeric_limits<std::uint16_t>::max())
+            {
+                return std::nullopt;
+            }
+
+            return static_cast<std::uint16_t>(v);
+        }
+    }
+
+
+    struct Board
+    {
+        constexpr Board() noexcept : // builds a board with no pieces on it
+            m_pieces{},
+            m_pieceBB{},
+            m_piecesByColorBB{},
+            m_pieceCount{}
+        {
+            m_pieces.fill(Piece::none());
+            m_pieceBB.fill(Bitboard::none());
+            m_pieceBB[Piece::none()] = Bitboard::all(); // overrides the fill above for the "no piece" entry
+            m_piecesByColorBB.fill(Bitboard::none());
+            m_pieceCount.fill(0);
+            m_pieceCount[Piece::none()] = 64; // overrides the fill above: "no piece" is counted 64 times
+        }
+
+        [[nodiscard]] inline bool isValid() const
+        {
+            // Requires exactly one king per side and no pawn of either color on rank 1 or rank 8.
+            const auto pawns = piecesBB(whitePawn) | piecesBB(blackPawn);
+            const auto promotionRanks = bb::rank(rank1) | bb::rank(rank8);
+            return piecesBB(whiteKing).count() == 1 && piecesBB(blackKing).count() == 1 && !(pawns & promotionRanks).any();
+        }
+
+        [[nodiscard]] inline std::string fen() const;
+
+        [[nodiscard]] inline bool trySet(std::string_view boardState) // validating parser for piece-placement text; false on any error
+        {
+            File f = fileA; // parsing starts on file a of rank 8; pieces are placed as they are read
+            Rank r = rank8;
+            bool lastWasSkip = false; // true while the previous character was a digit
+            for (auto c : boardState)
+            {
+                Piece piece = Piece::none();
+                switch (c)
+                {
+                case 'r':
+                    piece = Piece(PieceType::Rook, Color::Black);
+                    break;
+                case 'n':
+                    piece = Piece(PieceType::Knight, Color::Black);
+                    break;
+                case 'b':
+                    piece = Piece(PieceType::Bishop, Color::Black);
+                    break;
+                case 'q':
+                    piece = Piece(PieceType::Queen, Color::Black);
+                    break;
+                case 'k':
+                    piece = Piece(PieceType::King, Color::Black);
+                    break;
+                case 'p':
+                    piece = Piece(PieceType::Pawn, Color::Black);
+                    break;
+
+                case 'R':
+                    piece = Piece(PieceType::Rook, Color::White);
+                    break;
+                case 'N':
+                    piece = Piece(PieceType::Knight, Color::White);
+                    break;
+                case 'B':
+                    piece = Piece(PieceType::Bishop, Color::White);
+                    break;
+                case 'Q':
+                    piece = Piece(PieceType::Queen, Color::White);
+                    break;
+                case 'K':
+                    piece = Piece(PieceType::King, Color::White);
+                    break;
+                case 'P':
+                    piece = Piece(PieceType::Pawn, Color::White);
+                    break;
+                // A digit skips that many files.
+                case '1':
+                case '2':
+                case '3':
+                case '4':
+                case '5':
+                case '6':
+                case '7':
+                case '8':
+                {
+                    if (lastWasSkip) return false; // two digits in a row are rejected
+                    lastWasSkip = true;
+
+                    const int skip = c - '0';
+                    f += skip;
+                    if (f > fileH + 1) return false; // skipped past the end of the rank
+                    break;
+                }
+                // '/' ends a rank, which must be completely filled at that point.
+                case '/':
+                    lastWasSkip = false;
+                    if (f != fileH + 1) return false;
+                    f = fileA;
+                    --r;
+                    break;
+                // Any other character makes the input invalid.
+                default:
+                    return false;
+                }
+
+                if (piece != Piece::none())
+                {
+                    lastWasSkip = false;
+
+                    const Square sq(f, r);
+                    if (!sq.isOk()) return false;
+
+                    place(piece, sq);
+                    ++f;
+                }
+            }
+            // The text must stop exactly at the end of rank 1.
+            if (f != fileH + 1) return false;
+            if (r != rank1) return false;
+
+            return isValid();
+        }
+
+        // Unchecked piece-placement parser. Returns where parsing stopped: the first space, or the terminating NUL (presumably the side-to-move field follows that space).
+        [[nodiscard]] constexpr const char* set(const char* fen)
+        {
+            assert(fen != nullptr);
+
+            File f = fileA; // parsing starts on file a of rank 8
+            Rank r = rank8;
+            auto current = fen;
+            bool done = false; // set by the ' ' case to leave the loop without advancing current
+            while (*current != '\0')
+            {
+                Piece piece = Piece::none();
+                switch (*current)
+                {
+                case 'r':
+                    piece = Piece(PieceType::Rook, Color::Black);
+                    break;
+                case 'n':
+                    piece = Piece(PieceType::Knight, Color::Black);
+                    break;
+                case 'b':
+                    piece = Piece(PieceType::Bishop, Color::Black);
+                    break;
+                case 'q':
+                    piece = Piece(PieceType::Queen, Color::Black);
+                    break;
+                case 'k':
+                    piece = Piece(PieceType::King, Color::Black);
+                    break;
+                case 'p':
+                    piece = Piece(PieceType::Pawn, Color::Black);
+                    break;
+
+                case 'R':
+                    piece = Piece(PieceType::Rook, Color::White);
+                    break;
+                case 'N':
+                    piece = Piece(PieceType::Knight, Color::White);
+                    break;
+                case 'B':
+                    piece = Piece(PieceType::Bishop, Color::White);
+                    break;
+                case 'Q':
+                    piece = Piece(PieceType::Queen, Color::White);
+                    break;
+                case 'K':
+                    piece = Piece(PieceType::King, Color::White);
+                    break;
+                case 'P':
+                    piece = Piece(PieceType::Pawn, Color::White);
+                    break;
+                // A space terminates the piece-placement text.
+                case ' ':
+                    done = true;
+                    break;
+                // A digit skips that many files; nothing is range-checked in this variant.
+                case '1':
+                case '2':
+                case '3':
+                case '4':
+                case '5':
+                case '6':
+                case '7':
+                case '8':
+                {
+                    const int skip = (*current) - '0';
+                    f += skip;
+                    break;
+                }
+
+                case '/':
+                    f = fileA;
+                    --r;
+                    break;
+                // Unknown characters are silently ignored.
+                default:
+                    break;
+                }
+
+                if (done)
+                {
+                    break;
+                }
+
+                if (piece != Piece::none())
+                {
+                    place(piece, Square(f, r));
+                    ++f;
+                }
+
+                ++current;
+            }
+
+            return current;
+        }
+
+        static constexpr Board fromFen(const char* fen) // convenience factory: default-constructed board followed by set(fen)
+        {
+            Board board;
+            (void)board.set(fen); // set() is [[nodiscard]]; the returned parse position is not needed here
+            return board;
+        }
+
+        // Two boards are equal when every square from a1 to h8 holds the same piece.
+        [[nodiscard]] constexpr friend bool operator==(const Board& lhs, const Board& rhs) noexcept
+        {
+            // Only m_pieces is compared here; the loop condition ends the scan at the first
+            // square that differs.
+            bool samePieces = true;
+            for (Square sq = a1; samePieces && sq <= h8; ++sq)
+            {
+                if (lhs.m_pieces[sq] != rhs.m_pieces[sq]) samePieces = false;
+            }
+
+            // Debug-only cross-check against bbsEqual.
+            assert(bbsEqual(lhs, rhs) == samePieces);
+
+            return samePieces;
+        }
+
+        // Puts piece on sq, replacing whatever was there, and keeps bitboards and piece counts in sync.
+        constexpr void place(Piece piece, Square sq)
+        {
+            assert(sq.isOk());
+            const Piece oldPiece = m_pieces[sq];
+            m_pieceBB[oldPiece] ^= sq;
+            if (oldPiece != Piece::none()) m_piecesByColorBB[oldPiece.color()] ^= sq;
+
+            m_pieces[sq] = piece;
+            m_pieceBB[piece] |= sq;
+            // Same guard as for oldPiece: the empty piece must not be recorded in a per-color occupancy.
+            if (piece != Piece::none()) m_piecesByColorBB[piece.color()] |= sq;
+
+            --m_pieceCount[oldPiece];
+            ++m_pieceCount[piece];
+        }
+
+        // Applies move to the board and returns the captured piece (Piece::none() when the target square was empty).
+        // Doesn't check validity. Normal moves are handled inline; every other move type goes to doMoveColdPath.
+        inline constexpr Piece doMove(Move move)
+        {
+            if (move.type == MoveType::Normal)
+            {
+                const Piece capturedPiece = m_pieces[move.to];
+                const Piece piece = m_pieces[move.from];
+
+                const Bitboard frombb = Bitboard::square(move.from);
+                const Bitboard tobb = Bitboard::square(move.to);
+                const Bitboard xormove = frombb ^ tobb; // both squares of the move in one mask
+
+                m_pieces[move.to] = piece;
+                m_pieces[move.from] = Piece::none();
+                // One XOR flips both squares in the mover's piece bitboard and color bitboard.
+                m_pieceBB[piece] ^= xormove;
+
+                m_piecesByColorBB[piece.color()] ^= xormove;
+                // The "no piece" bitboard and the victim's bookkeeping depend on whether something was captured.
+                if (capturedPiece == Piece::none())
+                {
+                    m_pieceBB[Piece::none()] ^= xormove; // from and to swap their empty/occupied status
+                }
+                else
+                {
+                    m_pieceBB[capturedPiece] ^= tobb;
+                    m_pieceBB[Piece::none()] ^= frombb; // only from changes; to stays occupied
+
+                    m_piecesByColorBB[capturedPiece.color()] ^= tobb;
+
+                    --m_pieceCount[capturedPiece];
+                    ++m_pieceCount[Piece::none()];
+                }
+
+                return capturedPiece;
+            }
+            // Promotion, en passant and castling are handled out of line.
+            return doMoveColdPath(move);
+        }
+
+        inline constexpr Piece doMoveColdPath(Move move) // applies Promotion, EnPassant and Castle moves; returns the captured piece
+        {
+            if (move.type == MoveType::Promotion)
+            {
+                // We split it even though it's similar just because
+                // the normal case is much more common.
+                const Piece capturedPiece = m_pieces[move.to];
+                const Piece fromPiece = m_pieces[move.from];
+                const Piece toPiece = move.promotedPiece;
+
+                m_pieces[move.to] = toPiece;
+                m_pieces[move.from] = Piece::none();
+
+                m_pieceBB[fromPiece] ^= move.from;
+                m_pieceBB[toPiece] ^= move.to;
+
+                m_pieceBB[capturedPiece] ^= move.to; // capturedPiece may be Piece::none(), whose bitboard is flipped the same way
+                m_pieceBB[Piece::none()] ^= move.from;
+                // Both squares are flipped in the mover's color bitboard.
+                m_piecesByColorBB[fromPiece.color()] ^= move.to;
+                m_piecesByColorBB[fromPiece.color()] ^= move.from;
+                if (capturedPiece != Piece::none())
+                {
+                    m_piecesByColorBB[capturedPiece.color()] ^= move.to;
+                    --m_pieceCount[capturedPiece];
+                    ++m_pieceCount[Piece::none()];
+                }
+                // In the counts, the moved piece is replaced by the promoted piece.
+                --m_pieceCount[fromPiece];
+                ++m_pieceCount[toPiece];
+
+                return capturedPiece;
+            }
+            else if (move.type == MoveType::EnPassant)
+            {
+                const Piece movedPiece = m_pieces[move.from];
+                const Piece capturedPiece(PieceType::Pawn, !movedPiece.color());
+                const Square capturedPieceSq(move.to.file(), move.from.rank()); // destination file, origin rank
+
+                // on ep move there are 3 squares involved
+                m_pieces[move.to] = movedPiece;
+                m_pieces[move.from] = Piece::none();
+                m_pieces[capturedPieceSq] = Piece::none();
+
+                m_pieceBB[movedPiece] ^= move.from;
+                m_pieceBB[movedPiece] ^= move.to;
+
+                m_pieceBB[Piece::none()] ^= move.from;
+                m_pieceBB[Piece::none()] ^= move.to;
+
+                m_pieceBB[capturedPiece] ^= capturedPieceSq;
+                m_pieceBB[Piece::none()] ^= capturedPieceSq;
+
+                m_piecesByColorBB[movedPiece.color()] ^= move.to;
+                m_piecesByColorBB[movedPiece.color()] ^= move.from;
+                m_piecesByColorBB[capturedPiece.color()] ^= capturedPieceSq;
+                // One pawn fewer, one "no piece" entry more.
+                --m_pieceCount[capturedPiece];
+                ++m_pieceCount[Piece::none()];
+
+                return capturedPiece;
+            }
+            else // if (move.type == MoveType::Castle)
+            {
+                const Square rookFromSq = move.to; // a castling move goes from the king's square to the rook's square
+                const Square kingFromSq = move.from;
+
+                const Piece rook = m_pieces[rookFromSq];
+                const Piece king = m_pieces[kingFromSq];
+                const Color color = king.color();
+
+                const CastleType castleType = CastlingTraits::moveCastlingType(move);
+                const Square rookToSq = CastlingTraits::rookDestination[color][castleType];
+                const Square kingToSq = CastlingTraits::kingDestination[color][castleType];
+
+                // 4 squares are involved
+                m_pieces[rookFromSq] = Piece::none();
+                m_pieces[kingFromSq] = Piece::none();
+                m_pieces[rookToSq] = rook;
+                m_pieces[kingToSq] = king;
+
+                m_pieceBB[rook] ^= rookFromSq;
+                m_pieceBB[rook] ^= rookToSq;
+
+                m_pieceBB[king] ^= kingFromSq;
+                m_pieceBB[king] ^= kingToSq;
+
+                m_pieceBB[Piece::none()] ^= rookFromSq;
+                m_pieceBB[Piece::none()] ^= rookToSq;
+
+                m_pieceBB[Piece::none()] ^= kingFromSq;
+                m_pieceBB[Piece::none()] ^= kingToSq;
+                // All four flips go to the king's color bitboard.
+                m_piecesByColorBB[color] ^= rookFromSq;
+                m_piecesByColorBB[color] ^= rookToSq;
+                m_piecesByColorBB[color] ^= kingFromSq;
+                m_piecesByColorBB[color] ^= kingToSq;
+                // Castling never captures.
+                return Piece::none();
+            }
+        }
+
+        // Reverts `move` on the board, updating the square array (m_pieces),
+        // the per-piece and per-color bitboards and the piece counts.
+        // `capturedPiece` is the piece put back on `move.to`; it is only read
+        // for Normal and Promotion moves (Piece::none() when nothing was captured).
+        // For en passant the captured pawn is reconstructed from the move itself,
+        // and castling never captures.
+        constexpr void undoMove(Move move, Piece capturedPiece)
+        {
+            if (move.type == MoveType::Normal || move.type == MoveType::Promotion)
+            {
+                // The piece currently on the destination; for a promotion this is
+                // the promoted piece, so the piece that returns to `from` is a pawn.
+                const Piece toPiece = m_pieces[move.to];
+                const Piece fromPiece = move.promotedPiece == Piece::none() ? toPiece : Piece(PieceType::Pawn, toPiece.color());
+
+                m_pieces[move.from] = fromPiece;
+                m_pieces[move.to] = capturedPiece;
+
+                // Bitboards are toggled with XOR: set the origin for the moved piece,
+                // clear the destination for the piece that stood there.
+                m_pieceBB[fromPiece] ^= move.from;
+                m_pieceBB[toPiece] ^= move.to;
+
+                // The destination now belongs to `capturedPiece` (possibly the
+                // "none" piece) and the origin is no longer empty.
+                m_pieceBB[capturedPiece] ^= move.to;
+                m_pieceBB[Piece::none()] ^= move.from;
+
+                m_piecesByColorBB[fromPiece.color()] ^= move.to;
+                m_piecesByColorBB[fromPiece.color()] ^= move.from;
+                if (capturedPiece != Piece::none())
+                {
+                    // A real capture is being reverted: restore the victim's
+                    // color bitboard and the counts.
+                    m_piecesByColorBB[capturedPiece.color()] ^= move.to;
+                    ++m_pieceCount[capturedPiece];
+                    --m_pieceCount[Piece::none()];
+                }
+
+                if (move.type == MoveType::Promotion)
+                {
+                    // The promoted piece disappears and the pawn comes back.
+                    --m_pieceCount[toPiece];
+                    ++m_pieceCount[fromPiece];
+                }
+            }
+            else if (move.type == MoveType::EnPassant)
+            {
+                // The captured pawn is not on `move.to`: it stands on the
+                // destination file and the origin rank of the capturing pawn.
+                const Piece movedPiece = m_pieces[move.to];
+                const Piece capturedPiece_(PieceType::Pawn, !movedPiece.color());
+                const Square capturedPieceSq(move.to.file(), move.from.rank());
+
+                m_pieces[move.to] = Piece::none();
+                m_pieces[move.from] = movedPiece;
+                m_pieces[capturedPieceSq] = capturedPiece_;
+
+                m_pieceBB[movedPiece] ^= move.from;
+                m_pieceBB[movedPiece] ^= move.to;
+
+                m_pieceBB[Piece::none()] ^= move.from;
+                m_pieceBB[Piece::none()] ^= move.to;
+
+                // on ep move there are 3 squares involved
+                m_pieceBB[capturedPiece_] ^= capturedPieceSq;
+                m_pieceBB[Piece::none()] ^= capturedPieceSq;
+
+                m_piecesByColorBB[movedPiece.color()] ^= move.to;
+                m_piecesByColorBB[movedPiece.color()] ^= move.from;
+                m_piecesByColorBB[capturedPiece_.color()] ^= capturedPieceSq;
+
+                ++m_pieceCount[capturedPiece_];
+                --m_pieceCount[Piece::none()];
+            }
+            else // if (move.type == MoveType::Castle)
+            {
+                // Castling is encoded as "king takes own rook":
+                // `move.from` is the king's origin, `move.to` the rook's origin.
+                const Square rookFromSq = move.to;
+                const Square kingFromSq = move.from;
+
+                // The side is derived from the rook's origin rank.
+                const Color color = move.to.rank() == rank1 ? Color::White : Color::Black;
+
+                const CastleType castleType = CastlingTraits::moveCastlingType(move);
+                const Square rookToSq = CastlingTraits::rookDestination[color][castleType];
+                const Square kingToSq = CastlingTraits::kingDestination[color][castleType];
+
+                // Both pieces currently stand on their castled squares.
+                const Piece rook = m_pieces[rookToSq];
+                const Piece king = m_pieces[kingToSq];
+
+                // 4 squares are involved
+                m_pieces[rookFromSq] = rook;
+                m_pieces[kingFromSq] = king;
+                m_pieces[rookToSq] = Piece::none();
+                m_pieces[kingToSq] = Piece::none();
+
+                m_pieceBB[rook] ^= rookFromSq;
+                m_pieceBB[rook] ^= rookToSq;
+
+                m_pieceBB[king] ^= kingFromSq;
+                m_pieceBB[king] ^= kingToSq;
+
+                m_pieceBB[Piece::none()] ^= rookFromSq;
+                m_pieceBB[Piece::none()] ^= rookToSq;
+
+                m_pieceBB[Piece::none()] ^= kingFromSq;
+                m_pieceBB[Piece::none()] ^= kingToSq;
+
+                // Piece counts do not change when castling is reverted.
+                m_piecesByColorBB[color] ^= rookFromSq;
+                m_piecesByColorBB[color] ^= rookToSq;
+                m_piecesByColorBB[color] ^= kingFromSq;
+                m_piecesByColorBB[color] ^= kingToSq;
+            }
+        }
+
+        // Returns whether a given square is attacked by any piece
+        // of `attackerColor` side.
+        [[nodiscard]] inline bool isSquareAttacked(Square sq, Color attackerColor) const;
+
+        // Returns whether a given square is attacked by any piece
+        // of `attackerColor` side after `move` is made.
+        // Move must be pseudo legal.
+        [[nodiscard]] inline bool isSquareAttackedAfterMove(Move move, Square sq, Color attackerColor) const;
+
+        // Move must be pseudo legal.
+        // Must not be a king move.
+        [[nodiscard]] inline bool createsDiscoveredAttackOnOwnKing(Move move) const;
+
+        // Returns whether a piece on a given square is attacked
+        // by any enemy piece. False if square is empty.
+        [[nodiscard]] inline bool isPieceAttacked(Square sq) const;
+
+        // Returns whether a piece on a given square is attacked
+        // by any enemy piece after `move` is made. False if square is empty.
+        // Move must be pseudo legal.
+        [[nodiscard]] inline bool isPieceAttackedAfterMove(Move move, Square sq) const;
+
+        // Returns whether the king of the moving side is attacked
+        // by any enemy piece after a move is made.
+        // Move must be pseudo legal.
+        [[nodiscard]] inline bool isOwnKingAttackedAfterMove(Move move) const;
+
+        // Return a bitboard with all (pseudo legal) attacks by the piece on
+        // the given square. Empty if no piece on the square.
+        [[nodiscard]] inline Bitboard attacks(Square sq) const;
+
+        // Returns a bitboard with all squares that have pieces
+        // that attack a given square (pseudo legally)
+        [[nodiscard]] inline Bitboard attackers(Square sq, Color attackerColor) const;
+
+        // Returns the piece stored for square `sq` (asserts that `sq` is valid).
+        [[nodiscard]] constexpr Piece pieceAt(Square sq) const
+        {
+            assert(sq.isOk());
+            const Piece occupant = m_pieces[sq];
+            return occupant;
+        }
+
+        // Returns the bitboard of all pieces of color `c`.
+        [[nodiscard]] constexpr Bitboard piecesBB(Color c) const
+        {
+            const Bitboard ofColor = m_piecesByColorBB[c];
+            return ofColor;
+        }
+
+        // Returns the first square of the bitboard holding kings of color `c`.
+        [[nodiscard]] inline Square kingSquare(Color c) const
+        {
+            const Piece king(PieceType::King, c);
+            const Bitboard kings = m_pieceBB[king];
+            return kings.first();
+        }
+
+        // Returns the bitboard of squares occupied by the exact piece `pc`
+        // (type and color).
+        [[nodiscard]] constexpr Bitboard piecesBB(Piece pc) const
+        {
+            const Bitboard ofPiece = m_pieceBB[pc];
+            return ofPiece;
+        }
+
+        // Returns the bitboard of all occupied squares, i.e. the union of the
+        // white and the black piece bitboards.
+        [[nodiscard]] constexpr Bitboard piecesBB() const
+        {
+            // don't collect from null piece
+            return piecesBB(Color::White) | piecesBB(Color::Black);
+        }
+
+        // Returns the stored count for piece `pt`.
+        [[nodiscard]] constexpr std::uint8_t pieceCount(Piece pt) const
+        {
+            const std::uint8_t count = m_pieceCount[pt];
+            return count;
+        }
+
+        // Returns true when the piece on `from` is a pawn and `to` lies on the
+        // first or the eighth rank. Only the origin piece type and the
+        // destination rank are inspected.
+        [[nodiscard]] constexpr bool isPromotion(Square from, Square to) const
+        {
+            assert(from.isOk() && to.isOk());
+
+            if (m_pieces[from].type() != PieceType::Pawn)
+            {
+                return false;
+            }
+
+            return to.rank() == rank1 || to.rank() == rank8;
+        }
+
+        const Piece* piecesRaw() const;
+
+    private:
+        EnumArray<Square, Piece> m_pieces;
+        EnumArray<Piece, Bitboard> m_pieceBB;
+        EnumArray<Color, Bitboard> m_piecesByColorBB;
+        EnumArray<Piece, uint8_t> m_pieceCount;
+
+        // NOTE: currently we don't track it because it's not
+        // required to perform ep if we don't need to check validity
+        // Square m_epSquare = Square::none();
+
+        // Compares two boards piece by piece using only the per-piece
+        // bitboards; stops at the first piece whose bitboards differ.
+        [[nodiscard]] static constexpr bool bbsEqual(const Board& lhs, const Board& rhs) noexcept
+        {
+            for (Piece piece : values<Piece>())
+            {
+                const bool differs = lhs.m_pieceBB[piece] != rhs.m_pieceBB[piece];
+                if (differs)
+                {
+                    return false;
+                }
+            }
+
+            return true;
+        }
+    };
+
+    struct Position;
+
+    struct CompressedPosition;
+
+    // A 128-bit position hash stored as two 64-bit halves.
+    struct PositionHash128
+    {
+        std::uint64_t high; // upper 64 bits
+        std::uint64_t low;  // lower 64 bits
+    };
+
+    struct Position;
+
+    // Helper that decides whether a pseudo-legal move is legal for one position.
+    // Both member functions are defined elsewhere in the file.
+    // NOTE(review): the private members look like per-position data computed
+    // once by the constructor so repeated queries are cheap — confirm against
+    // the definitions.
+    struct MoveLegalityChecker
+    {
+        // Intentionally not `explicit`: Position::moveLegalityChecker() builds
+        // its result with `return { *this };`.
+        MoveLegalityChecker(const Position& position);
+
+        // `move` is expected to be pseudo-legal for the position.
+        [[nodiscard]] bool isPseudoLegalMoveLegal(const Move& move) const;
+
+    private:
+        // NOTE(review): presumably the address of the position passed to the
+        // constructor, in which case that position must outlive this object — confirm.
+        const Position* m_position;
+        Bitboard m_checkers;
+        Bitboard m_ourBlockersForKing;
+        Bitboard m_potentialCheckRemovals;
+        Square m_ksq;
+    };
+
+    // A Board extended with the remaining game state: side to move,
+    // en passant square, castling rights, rule-50 counter and ply.
+    struct Position : public Board
+    {
+        using BaseType = Board;
+
+        // Default state: default-constructed board, white to move, no ep square,
+        // all castling rights, both counters at zero.
+        constexpr Position() noexcept :
+            Board(),
+            m_sideToMove(Color::White),
+            m_epSquare(Square::none()),
+            m_castlingRights(CastlingRights::All),
+            m_rule50Counter(0),
+            m_ply(0)
+        {
+        }
+
+        // Builds a position from an existing board and explicit state.
+        // The rule-50 counter and the ply start at zero; `epSquare` is stored as given.
+        constexpr Position(const Board& board, Color sideToMove, Square epSquare, CastlingRights castlingRights) :
+            Board(board),
+            m_sideToMove(sideToMove),
+            m_epSquare(epSquare),
+            m_castlingRights(castlingRights),
+            m_rule50Counter(0),
+            m_ply(0)
+        {
+        }
+
+        // Sets the position from a FEN string (defined elsewhere).
+        inline void set(std::string_view fen);
+
+        // Returns false if the fen was not valid
+        // If the returned value was false the position
+        // is in unspecified state.
+        [[nodiscard]] inline bool trySet(std::string_view fen);
+
+        [[nodiscard]] static inline Position fromFen(std::string_view fen);
+
+        [[nodiscard]] static inline std::optional<Position> tryFromFen(std::string_view fen);
+
+        [[nodiscard]] static inline Position startPosition();
+
+        [[nodiscard]] inline std::string fen() const;
+
+        // Creates a legality checker for this position.
+        [[nodiscard]] MoveLegalityChecker moveLegalityChecker() const
+        {
+            return { *this };
+        }
+
+        // Stores `sq` as the ep square without any validation.
+        constexpr void setEpSquareUnchecked(Square sq)
+        {
+            m_epSquare = sq;
+        }
+
+        // Stores `sq` as the ep square and then lets
+        // nullifyEpSquareIfNotPossible() (defined elsewhere) revise it.
+        void setEpSquare(Square sq)
+        {
+            m_epSquare = sq;
+            nullifyEpSquareIfNotPossible();
+        }
+
+        constexpr void setSideToMove(Color color)
+        {
+            m_sideToMove = color;
+        }
+
+        // Adds `rights` to the current castling rights (bitwise or).
+        constexpr void addCastlingRights(CastlingRights rights)
+        {
+            m_castlingRights |= rights;
+        }
+
+        // Replaces the current castling rights with `rights`.
+        constexpr void setCastlingRights(CastlingRights rights)
+        {
+            m_castlingRights = rights;
+        }
+
+        constexpr void setRule50Counter(std::uint8_t v)
+        {
+            m_rule50Counter = v;
+        }
+
+        constexpr void setPly(std::uint16_t ply)
+        {
+            m_ply = ply;
+        }
+
+        // Makes `move` on this position (defined elsewhere). The returned
+        // ReverseMove is what undoMove() takes to revert it.
+        inline ReverseMove doMove(const Move& move);
+
+        // Reverts a move: restores the board through Board::undoMove, the ep
+        // square and the castling rights from `reverseMove`, flips the side
+        // to move and decrements the ply.
+        constexpr void undoMove(const ReverseMove& reverseMove)
+        {
+            const Move& move = reverseMove.move;
+            BaseType::undoMove(move, reverseMove.capturedPiece);
+
+            m_epSquare = reverseMove.oldEpSquare;
+            m_castlingRights = reverseMove.oldCastlingRights;
+
+            m_sideToMove = !m_sideToMove;
+
+            --m_ply;
+            // The counter is only stepped back by one and never below zero.
+            // NOTE(review): if doMove resets the counter on some moves, the value
+            // from before such a move is not recovered here — confirm this is acceptable.
+            if (m_rule50Counter > 0)
+            {
+                m_rule50Counter -= 1;
+            }
+        }
+
+        [[nodiscard]] constexpr Color sideToMove() const
+        {
+            return m_sideToMove;
+        }
+
+        [[nodiscard]] inline std::uint8_t rule50Counter() const
+        {
+            return m_rule50Counter;
+        }
+
+        [[nodiscard]] inline std::uint16_t ply() const
+        {
+            return m_ply;
+        }
+
+        // Full move number derived from the ply as (ply + 1) / 2.
+        [[nodiscard]] inline std::uint16_t fullMove() const
+        {
+            return (m_ply + 1) / 2;
+        }
+
+        // Sets the ply from a full move number `hm`. The result depends on the
+        // current side to move, so the side to move must be set first.
+        inline void setFullMove(std::uint16_t hm)
+        {
+            m_ply = 2 * hm - 1 + (m_sideToMove == Color::Black);
+        }
+
+        [[nodiscard]] inline bool isCheck() const;
+
+        [[nodiscard]] inline Bitboard checkers() const;
+
+        [[nodiscard]] inline bool isCheckAfterMove(Move move) const;
+
+        [[nodiscard]] inline bool isMoveLegal(Move move) const;
+
+        [[nodiscard]] inline bool isPseudoLegalMoveLegal(Move move) const;
+
+        [[nodiscard]] inline bool isMovePseudoLegal(Move move) const;
+
+        // Returns all pieces that block a slider
+        // from attacking our king. When two or more
+        // pieces block a single slider then none
+        // of these pieces are included.
+        [[nodiscard]] inline Bitboard blockersForKing(Color color) const;
+
+        [[nodiscard]] constexpr Square epSquare() const
+        {
+            return m_epSquare;
+        }
+
+        [[nodiscard]] constexpr CastlingRights castlingRights() const
+        {
+            return m_castlingRights;
+        }
+
+        // Two positions are equal when side to move, ep square, castling rights
+        // and the boards match. The rule-50 counter and the ply are not compared.
+        [[nodiscard]] constexpr bool friend operator==(const Position& lhs, const Position& rhs) noexcept
+        {
+            return
+                lhs.m_sideToMove == rhs.m_sideToMove
+                && lhs.m_epSquare == rhs.m_epSquare
+                && lhs.m_castlingRights == rhs.m_castlingRights
+                && static_cast<const Board&>(lhs) == static_cast<const Board&>(rhs);
+        }
+
+        [[nodiscard]] constexpr bool friend operator!=(const Position& lhs, const Position& rhs) noexcept
+        {
+            return !(lhs == rhs);
+        }
+
+        // beforeMove and afterMove are meant to be used only for testing;
+        // that is why afterMove contains an assert.
+
+        // Returns a copy of this position with `reverseMove` undone.
+        [[nodiscard]] constexpr Position beforeMove(const ReverseMove& reverseMove) const
+        {
+            Position cpy(*this);
+            cpy.undoMove(reverseMove);
+            return cpy;
+        }
+
+        [[nodiscard]] inline Position afterMove(Move move) const;
+
+        // True when an ep square is currently stored.
+        [[nodiscard]] constexpr bool isEpPossible() const
+        {
+            return m_epSquare != Square::none();
+        }
+
+        [[nodiscard]] inline CompressedPosition compress() const;
+
+    protected:
+        Color m_sideToMove;
+        Square m_epSquare;
+        CastlingRights m_castlingRights;
+        std::uint8_t m_rule50Counter;
+        std::uint16_t m_ply;
+
+        // Guards the size of the first four state fields: together they must
+        // take exactly 4 bytes.
+        static_assert(sizeof(Color) + sizeof(Square) + sizeof(CastlingRights) + sizeof(std::uint8_t) == 4);
+
+        [[nodiscard]] inline bool isEpPossible(Square epSquare, Color sideToMove) const;
+
+        [[nodiscard]] inline bool isEpPossibleColdPath(Square epSquare, Bitboard pawnsAttackingEpSquare, Color sideToMove) const;
+
+        inline void nullifyEpSquareIfNotPossible();
+    };
+
+    // Fixed-size (8 + 16 bytes) encoding of a position: an occupancy bitboard
+    // followed by one nibble per occupied square.
+    struct CompressedPosition
+    {
+        friend struct Position;
+
+        // Occupied bitboard has bits set for
+        // each square with a piece on it.
+        // Each packedState byte holds 2 values (nibbles).
+        // First one at low bits, second one at high bits.
+        // Values correspond to consecutive squares
+        // in bitboard iteration order.
+        // Nibble values:
+        // these are the same as for Piece
+        // knights, bishops, queens can just be copied
+        //  0 : white pawn
+        //  1 : black pawn
+        //  2 : white knight
+        //  3 : black knight
+        //  4 : white bishop
+        //  5 : black bishop
+        //  6 : white rook
+        //  7 : black rook
+        //  8 : white queen
+        //  9 : black queen
+        // 10 : white king
+        // 11 : black king
+        //
+        // these are special
+        // 12 : pawn with ep square behind (white or black, depending on rank)
+        // 13 : white rook with corresponding castling rights
+        // 14 : black rook with corresponding castling rights
+        // 15 : black king and black is side to move
+        //
+        // Let N be the number of bits set in occupied bitboard.
+        // Only N nibbles are present. (N+1)/2 bytes are initialized.
+
+        // Reads a compressed position from 24 bytes at `data`: the occupancy
+        // bitboard as a big-endian 64-bit integer followed by the 16 packed
+        // state bytes.
+        static CompressedPosition readFromBigEndian(const unsigned char* data)
+        {
+            CompressedPosition pos{};
+            pos.m_occupied = Bitboard::fromBits(
+                (std::uint64_t)data[0] << 56
+                | (std::uint64_t)data[1] << 48
+                | (std::uint64_t)data[2] << 40
+                | (std::uint64_t)data[3] << 32
+                | (std::uint64_t)data[4] << 24
+                | (std::uint64_t)data[5] << 16
+                | (std::uint64_t)data[6] << 8
+                | (std::uint64_t)data[7]
+                );
+            std::memcpy(pos.m_packedState, data + 8, sizeof(pos.m_packedState));
+            return pos;
+        }
+
+        // Creates an empty compressed position with all bytes zeroed.
+        constexpr CompressedPosition() :
+            m_occupied{},
+            m_packedState{}
+        {
+        }
+
+        // Orders by occupancy bits first, then by the packed state bytes.
+        // The packed state is binary data: a zero byte is a valid value (two
+        // white pawns) and the buffer is not null-terminated, so the whole
+        // array is compared with memcmp rather than as a C string.
+        [[nodiscard]] friend bool operator<(const CompressedPosition& lhs, const CompressedPosition& rhs)
+        {
+            if (lhs.m_occupied.bits() < rhs.m_occupied.bits()) return true;
+            if (lhs.m_occupied.bits() > rhs.m_occupied.bits()) return false;
+
+            return std::memcmp(lhs.m_packedState, rhs.m_packedState, sizeof(lhs.m_packedState)) < 0;
+        }
+
+        // Equal when the occupancy and all 16 packed state bytes match.
+        // Compared with memcmp for the same reason as in operator<.
+        [[nodiscard]] friend bool operator==(const CompressedPosition& lhs, const CompressedPosition& rhs)
+        {
+            return lhs.m_occupied == rhs.m_occupied
+                && std::memcmp(lhs.m_packedState, rhs.m_packedState, sizeof(lhs.m_packedState)) == 0;
+        }
+
+        [[nodiscard]] inline Position decompress() const;
+
+        // Returns the occupancy bitboard.
+        [[nodiscard]] constexpr Bitboard pieceBB() const
+        {
+            return m_occupied;
+        }
+
+        // Writes 24 bytes to `data` in the layout readFromBigEndian expects:
+        // the occupancy as a big-endian 64-bit integer, then the packed state.
+        // Does not modify the object.
+        void writeToBigEndian(unsigned char* data) const
+        {
+            const auto occupied = m_occupied.bits();
+            *data++ = occupied >> 56;
+            *data++ = (occupied >> 48) & 0xFF;
+            *data++ = (occupied >> 40) & 0xFF;
+            *data++ = (occupied >> 32) & 0xFF;
+            *data++ = (occupied >> 24) & 0xFF;
+            *data++ = (occupied >> 16) & 0xFF;
+            *data++ = (occupied >> 8) & 0xFF;
+            *data++ = occupied & 0xFF;
+            std::memcpy(data, m_packedState, sizeof(m_packedState));
+        }
+
+    private:
+        Bitboard m_occupied;
+        std::uint8_t m_packedState[16];
+    };
+
+    namespace movegen
+    {
+        // For a pseudo-legal move the following are true:
+        //  - the moving piece has the pos.sideToMove() color
+        //  - the destination square is either empty or has a piece of the opposite color
+        //  - if it is a pawn move it is valid (but may be illegal due to discovered checks)
+        //  - if it is not a pawn move then the destination square is contained in attacks()
+        //  - if it is a castling it is legal
+        //  - a move other than castling may create a discovered attack on the king
+        //  - a king may walk into a check
+
+        // Calls `f` for every pseudo-legal move of the pawn standing on `from`.
+        // The pawn is assumed to belong to the side to move.
+        // Order of emission: captures (as promotions on the second-to-last rank),
+        // then pushes; a double push is reported before the single push.
+        template <typename FuncT>
+        inline void forEachPseudoLegalPawnMove(const Position& pos, Square from, FuncT&& f)
+        {
+            const Color us = pos.sideToMove();
+            const bool isWhite = us == Color::White;
+            const Square epSq = pos.epSquare();
+            const Bitboard friends = pos.piecesBB(us);
+            const Bitboard enemies = pos.piecesBB(!us);
+            const Bitboard occupied = friends | enemies;
+
+            // The ep square is empty but still counts as a capture target.
+            Bitboard captureTargets = enemies;
+            if (epSq != Square::none())
+            {
+                captureTargets |= epSq;
+            }
+
+            const Bitboard captures = bb::pawnAttacks(Bitboard::square(from), us) & captureTargets;
+
+            const Rank promotionStartRank = isWhite ? rank7 : rank2;
+            const auto step = isWhite ? FlatSquareOffset(0, 1) : FlatSquareOffset(0, -1);
+            const Square oneAhead = from + step;
+
+            // Reports the four promotion choices for a move from `from` to `to`.
+            const auto emitPromotions = [&](Square to) {
+                for (PieceType pt : { PieceType::Knight, PieceType::Bishop, PieceType::Rook, PieceType::Queen })
+                {
+                    Move move{ from, to, MoveType::Promotion, Piece(pt, us) };
+                    f(move);
+                }
+            };
+
+            if (from.rank() == promotionStartRank)
+            {
+                // Every move from this rank is a promotion.
+                for (Square to : captures)
+                {
+                    emitPromotions(to);
+                }
+
+                if (!occupied.isSet(oneAhead))
+                {
+                    emitPromotions(oneAhead);
+                }
+
+                return;
+            }
+
+            // Ordinary captures, including en passant.
+            for (Square to : captures)
+            {
+                Move move{ from, to, (to == epSq) ? MoveType::EnPassant : MoveType::Normal };
+                f(move);
+            }
+
+            // No push is possible when the square ahead is occupied.
+            if (occupied.isSet(oneAhead))
+            {
+                return;
+            }
+
+            const Rank startRank = isWhite ? rank2 : rank7;
+            if (from.rank() == startRank)
+            {
+                const Square twoAhead = oneAhead + step;
+                if (!occupied.isSet(twoAhead))
+                {
+                    Move move{ from, twoAhead };
+                    f(move);
+                }
+            }
+
+            Move move{ from, oneAhead };
+            f(move);
+        }
+
+        // Calls `f` for every pseudo-legal pawn move of side `SideToMoveV`,
+        // working on whole bitboards instead of one pawn at a time.
+        // Order of emission: double pushes, single pushes, west captures,
+        // east captures, en passant captures, then promotions pawn by pawn.
+        template <Color SideToMoveV, typename FuncT>
+        inline void forEachPseudoLegalPawnMove(const Position& pos, FuncT&& f)
+        {
+            constexpr bool isWhite = SideToMoveV == Color::White;
+
+            const Square epSq = pos.epSquare();
+            const Bitboard friends = pos.piecesBB(SideToMoveV);
+            const Bitboard enemies = pos.piecesBB(!SideToMoveV);
+            const Bitboard occupied = friends | enemies;
+            const Bitboard ourPawns = pos.piecesBB(Piece(PieceType::Pawn, SideToMoveV));
+
+            const Bitboard promotionStartRank = isWhite ? bb::rank7 : bb::rank2;
+            const Bitboard pawnStartRank = isWhite ? bb::rank2 : bb::rank7;
+
+            const auto oneForward = isWhite ? FlatSquareOffset(0, 1) : FlatSquareOffset(0, -1);
+            const auto twoForward = isWhite ? FlatSquareOffset(0, 2) : FlatSquareOffset(0, -2);
+
+            // Pushes. Shifting the occupancy towards our side marks the pawns
+            // whose square one (or two) steps ahead is taken.
+            const int towardsUs = isWhite ? -1 : 1;
+            const Bitboard blockedOneAhead = occupied.shiftedVertically(towardsUs);
+            const Bitboard blockedTwoAhead = occupied.shiftedVertically(towardsUs * 2);
+
+            const Bitboard doublePushers = ourPawns & pawnStartRank & ~(blockedOneAhead | blockedTwoAhead);
+            const Bitboard singlePushers = ourPawns & ~promotionStartRank & ~blockedOneAhead;
+
+            for (Square from : doublePushers)
+            {
+                const Square to = from + twoForward;
+                f(Move::normal(from, to));
+            }
+
+            for (Square from : singlePushers)
+            {
+                const Square to = from + oneForward;
+                f(Move::normal(from, to));
+            }
+
+            // Non-promoting captures. Targets on the last rank are left out
+            // because capturing them is a promotion, handled further down.
+            const Bitboard promotionRank = isWhite ? bb::rank8 : bb::rank1;
+            const Bitboard capturable = enemies & ~promotionRank;
+            const FlatSquareOffset captureWest = isWhite ? FlatSquareOffset(-1, 1) : FlatSquareOffset(-1, -1);
+            const FlatSquareOffset captureEast = isWhite ? FlatSquareOffset(1, 1) : FlatSquareOffset(1, -1);
+
+            // Looking from the target with the opponent's pawn attack pattern
+            // yields the squares from which our pawns can capture it.
+            const Bitboard westCapturers = bb::eastPawnAttacks(capturable, !SideToMoveV) & ourPawns;
+            const Bitboard eastCapturers = bb::westPawnAttacks(capturable, !SideToMoveV) & ourPawns;
+
+            for (Square from : westCapturers)
+            {
+                f(Move::normal(from, from + captureWest));
+            }
+
+            for (Square from : eastCapturers)
+            {
+                f(Move::normal(from, from + captureEast));
+            }
+
+            // En passant.
+            if (epSq != Square::none())
+            {
+                const Bitboard epCapturers = bb::pawnAttacks(Bitboard::square(epSq), !SideToMoveV) & ourPawns;
+                for (Square from : epCapturers)
+                {
+                    f(Move::enPassant(from, epSq));
+                }
+            }
+
+            // Promotions: first by capture, then by push, for each pawn.
+            const auto emitPromotions = [&f](Square from, Square to) {
+                for (PieceType pt : { PieceType::Knight, PieceType::Bishop, PieceType::Rook, PieceType::Queen })
+                {
+                    Move move{ from, to, MoveType::Promotion, Piece(pt, SideToMoveV) };
+                    f(move);
+                }
+            };
+
+            for (Square from : ourPawns & promotionStartRank)
+            {
+                const Bitboard captures = bb::pawnAttacks(Bitboard::square(from), SideToMoveV) & enemies;
+                for (Square to : captures)
+                {
+                    emitPromotions(from, to);
+                }
+
+                const Square ahead = from + oneForward;
+                if (!occupied.isSet(ahead))
+                {
+                    emitPromotions(from, ahead);
+                }
+            }
+        }
+
+        // Runtime dispatch to the color-specialized pawn move generator,
+        // chosen by the position's side to move.
+        template <typename FuncT>
+        inline void forEachPseudoLegalPawnMove(const Position& pos, FuncT&& f)
+        {
+            if (pos.sideToMove() != Color::White)
+            {
+                forEachPseudoLegalPawnMove<Color::Black>(pos, std::forward<FuncT>(f));
+                return;
+            }
+
+            forEachPseudoLegalPawnMove<Color::White>(pos, std::forward<FuncT>(f));
+        }
+
+        // Calls `f` for every pseudo-legal move of the `PieceTypeV` piece on `from`.
+        // Pawns are forwarded to the pawn generator; for other piece types the
+        // destinations are the attacked squares not occupied by own pieces.
+        // Castling is not generated here.
+        template <PieceType PieceTypeV, typename FuncT>
+        inline void forEachPseudoLegalPieceMove(const Position& pos, Square from, FuncT&& f)
+        {
+            static_assert(PieceTypeV != PieceType::None);
+
+            if constexpr (PieceTypeV != PieceType::Pawn)
+            {
+                const Color us = pos.sideToMove();
+                const Bitboard friends = pos.piecesBB(us);
+                const Bitboard enemies = pos.piecesBB(!us);
+                const Bitboard occupied = friends | enemies;
+                const Bitboard destinations = bb::attacks<PieceTypeV>(from, occupied) & ~friends;
+
+                for (Square to : destinations)
+                {
+                    Move move{ from, to };
+                    f(move);
+                }
+            }
+            else
+            {
+                forEachPseudoLegalPawnMove(pos, from, f);
+            }
+        }
+
+        // Calls `f` for every pseudo-legal move of all `PieceTypeV` pieces of
+        // the side to move. Pawns are forwarded to the pawn generator.
+        // Castling is not generated here.
+        template <PieceType PieceTypeV, typename FuncT>
+        inline void forEachPseudoLegalPieceMove(const Position& pos, FuncT&& f)
+        {
+            static_assert(PieceTypeV != PieceType::None);
+
+            if constexpr (PieceTypeV != PieceType::Pawn)
+            {
+                const Color us = pos.sideToMove();
+                const Bitboard friends = pos.piecesBB(us);
+                const Bitboard enemies = pos.piecesBB(!us);
+                const Bitboard occupied = friends | enemies;
+                const Bitboard movers = pos.piecesBB(Piece(PieceTypeV, us));
+
+                for (Square from : movers)
+                {
+                    const Bitboard destinations = bb::attacks<PieceTypeV>(from, occupied) & ~friends;
+                    for (Square to : destinations)
+                    {
+                        Move move{ from, to };
+                        f(move);
+                    }
+                }
+            }
+            else
+            {
+                forEachPseudoLegalPawnMove(pos, f);
+            }
+        }
+
+        // Calls `f` for every castling move available to the side to move.
+        // A castling is reported only when the side still has the right, the
+        // castling path is empty, the king is not in check, and neither the
+        // square the king passes nor its destination is attacked.
+        template <typename FuncT>
+        inline void forEachCastlingMove(const Position& pos, FuncT&& f)
+        {
+            CastlingRights candidates = pos.castlingRights();
+            if (candidates == CastlingRights::None)
+            {
+                return;
+            }
+
+            const Color us = pos.sideToMove();
+            const Color them = !us;
+            const Bitboard friends = pos.piecesBB(us);
+            const Bitboard enemies = pos.piecesBB(them);
+            const Bitboard occupied = friends | enemies;
+
+            const auto pathBlocked = [&occupied](Color color, CastleType type) {
+                return (CastlingTraits::castlingPath[color][type] & occupied).any();
+            };
+
+            // Cheap filter first: drop castlings whose path is blocked and
+            // all rights that belong to the other side.
+            if (us == Color::White)
+            {
+                if (pathBlocked(Color::White, CastleType::Short)) candidates &= ~CastlingRights::WhiteKingSide;
+                if (pathBlocked(Color::White, CastleType::Long)) candidates &= ~CastlingRights::WhiteQueenSide;
+                candidates &= ~CastlingRights::Black;
+            }
+            else
+            {
+                if (pathBlocked(Color::Black, CastleType::Short)) candidates &= ~CastlingRights::BlackKingSide;
+                if (pathBlocked(Color::Black, CastleType::Long)) candidates &= ~CastlingRights::BlackQueenSide;
+                candidates &= ~CastlingRights::White;
+            }
+
+            if (candidates == CastlingRights::None)
+            {
+                return;
+            }
+
+            // Castling out of check is not allowed. This test is relatively
+            // expensive, so it runs only after the cheap filters above.
+            const Square kingSq = pos.kingSquare(us);
+            if (pos.isSquareAttacked(kingSq, them))
+            {
+                return;
+            }
+
+            for (CastleType type : values<CastleType>())
+            {
+                const CastlingRights needed = CastlingTraits::castlingRights[us][type];
+                if (!contains(candidates, needed))
+                {
+                    continue;
+                }
+
+                // The king may neither pass through nor land on an attacked square.
+                // The current occupancy can be used for the destination test: the
+                // pieces moved by castling could only matter for a slider that
+                // would already be attacking the king.
+                const Square crossed = CastlingTraits::squarePassedByKing[us][type];
+                const Square landing = CastlingTraits::kingDestination[us][type];
+                if (pos.isSquareAttacked(crossed, them) || pos.isSquareAttacked(landing, them))
+                {
+                    continue;
+                }
+
+                Move move = Move::castle(type, us);
+                f(move);
+            }
+        }
+
+        // Calls a given function for all pseudo legal moves for the position.
+        // `pos` must be a legal chess position
+        // Moves are reported grouped by piece type in the order pawn, knight,
+        // bishop, rook, queen, king, followed by castling moves.
+        template <typename FuncT>
+        inline void forEachPseudoLegalMove(const Position& pos, FuncT&& func)
+        {
+            forEachPseudoLegalPieceMove<PieceType::Pawn>(pos, func);
+            forEachPseudoLegalPieceMove<PieceType::Knight>(pos, func);
+            forEachPseudoLegalPieceMove<PieceType::Bishop>(pos, func);
+            forEachPseudoLegalPieceMove<PieceType::Rook>(pos, func);
+            forEachPseudoLegalPieceMove<PieceType::Queen>(pos, func);
+            forEachPseudoLegalPieceMove<PieceType::King>(pos, func);
+            // Castling is generated separately from ordinary king moves.
+            forEachCastlingMove(pos, func);
+        }
+
+        // Calls a given function for all legal moves for the position.
+        // Pseudo legal piece moves are generated in the order pawn, knight,
+        // bishop, rook, queen, king and only those accepted by the position's
+        // MoveLegalityChecker are forwarded to `func`.
+        // `pos` must be a legal chess position
+        template <typename FuncT>
+        inline void forEachLegalMove(const Position& pos, FuncT&& func)
+        {
+            // The checker is created once and copied into the lambda, so the
+            // per-position work it does is shared by every candidate move.
+            auto funcIfLegal = [&func, checker = pos.moveLegalityChecker()](Move move) {
+                if (checker.isPseudoLegalMoveLegal(move))
+                {
+                    func(move);
+                }
+            };
+
+            forEachPseudoLegalPieceMove<PieceType::Pawn>(pos, funcIfLegal);
+            forEachPseudoLegalPieceMove<PieceType::Knight>(pos, funcIfLegal);
+            forEachPseudoLegalPieceMove<PieceType::Bishop>(pos, funcIfLegal);
+            forEachPseudoLegalPieceMove<PieceType::Rook>(pos, funcIfLegal);
+            forEachPseudoLegalPieceMove<PieceType::Queen>(pos, funcIfLegal);
+            forEachPseudoLegalPieceMove<PieceType::King>(pos, funcIfLegal);
+            // Castling moves go straight to `func` without the legality filter:
+            // forEachCastlingMove already rejects castling when the king is in
+            // check or would pass through / land on an attacked square.
+            forEachCastlingMove(pos, func);
+        }
+
+        // Generates all pseudo legal moves for the position.
+        // The moves are returned by value in a std::vector; the result must
+        // not be discarded.
+        // `pos` must be a legal chess position
+        [[nodiscard]] std::vector<Move> generatePseudoLegalMoves(const Position& pos);
+
+        // Generates all legal moves for the position.
+        // The moves are returned by value in a std::vector; the result must
+        // not be discarded.
+        // `pos` must be a legal chess position
+        [[nodiscard]] std::vector<Move> generateLegalMoves(const Position& pos);
+    }
+
+    // Returns true when the king of the side to move stands on a square
+    // attacked by the other side.
+    [[nodiscard]] inline bool Position::isCheck() const
+    {
+        return BaseType::isSquareAttacked(kingSquare(m_sideToMove), !m_sideToMove);
+    }
+
+    // Returns the pieces of the side not to move that BaseType::attackers
+    // reports as attacking the king of the side to move.
+    [[nodiscard]] inline Bitboard Position::checkers() const
+    {
+        return BaseType::attackers(kingSquare(m_sideToMove), !m_sideToMove);
+    }
+
+    // Returns true when, once `move` has been played by the side to move,
+    // the king of the opposing side is attacked by the side that moved,
+    // i.e. when the move gives check.
+    [[nodiscard]] inline bool Position::isCheckAfterMove(Move move) const
+    {
+        return BaseType::isSquareAttackedAfterMove(move, kingSquare(!m_sideToMove), m_sideToMove);
+    }
+
+    // Full legality test for an arbitrary move: the move first has to be
+    // pseudo legal, and only then is it checked with the pseudo-legal-to-legal
+    // test.
+    [[nodiscard]] inline bool Position::isMoveLegal(Move move) const
+    {
+        if (!isMovePseudoLegal(move))
+        {
+            return false;
+        }
+
+        return isPseudoLegalMoveLegal(move);
+    }
+
+    // Legality test for a move that is already known to be pseudo legal.
+    // Castling moves are accepted as they are; any other move is legal
+    // exactly when it does not leave the mover's own king attacked.
+    [[nodiscard]] inline bool Position::isPseudoLegalMoveLegal(Move move) const
+    {
+        if (move.type == MoveType::Castle)
+        {
+            return true;
+        }
+
+        return !isOwnKingAttackedAfterMove(move);
+    }
+
+    // Decides whether `move` is a pseudo legal move in this position.
+    // The move must connect two distinct valid squares, may only name a
+    // promoted piece when it is a promotion, and must start on a piece of
+    // the side to move. Pawn and castling moves are validated by looking
+    // for the move among the generated ones; all other pieces are validated
+    // against their attack sets.
+    [[nodiscard]] inline bool Position::isMovePseudoLegal(Move move) const
+    {
+        // Structural sanity: both squares valid and different.
+        const bool squaresOk = move.from.isOk() && move.to.isOk();
+        if (!squaresOk || move.from == move.to)
+        {
+            return false;
+        }
+
+        // Only promotions may carry a promoted piece.
+        if (move.type != MoveType::Promotion && move.promotedPiece != Piece::none())
+        {
+            return false;
+        }
+
+        // The origin square has to hold a piece of the side to move.
+        const Piece mover = pieceAt(move.from);
+        if (mover == Piece::none() || mover.color() != m_sideToMove)
+        {
+            return false;
+        }
+
+        // Callback shared by the generator based checks below; it records
+        // whether the generator produced exactly `move`.
+        bool generated = false;
+        auto noteIfSame = [&generated, &move](const Move& candidate) {
+            if (move == candidate)
+            {
+                generated = true;
+            }
+        };
+
+        const Bitboard allPieces = piecesBB();
+        const Bitboard notOurPieces = ~piecesBB(m_sideToMove);
+        const bool plainMove = (move.type == MoveType::Normal);
+
+        switch (mover.type())
+        {
+        case PieceType::Pawn:
+            // TODO: use iterators so we don't loop over all moves
+            //       when we can avoid it.
+            movegen::forEachPseudoLegalPawnMove(*this, move.from, noteIfSame);
+            return generated;
+
+        case PieceType::Knight:
+            return plainMove && (bb::pseudoAttacks<PieceType::Knight>(move.from) & notOurPieces).isSet(move.to);
+
+        case PieceType::Bishop:
+            return plainMove && (bb::attacks<PieceType::Bishop>(move.from, allPieces) & notOurPieces).isSet(move.to);
+
+        case PieceType::Rook:
+            return plainMove && (bb::attacks<PieceType::Rook>(move.from, allPieces) & notOurPieces).isSet(move.to);
+
+        case PieceType::Queen:
+            return plainMove && (bb::attacks<PieceType::Queen>(move.from, allPieces) & notOurPieces).isSet(move.to);
+
+        case PieceType::King:
+            if (move.type == MoveType::Castle)
+            {
+                movegen::forEachCastlingMove(*this, noteIfSame);
+                return generated;
+            }
+            return plainMove && (bb::pseudoAttacks<PieceType::King>(move.from) & notOurPieces).isSet(move.to);
+
+        default:
+            return false;
+        }
+    }
+
+    // Returns the pieces (of either color) that are the only occupant of
+    // the squares between the king of `color` and an enemy bishop, rook or
+    // queen selected by the king's bishop/rook pseudo-attack masks.
+    [[nodiscard]] inline Bitboard Position::blockersForKing(Color color) const
+    {
+        const Color enemy = !color;
+        const Square kingSq = kingSquare(color);
+
+        const Bitboard enemyQueens = piecesBB(Piece(PieceType::Queen, enemy));
+        const Bitboard enemyDiagonalSliders = piecesBB(Piece(PieceType::Bishop, enemy)) | enemyQueens;
+        const Bitboard enemyStraightSliders = piecesBB(Piece(PieceType::Rook, enemy)) | enemyQueens;
+
+        // Sliders lined up with the king, regardless of what stands in between.
+        const Bitboard alignedSliders =
+            (bb::pseudoAttacks<PieceType::Bishop>(kingSq) & enemyDiagonalSliders)
+            | (bb::pseudoAttacks<PieceType::Rook>(kingSq) & enemyStraightSliders);
+
+        const Bitboard allPieces = piecesBB();
+
+        Bitboard result = Bitboard::none();
+        for (Square slider : alignedSliders)
+        {
+            const Bitboard inBetween = bb::between(slider, kingSq) & allPieces;
+            if (!inBetween.exactlyOne())
+            {
+                // Nothing in between, or more than one piece shields the king.
+                continue;
+            }
+
+            result |= inBetween;
+        }
+
+        return result;
+    }
+
+    // Precomputes, for the side to move of `position`, the checking pieces,
+    // the own pieces that shield the king from an aligned enemy slider, the
+    // king square, and the set of destination squares that can resolve a
+    // single check without moving the king.
+    inline MoveLegalityChecker::MoveLegalityChecker(const Position& position) :
+        m_position(&position),
+        m_checkers(position.checkers()),
+        m_ourBlockersForKing(
+            position.blockersForKing(position.sideToMove())
+            & position.piecesBB(position.sideToMove())
+        ),
+        m_ksq(position.kingSquare(position.sideToMove()))
+    {
+        if (!m_checkers.exactlyOne())
+        {
+            // Either there is no check at all, or it is a double check
+            // and the king has to move. No square resolves a check.
+            m_potentialCheckRemovals = Bitboard::none();
+            return;
+        }
+
+        const Bitboard checkingKnight = m_checkers & bb::pseudoAttacks<PieceType::Knight>(m_ksq);
+        if (checkingKnight.any())
+        {
+            // A knight check cannot be blocked: capture it or move the king.
+            m_potentialCheckRemovals = checkingKnight;
+        }
+        else
+        {
+            // Any other single checker can be captured or blocked on the
+            // squares between it and the king.
+            m_potentialCheckRemovals = bb::between(m_ksq, m_checkers.first()) | m_checkers;
+        }
+    }
+
+    // Legality test for a pseudo legal move, using the data cached at
+    // construction. King moves always go through the position's own test.
+    // En passant goes through it too when in check, and through the
+    // discovered-attack test when not in check. Everything else is answered
+    // from the cached bitboards.
+    [[nodiscard]] inline bool MoveLegalityChecker::isPseudoLegalMoveLegal(const Move& move) const
+    {
+        // Moves of the king are delegated whether or not we are in check.
+        if (move.from == m_ksq)
+        {
+            return m_position->isPseudoLegalMoveLegal(move);
+        }
+
+        const bool inCheck = m_checkers.any();
+
+        if (move.type == MoveType::EnPassant)
+        {
+            return inCheck
+                ? m_position->isPseudoLegalMoveLegal(move)
+                : !m_position->createsDiscoveredAttackOnOwnKing(move);
+        }
+
+        const bool shieldsKing = m_ourBlockersForKing.isSet(move.from);
+
+        if (inCheck)
+        {
+            // The move has to land on a square that blocks the single
+            // check or captures the checker, and the moved piece must not
+            // be shielding the king from another slider.
+            return m_potentialCheckRemovals.isSet(move.to) && !shieldsKing;
+        }
+
+        // Not in check: a shielding piece may only move along the line
+        // through it and the king, otherwise it's a discovered check.
+        return !shieldsKing || bb::line(m_ksq, move.from).isSet(move.to);
+    }
+
+    // Compile-time guarantees about CompressedPosition: it occupies exactly
+    // 24 bytes and can be copied bytewise.
+    // NOTE(review): presumably 8 bytes of occupancy plus 16 bytes of packed
+    // nibbles - confirm against the class definition.
+    static_assert(sizeof(CompressedPosition) == 24);
+    static_assert(std::is_trivially_copyable_v<CompressedPosition>);
+
+    // Helpers that turn one piece on one square into the 4 bit code stored
+    // in a CompressedPosition. Codes produced here:
+    //   ordinal(piece) - a piece with no extra state attached
+    //   12             - a pawn on the en passant file that stands on rank 4
+    //                    with black to move or on rank 5 with white to move
+    //   13 / 14        - white / black rook on its corner square whose
+    //                    castling right is still present
+    //   10             - white king
+    //   11 / 15        - black king with white / black to move
+    namespace detail
+    {
+        // Pieces that carry no additional state are stored as their ordinal.
+        [[nodiscard]] FORCEINLINE constexpr std::uint8_t compressOrdinaryPiece(const Position&, Square, Piece piece)
+        {
+            return static_cast<std::uint8_t>(ordinal(piece));
+        }
+
+        // Pawns are stored as their ordinal unless they are the pawn tied
+        // to the current en passant square, which is stored as 12.
+        [[nodiscard]] FORCEINLINE constexpr std::uint8_t compressPawn(const Position& position, Square sq, Piece piece)
+        {
+            const Square epSquare = position.epSquare();
+            if (epSquare != Square::none())
+            {
+                const Color sideToMove = position.sideToMove();
+                const Rank rank = sq.rank();
+                // use bitwise operators, there is a lot of unpredictable branches but in
+                // total the result is quite predictable
+                const bool onEpRank =
+                    ((rank == rank4) & (sideToMove == Color::Black))
+                    | ((rank == rank5) & (sideToMove == Color::White));
+
+                if ((sq.file() == epSquare.file()) && onEpRank)
+                {
+                    return 12;
+                }
+            }
+
+            return static_cast<std::uint8_t>(ordinal(piece));
+        }
+
+        // Rooks standing on a corner whose castling right is still present
+        // are stored as 13 (white) or 14 (black), every other rook as its
+        // ordinal.
+        [[nodiscard]] FORCEINLINE constexpr std::uint8_t compressRook(const Position& position, Square sq, Piece piece)
+        {
+            const CastlingRights rights = position.castlingRights();
+            const Color color = piece.color();
+
+            if (color == Color::White)
+            {
+                if ((sq == a1 && contains(rights, CastlingRights::WhiteQueenSide))
+                    || (sq == h1 && contains(rights, CastlingRights::WhiteKingSide)))
+                {
+                    return 13;
+                }
+            }
+            else if (color == Color::Black)
+            {
+                if ((sq == a8 && contains(rights, CastlingRights::BlackQueenSide))
+                    || (sq == h8 && contains(rights, CastlingRights::BlackKingSide)))
+                {
+                    return 14;
+                }
+            }
+
+            return static_cast<std::uint8_t>(ordinal(piece));
+        }
+
+        // The white king is always stored as 10. The black king additionally
+        // records the side to move: 11 with white to move, 15 with black to move.
+        [[nodiscard]] FORCEINLINE constexpr std::uint8_t compressKing(const Position& position, Square /* sq */, Piece piece)
+        {
+            if (piece.color() == Color::White)
+            {
+                return 10;
+            }
+
+            return position.sideToMove() == Color::White ? 11 : 15;
+        }
+    }
+
+    namespace detail::lookup
+    {
+        // Dispatch table from piece type to the function that produces the
+        // 4 bit code for a piece of that type.
+        static constexpr EnumArray<PieceType, std::uint8_t(*)(const Position&, Square, Piece)> pieceCompressorFunc = []() {
+            EnumArray<PieceType, std::uint8_t(*)(const Position&, Square, Piece)> table{};
+
+            // Piece types that need position dependent handling.
+            table[PieceType::Pawn] = detail::compressPawn;
+            table[PieceType::Rook] = detail::compressRook;
+            table[PieceType::King] = detail::compressKing;
+
+            // Piece types stored as their plain ordinal.
+            table[PieceType::Knight] = detail::compressOrdinaryPiece;
+            table[PieceType::Bishop] = detail::compressOrdinaryPiece;
+            table[PieceType::Queen] = detail::compressOrdinaryPiece;
+
+            // Placeholder so that every slot holds a callable function.
+            table[PieceType::None] = [](const Position&, Square, Piece) -> std::uint8_t { /* should never happen */ return 0; };
+
+            return table;
+        }();
+    }
+
+    // Packs this position into a CompressedPosition: the occupancy bitboard
+    // plus one 4 bit code per occupied square, taken in the bitboard's
+    // iteration order, two codes per byte with the earlier square in the
+    // low nibble.
+    [[nodiscard]] inline CompressedPosition Position::compress() const
+    {
+        // Pawns are handled by a direct call because they are the most
+        // likely piece; everything else goes through the dispatch table.
+        const auto encode = [this](Square sq) -> std::uint8_t {
+            const Piece piece = pieceAt(sq);
+            if (piece.type() != PieceType::Pawn)
+            {
+                return detail::lookup::pieceCompressorFunc[piece.type()](*this, sq, piece);
+            }
+
+            return detail::compressPawn(*this, sq, piece);
+        };
+
+        const Bitboard occ = piecesBB();
+
+        CompressedPosition compressed;
+        compressed.m_occupied = occ;
+
+        int slot = 0;
+        for (Square sq : occ)
+        {
+            const std::uint8_t nibble = encode(sq);
+            if (slot % 2 == 0)
+            {
+                // First piece of a pair initialises the byte.
+                compressed.m_packedState[slot / 2] = nibble;
+            }
+            else
+            {
+                // Second piece of a pair goes into the high nibble.
+                compressed.m_packedState[slot / 2] |= nibble << 4;
+            }
+            ++slot;
+        }
+
+        return compressed;
+    }
+
+    // Rebuilds a Position from the occupancy bitboard and the packed 4 bit
+    // codes. Codes 0-11 are plain piece ordinals. Codes 12-15 also restore
+    // the en passant square, a castling right, or black as the side to move.
+    [[nodiscard]] inline Position CompressedPosition::decompress() const
+    {
+        Position pos;
+        pos.setCastlingRights(CastlingRights::None);
+
+        const auto restore = [&pos](Square sq, std::uint8_t nibble) {
+            if (nibble < 12)
+            {
+                // Plain piece, stored as its ordinal.
+                pos.place(fromOrdinal<Piece>(nibble), sq);
+                return;
+            }
+
+            switch (nibble)
+            {
+            case 12:
+                // Pawn tied to the en passant square, which lies one rank
+                // behind the pawn.
+                if (sq.rank() == rank4)
+                {
+                    pos.place(whitePawn, sq);
+                    pos.setEpSquareUnchecked(sq + Offset{ 0, -1 });
+                }
+                else // (rank == rank5)
+                {
+                    pos.place(blackPawn, sq);
+                    pos.setEpSquareUnchecked(sq + Offset{ 0, 1 });
+                }
+                break;
+
+            case 13:
+                // White rook that still carries a castling right.
+                pos.place(whiteRook, sq);
+                pos.addCastlingRights(
+                    sq == a1
+                    ? CastlingRights::WhiteQueenSide
+                    : CastlingRights::WhiteKingSide // (sq == H1)
+                );
+                break;
+
+            case 14:
+                // Black rook that still carries a castling right.
+                pos.place(blackRook, sq);
+                pos.addCastlingRights(
+                    sq == a8
+                    ? CastlingRights::BlackQueenSide
+                    : CastlingRights::BlackKingSide // (sq == H8)
+                );
+                break;
+
+            case 15:
+                // Black king, with black to move.
+                pos.place(blackKing, sq);
+                pos.setSideToMove(Color::Black);
+                break;
+            }
+        };
+
+        const Bitboard occ = m_occupied;
+
+        int slot = 0;
+        for (Square sq : occ)
+        {
+            const auto packed = m_packedState[slot / 2];
+            // Even slots live in the low nibble, odd slots in the high one.
+            restore(sq, (slot % 2 == 0) ? (packed & 0xF) : (packed >> 4));
+            ++slot;
+        }
+
+        return pos;
+    }
+
+
+    // Returns true when at least one piece of `attackerColor` attacks `sq`.
+    // Sliders are tested first, then the king, the knights and the pawns.
+    [[nodiscard]] bool Board::isSquareAttacked(Square sq, Color attackerColor) const
+    {
+        assert(sq.isOk());
+
+        {
+            const Bitboard bishops = piecesBB(Piece(PieceType::Bishop, attackerColor));
+            const Bitboard rooks = piecesBB(Piece(PieceType::Rook, attackerColor));
+            const Bitboard queens = piecesBB(Piece(PieceType::Queen, attackerColor));
+            const Bitboard sliders = (bishops | rooks | queens);
+
+            // Cheap pre-filter: the full slider test only runs when some
+            // slider is on a queen line through `sq`.
+            if ((bb::pseudoAttacks<PieceType::Queen>(sq) & sliders).any()
+                && bb::isAttackedBySlider(sq, bishops, rooks, queens, piecesBB()))
+            {
+                return true;
+            }
+        }
+
+        const Bitboard enemyKing = piecesBB(Piece(PieceType::King, attackerColor));
+        if ((bb::pseudoAttacks<PieceType::King>(sq) & enemyKing).any())
+        {
+            return true;
+        }
+
+        const Bitboard enemyKnights = piecesBB(Piece(PieceType::Knight, attackerColor));
+        if ((bb::pseudoAttacks<PieceType::Knight>(sq) & enemyKnights).any())
+        {
+            return true;
+        }
+
+        const Bitboard enemyPawns = piecesBB(Piece(PieceType::Pawn, attackerColor));
+        return bb::pawnAttacks(enemyPawns, attackerColor).isSet(sq);
+    }
+
+    // Returns true when `sq` would be attacked by a piece of `attackerColor`
+    // once `move` has been played. The board itself is not modified; the
+    // occupancy and the attacker bitboards are adjusted on local copies.
+    [[nodiscard]] bool Board::isSquareAttackedAfterMove(Move move, Square sq, Color attackerColor) const
+    {
+        // Both squares touched by the move; XOR-ing this into a piece
+        // bitboard relocates the moved piece from `from` to `to`.
+        const Bitboard occupiedChange = Bitboard::square(move.from) | move.to;
+
+        // Occupancy after the move: origin vacated, destination occupied.
+        Bitboard occupied = (piecesBB() ^ move.from) | move.to;
+
+        Bitboard bishops = piecesBB(Piece(PieceType::Bishop, attackerColor));
+        Bitboard rooks = piecesBB(Piece(PieceType::Rook, attackerColor));
+        Bitboard queens = piecesBB(Piece(PieceType::Queen, attackerColor));
+        Bitboard king = piecesBB(Piece(PieceType::King, attackerColor));
+        Bitboard knights = piecesBB(Piece(PieceType::Knight, attackerColor));
+        Bitboard pawns = piecesBB(Piece(PieceType::Pawn, attackerColor));
+
+        if (move.type == MoveType::EnPassant)
+        {
+            // The captured pawn stands on the destination file and the
+            // origin rank, not on the destination square.
+            const Square capturedPawnSq(move.to.file(), move.from.rank());
+            occupied ^= capturedPawnSq;
+            pawns ^= capturedPawnSq;
+        }
+        else if (pieceAt(move.to) != Piece::none())
+        {
+            // A piece on the destination square disappears from the
+            // attacker sets (the king set is left untouched).
+            // NOTE(review): for a castling move `move.to` is the castling
+            // rook's square (see the King case below), so this branch
+            // presumably also runs for castling and clears that rook from
+            // `rooks` when attackerColor is the castling side, before the
+            // King case toggles the same bit again - confirm this is intended.
+            const Bitboard notCaptured = ~Bitboard::square(move.to);
+            bishops &= notCaptured;
+            rooks &= notCaptured;
+            queens &= notCaptured;
+            knights &= notCaptured;
+            pawns &= notCaptured;
+        }
+
+        // Potential attackers may have moved.
+        const Piece movedPiece = pieceAt(move.from);
+        if (movedPiece.color() == attackerColor)
+        {
+            switch (movedPiece.type())
+            {
+            case PieceType::Pawn:
+                pawns ^= occupiedChange;
+                break;
+            case PieceType::Knight:
+                knights ^= occupiedChange;
+                break;
+            case PieceType::Bishop:
+                bishops ^= occupiedChange;
+                break;
+            case PieceType::Rook:
+                rooks ^= occupiedChange;
+                break;
+            case PieceType::Queen:
+                queens ^= occupiedChange;
+                break;
+            case PieceType::King:
+            {
+                if (move.type == MoveType::Castle)
+                {
+                    // Castling relocates two pieces: the king from
+                    // `move.from` and the rook from `move.to`, each to the
+                    // destination given by CastlingTraits.
+                    const CastleType castleType = CastlingTraits::moveCastlingType(move);
+
+                    king ^= move.from;
+                    king ^= CastlingTraits::kingDestination[attackerColor][castleType];
+                    rooks ^= move.to;
+                    rooks ^= CastlingTraits::rookDestination[attackerColor][castleType];
+                }
+                else
+                {
+                    king ^= occupiedChange;
+                }
+
+                break;
+            }
+            case PieceType::None:
+                assert(false);
+            }
+        }
+
+        // If it's a castling move then the change in square occupation
+        // cannot have an effect because otherwise there would be
+        // a slider attacker attacking the castling king.
+        // (It could have an effect in chess960 if the slider
+        // attacker was behind the rook involved in castling,
+        // but we don't care about chess960.)
+
+        // Cheap pre-filter: the full slider test only runs when some
+        // slider is on a queen line through `sq`.
+        const Bitboard allSliders = (bishops | rooks | queens);
+        if ((bb::pseudoAttacks<PieceType::Queen>(sq) & allSliders).any())
+        {
+            if (bb::isAttackedBySlider(
+                sq,
+                bishops,
+                rooks,
+                queens,
+                occupied
+            ))
+            {
+                return true;
+            }
+        }
+
+        if ((bb::pseudoAttacks<PieceType::King>(sq) & king).any())
+        {
+            return true;
+        }
+
+        if ((bb::pseudoAttacks<PieceType::Knight>(sq) & knights).any())
+        {
+            return true;
+        }
+
+        const Bitboard pawnAttacks = bb::pawnAttacks(pawns, attackerColor);
+
+        return pawnAttacks.isSet(sq);
+    }
+
+    // Returns true when, after `move`, an enemy bishop, rook or queen
+    // attacks the king of the side that made the move. Only sliders are
+    // considered; the king's square is taken from the current board.
+    [[nodiscard]] bool Board::createsDiscoveredAttackOnOwnKing(Move move) const
+    {
+        const Color kingColor = pieceAt(move.from).color();
+        const Color attackerColor = !kingColor;
+        const Square ksq = kingSquare(kingColor);
+
+        // Occupancy after the move: origin vacated, destination occupied.
+        Bitboard occupiedAfter = (piecesBB() ^ move.from) | move.to;
+
+        Bitboard enemyBishops = piecesBB(Piece(PieceType::Bishop, attackerColor));
+        Bitboard enemyRooks = piecesBB(Piece(PieceType::Rook, attackerColor));
+        Bitboard enemyQueens = piecesBB(Piece(PieceType::Queen, attackerColor));
+
+        if (move.type == MoveType::EnPassant)
+        {
+            // The captured pawn is on the destination file, origin rank.
+            const Square capturedPawnSq(move.to.file(), move.from.rank());
+            occupiedAfter ^= capturedPawnSq;
+        }
+        else if (pieceAt(move.to) != Piece::none())
+        {
+            // A slider standing on the destination square is gone.
+            const Bitboard survivors = ~Bitboard::square(move.to);
+            enemyBishops &= survivors;
+            enemyRooks &= survivors;
+            enemyQueens &= survivors;
+        }
+
+        // Cheap pre-filter first, full slider test only when needed.
+        const Bitboard enemySliders = (enemyBishops | enemyRooks | enemyQueens);
+        return (bb::pseudoAttacks<PieceType::Queen>(ksq) & enemySliders).any()
+            && bb::isAttackedBySlider(ksq, enemyBishops, enemyRooks, enemyQueens, occupiedAfter);
+    }
+
+    // Returns true when `sq` holds a piece and that piece is attacked by
+    // the opposite color. An empty square is never attacked.
+    [[nodiscard]] bool Board::isPieceAttacked(Square sq) const
+    {
+        const Piece occupant = pieceAt(sq);
+
+        return occupant != Piece::none()
+            && isSquareAttacked(sq, !occupant.color());
+    }
+
+    // Returns true when the piece currently on `sq` would be attacked by
+    // the opposite color after `move`. If `move` moves that very piece, the
+    // piece is followed to its destination square.
+    [[nodiscard]] bool Board::isPieceAttackedAfterMove(Move move, Square sq) const
+    {
+        const Piece occupant = pieceAt(sq);
+        if (occupant == Piece::none())
+        {
+            return false;
+        }
+
+        const bool followsMovedPiece = (sq == move.from);
+
+        // Castling is the one move where the piece does not end up on
+        // move.to. Pseudo legal castling moves are already legal, so the
+        // king cannot be in check afterwards.
+        if (followsMovedPiece && move.type == MoveType::Castle)
+        {
+            return false;
+        }
+
+        if (followsMovedPiece)
+        {
+            // The piece we're interested in now stands on the destination.
+            sq = move.to;
+        }
+
+        return isSquareAttackedAfterMove(move, sq, !occupant.color());
+    }
+
+    // Returns true when `move` leaves the king of the moving side attacked.
+    // The moving side is the color of the piece on `move.from`.
+    [[nodiscard]] bool Board::isOwnKingAttackedAfterMove(Move move) const
+    {
+        if (move.type != MoveType::Castle)
+        {
+            const Color ownColor = pieceAt(move.from).color();
+            return isPieceAttackedAfterMove(move, kingSquare(ownColor));
+        }
+
+        // Pseudo legal castling moves are already legal.
+        // This is ensured by the move generator.
+        return false;
+    }
+
+    // Returns the squares attacked by the piece standing on `sq`, or an
+    // empty bitboard when the square is empty. Pawns use the color
+    // dependent pawn attack pattern; other pieces use the attack set for
+    // the current occupancy.
+    [[nodiscard]] Bitboard Board::attacks(Square sq) const
+    {
+        const Piece occupant = pieceAt(sq);
+
+        if (occupant == Piece::none())
+        {
+            return Bitboard::none();
+        }
+
+        if (occupant.type() != PieceType::Pawn)
+        {
+            return bb::attacks(occupant.type(), sq, piecesBB());
+        }
+
+        return bb::pawnAttacks(Bitboard::square(sq), occupant.color());
+    }
+
+    // Returns all pieces of `attackerColor` that attack `sq` under the
+    // current occupancy.
+    // En-passant square is not included.
+    [[nodiscard]] Bitboard Board::attackers(Square sq, Color attackerColor) const
+    {
+        const Bitboard allPieces = piecesBB();
+        const Bitboard queens = piecesBB(Piece(PieceType::Queen, attackerColor));
+
+        // Sliders: look outwards from `sq` and keep the matching pieces.
+        const Bitboard diagonalAttackers =
+            bb::attacks<PieceType::Bishop>(sq, allPieces)
+            & (piecesBB(Piece(PieceType::Bishop, attackerColor)) | queens);
+        const Bitboard straightAttackers =
+            bb::attacks<PieceType::Rook>(sq, allPieces)
+            & (piecesBB(Piece(PieceType::Rook, attackerColor)) | queens);
+
+        // Non-sliding pieces.
+        const Bitboard kingAttackers =
+            bb::pseudoAttacks<PieceType::King>(sq)
+            & piecesBB(Piece(PieceType::King, attackerColor));
+        const Bitboard knightAttackers =
+            bb::pseudoAttacks<PieceType::Knight>(sq)
+            & piecesBB(Piece(PieceType::Knight, attackerColor));
+
+        // Pawns: the squares a pawn of the other color would attack from
+        // `sq` are the squares an attacking pawn has to stand on.
+        const Bitboard pawnAttackers =
+            bb::pawnAttacks(Bitboard::square(sq), !attackerColor)
+            & piecesBB(Piece(PieceType::Pawn, attackerColor));
+
+        return diagonalAttackers
+            | straightAttackers
+            | kingAttackers
+            | knightAttackers
+            | pawnAttackers;
+    }
+
+    // Returns a read-only pointer to the first element of the per-square
+    // piece storage (m_pieces).
+    inline const Piece* Board::piecesRaw() const
+    {
+        return m_pieces.data();
+    }
+
+    namespace detail::lookup
+    {
+        // Maps each piece to the character used for it in a FEN string:
+        // upper case for white, lower case for black, 'X' for no piece.
+        static constexpr EnumArray<Piece, char> fenPiece = []() {
+            EnumArray<Piece, char> table{};
+
+            table[whitePawn] = 'P';
+            table[whiteKnight] = 'N';
+            table[whiteBishop] = 'B';
+            table[whiteRook] = 'R';
+            table[whiteQueen] = 'Q';
+            table[whiteKing] = 'K';
+
+            table[blackPawn] = 'p';
+            table[blackKnight] = 'n';
+            table[blackBishop] = 'b';
+            table[blackRook] = 'r';
+            table[blackQueen] = 'q';
+            table[blackKing] = 'k';
+
+            table[Piece::none()] = 'X';
+
+            return table;
+        }();
+    }
+
+    // Builds the piece placement field of a FEN string: ranks from rank 8
+    // down to rank 1 separated by '/', files from A to H within a rank,
+    // runs of empty squares written as their length.
+    [[nodiscard]] inline std::string Board::fen() const
+    {
+        std::string out;
+        out.reserve(96); // longest fen is probably in range of around 88
+
+        std::uint8_t emptyRun = 0;
+
+        // Writes the pending count of empty squares, if any, and resets it.
+        const auto flushEmptyRun = [&out, &emptyRun]() {
+            if (emptyRun != 0)
+            {
+                out.push_back(static_cast<char>(emptyRun) + '0');
+                emptyRun = 0;
+            }
+        };
+
+        Rank rank = rank8;
+        File file = fileA;
+
+        for (;;)
+        {
+            const Piece piece = m_pieces[Square(file, rank)];
+
+            if (piece != Piece::none())
+            {
+                flushEmptyRun();
+                out.push_back(detail::lookup::fenPiece[piece]);
+            }
+            else
+            {
+                ++emptyRun;
+            }
+
+            ++file;
+            if (!(file > fileH))
+            {
+                continue;
+            }
+
+            // End of a rank: emit trailing empties and step one rank down.
+            flushEmptyRun();
+            file = fileA;
+            --rank;
+
+            if (rank < rank1)
+            {
+                break;
+            }
+            out.push_back('/');
+        }
+
+        return out;
+    }
+
+    // Sets the position from `fen`, deliberately ignoring whether parsing
+    // succeeded. Use trySet when the outcome matters.
+    void Position::set(std::string_view fen)
+    {
+        // The cast silences the [[nodiscard]] warning on trySet.
+        (void)trySet(fen);
+    }
+
+    // Sets this position from a FEN string. The fields are separated by
+    // single spaces and are read in this order: piece placement, side to
+    // move, castling rights, en passant square, rule50 counter, full move
+    // number. The two counters may be missing, in which case the rule50
+    // counter and the ply are set to 0.
+    // Returns false if the fen was not valid. This includes counters that
+    // do not start with a decimal integer or do not fit in an int.
+    // If the returned value was false the position
+    // is in unspecified state.
+    [[nodiscard]] bool Position::trySet(std::string_view fen)
+    {
+        // Lazily splits by ' '. Returns empty string views if at the end.
+        auto nextPart = [fen, start = std::size_t{ 0 }]() mutable {
+            std::size_t end = fen.find(' ', start);
+            if (end == std::string::npos)
+            {
+                std::string_view substr = fen.substr(start);
+                start = fen.size();
+                return substr;
+            }
+            else
+            {
+                std::string_view substr = fen.substr(start, end - start);
+                start = end + 1; // to skip whitespace
+                return substr;
+            }
+        };
+
+        // Parses the decimal integer at the start of `part` the way
+        // std::stoi would: leading whitespace and one sign are accepted,
+        // trailing characters are ignored. It never reads past the end of
+        // the view, which need not be null terminated, and it reports
+        // failure through an empty optional instead of an exception.
+        auto tryParseInt = [](std::string_view part) -> std::optional<int> {
+            // Largest int value, spelled without needing extra headers.
+            constexpr long long maxValue = static_cast<long long>(~0u >> 1);
+
+            const auto isSpace = [](char c) { return c == ' ' || (c >= '\t' && c <= '\r'); };
+            const auto isDigit = [](char c) { return c >= '0' && c <= '9'; };
+
+            std::size_t i = 0;
+            while (i < part.size() && isSpace(part[i]))
+            {
+                ++i;
+            }
+
+            bool negative = false;
+            if (i < part.size() && (part[i] == '+' || part[i] == '-'))
+            {
+                negative = (part[i] == '-');
+                ++i;
+            }
+
+            if (i == part.size() || !isDigit(part[i]))
+            {
+                return std::nullopt;
+            }
+
+            long long value = 0;
+            for (; i < part.size() && isDigit(part[i]); ++i)
+            {
+                value = value * 10 + (part[i] - '0');
+                if (value > maxValue)
+                {
+                    return std::nullopt;
+                }
+            }
+
+            return static_cast<int>(negative ? -value : value);
+        };
+
+        if (!BaseType::trySet(nextPart())) return false;
+
+        {
+            const auto side = nextPart();
+            if (side == std::string_view("w")) m_sideToMove = Color::White;
+            else if (side == std::string_view("b")) m_sideToMove = Color::Black;
+            else return false;
+
+            // The side that is not to move must not be in check.
+            if (isSquareAttacked(kingSquare(!m_sideToMove), m_sideToMove)) return false;
+        }
+
+        {
+            const auto castlingRights = nextPart();
+            auto castlingRightsOpt = parser_bits::tryParseCastlingRights(castlingRights);
+            if (!castlingRightsOpt.has_value())
+            {
+                return false;
+            }
+            else
+            {
+                m_castlingRights = *castlingRightsOpt;
+            }
+        }
+
+        {
+            const auto epSquare = nextPart();
+            auto epSquareOpt = parser_bits::tryParseEpSquare(epSquare);
+            if (!epSquareOpt.has_value())
+            {
+                return false;
+            }
+            else
+            {
+                m_epSquare = *epSquareOpt;
+            }
+        }
+
+        {
+            const auto rule50 = nextPart();
+            if (!rule50.empty())
+            {
+                const auto rule50Opt = tryParseInt(rule50);
+                if (!rule50Opt.has_value())
+                {
+                    return false;
+                }
+                m_rule50Counter = *rule50Opt;
+            }
+            else
+            {
+                m_rule50Counter = 0;
+            }
+        }
+
+        {
+            const auto fullMove = nextPart();
+            if (!fullMove.empty())
+            {
+                const auto fullMoveOpt = tryParseInt(fullMove);
+                if (!fullMoveOpt.has_value())
+                {
+                    return false;
+                }
+                // Two plies per full move; white's ply is the earlier one.
+                m_ply = *fullMoveOpt * 2 - (m_sideToMove == Color::White);
+            }
+            else
+            {
+                m_ply = 0;
+            }
+        }
+
+        nullifyEpSquareIfNotPossible();
+
+        return true;
+    }
+
+    // Builds a Position by default-constructing one and feeding `fen`
+    // to Position::set().
+    [[nodiscard]] Position Position::fromFen(std::string_view fen)
+    {
+        Position parsed{};
+        parsed.set(fen);
+        return parsed;
+    }
+
+    // Parses `fen` with trySet(). Yields the resulting position when
+    // trySet() succeeds and an empty optional when it reports failure.
+    [[nodiscard]] std::optional<Position> Position::tryFromFen(std::string_view fen)
+    {
+        Position parsed{};
+        if (!parsed.trySet(fen))
+        {
+            return std::nullopt;
+        }
+        return parsed;
+    }
+
+    // Returns a copy of the standard starting position. The FEN is parsed
+    // once; the result lives in a function-local static.
+    [[nodiscard]] Position Position::startPosition()
+    {
+        static const Position initial = fromFen("rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1");
+        return initial;
+    }
+
+    // Serializes the position as a FEN string: the board part from
+    // Board::fen(), then side to move, castling rights, en passant square,
+    // rule50 counter and full move number, each preceded by one space.
+    [[nodiscard]] std::string Position::fen() const
+    {
+        std::string out = Board::fen();
+
+        out.push_back(' ');
+        out.push_back(m_sideToMove == Color::White ? 'w' : 'b');
+
+        out.push_back(' ');
+        parser_bits::appendCastlingRightsToString(m_castlingRights, out);
+
+        out.push_back(' ');
+        parser_bits::appendEpSquareToString(m_epSquare, out);
+
+        out.push_back(' ');
+        out.append(std::to_string(m_rule50Counter));
+
+        out.push_back(' ');
+        out.append(std::to_string(fullMove()));
+
+        return out;
+    }
+
+    namespace detail::lookup
+    {
+        // Per-square mask of the castling rights that survive a move which
+        // starts from or lands on that square. Position::doMove ANDs the
+        // current rights with the entries for both move.from and move.to.
+        static constexpr EnumArray<Square, CastlingRights> preservedCastlingRights = []() {
+            EnumArray<Square, CastlingRights> table{};
+
+            // Default: touching the square costs nothing.
+            for (CastlingRights& entry : table)
+            {
+                entry = ~CastlingRights::None;
+            }
+
+            // Corner squares each drop a single right.
+            table[a1] = ~CastlingRights::WhiteQueenSide;
+            table[h1] = ~CastlingRights::WhiteKingSide;
+            table[a8] = ~CastlingRights::BlackQueenSide;
+            table[h8] = ~CastlingRights::BlackKingSide;
+
+            // e1 / e8 drop the whole colour-wide mask.
+            table[e1] = ~CastlingRights::White;
+            table[e8] = ~CastlingRights::Black;
+
+            return table;
+        }();
+    }
+
+    // Plays `move` on this position: advances the ply, updates the rule50
+    // counter, castling rights and en passant square, moves the pieces via
+    // BaseType::doMove and flips the side to move. Returns the move, the
+    // captured piece and the previous en passant square and castling
+    // rights bundled as a ReverseMove.
+    inline ReverseMove Position::doMove(const Move& move)
+    {
+        assert(move.from.isOk() && move.to.isOk());
+
+        const PieceType movedPiece = pieceAt(move.from).type();
+        const bool isPawnMove = (movedPiece == PieceType::Pawn);
+
+        m_ply += 1;
+
+        // Pawn moves and moves onto an occupied square restart the
+        // counter, castling never does; everything else advances it.
+        if (move.type != MoveType::Castle && (isPawnMove || pieceAt(move.to) != Piece::none()))
+        {
+            m_rule50Counter = 0;
+        }
+        else
+        {
+            m_rule50Counter += 1;
+        }
+
+        // Saved before being overwritten so they can go into the result.
+        const Square previousEpSquare = m_epSquare;
+        const CastlingRights previousCastlingRights = m_castlingRights;
+        m_castlingRights &= detail::lookup::preservedCastlingRights[move.from];
+        m_castlingRights &= detail::lookup::preservedCastlingRights[move.to];
+
+        m_epSquare = Square::none();
+        // For double pushes the square index differs by 16 or -16.
+        const bool isDoublePush = isPawnMove & ((ordinal(move.to) ^ ordinal(move.from)) == 16);
+        if (isDoublePush)
+        {
+            // The square halfway between origin and destination.
+            const Square candidate = fromOrdinal<Square>((ordinal(move.to) + ordinal(move.from)) >> 1);
+            // Even though the move has not yet been made we can safely
+            // ask this now and get the right result, because the position
+            // of the pawn to be captured is not really relevant.
+            if (isEpPossible(candidate, !m_sideToMove))
+            {
+                m_epSquare = candidate;
+            }
+        }
+
+        const Piece captured = BaseType::doMove(move);
+        m_sideToMove = !m_sideToMove;
+        return { move, captured, previousEpSquare, previousCastlingRights };
+    }
+
+    // Returns a copy of this position with `move` played on it; *this is
+    // not modified.
+    [[nodiscard]] inline Position Position::afterMove(Move move) const
+    {
+        Position next(*this);
+
+        // The ReverseMove is deliberately discarded. Checking it with
+        // assert(next.beforeMove(move, rm) == *this) would result in
+        // infinite recursion.
+        (void)next.doMove(move);
+
+        return next;
+    }
+
+    // Returns whether `sideToMove` can capture en passant on `epSquare`.
+    // Fast path: if no pawn of that side attacks the square the answer is
+    // false; otherwise the decision is delegated to isEpPossibleColdPath().
+    [[nodiscard]] inline bool Position::isEpPossible(Square epSquare, Color sideToMove) const
+    {
+        // Squares from which a pawn of sideToMove would attack epSquare,
+        // obtained as the opposite colour's attacks from epSquare.
+        const auto attackOrigins = bb::pawnAttacks(Bitboard::square(epSquare), !sideToMove);
+        const auto ownPawns = piecesBB(Piece(PieceType::Pawn, sideToMove));
+        const Bitboard pawnsAttackingEpSquare = attackOrigins & ownPawns;
+
+        return pawnsAttackingEpSquare.any()
+            && isEpPossibleColdPath(epSquare, pawnsAttackingEpSquare, sideToMove);
+    }
+
+    // Slow part of isEpPossible(): decides whether at least one pawn in
+    // `pawnsAttackingEpSquare` can capture on `epSquare` without leaving
+    // the king of `sideToMove` attacked by an enemy slider. The ep square
+    // is only worth keeping when the capture can actually be played.
+    //
+    // If we're here the previous move by the other side was a double pawn
+    // move, so our king is either not in check or is attacked only by the
+    // moved pawn - in which case it can be captured by our pawn if that
+    // doesn't create a discovered check on our king. So overall we only
+    // have to check whether our king ends up uncovered to a slider attack.
+    //
+    // Returns false when `pawnsAttackingEpSquare` is empty.
+    [[nodiscard]] inline bool Position::isEpPossibleColdPath(Square epSquare, Bitboard pawnsAttackingEpSquare, Color sideToMove) const
+    {
+        // None of the values below depend on which pawn captures, so they
+        // are computed once instead of once per candidate pawn.
+        const Square ksq = kingSquare(sideToMove);
+
+        const Bitboard bishops = piecesBB(Piece(PieceType::Bishop, !sideToMove));
+        const Bitboard rooks = piecesBB(Piece(PieceType::Rook, !sideToMove));
+        const Bitboard queens = piecesBB(Piece(PieceType::Queen, !sideToMove));
+
+        const Bitboard relevantAttackers = bishops | rooks | queens;
+        const Bitboard pseudoSliderAttacksFromKing = bb::pseudoAttacks<PieceType::Queen>(ksq);
+        const bool noSliderOnKingLines = (relevantAttackers & pseudoSliderAttacksFromKing).isEmpty();
+
+        for (Square sq : pawnsAttackingEpSquare)
+        {
+            if (noSliderOnKingLines)
+            {
+                // No enemy slider shares a line with the king, so no
+                // discovered check is possible.
+                // It's enough that one pawn can capture.
+                return true;
+            }
+
+            // Occupancy after the capture: the capturing pawn leaves sq,
+            // appears on epSquare, and the captured pawn (same file as
+            // epSquare, same rank as the capturer) disappears.
+            const Square capturedPawnSq(epSquare.file(), sq.rank());
+            const Bitboard occupied = ((piecesBB() ^ sq) | epSquare) ^ capturedPawnSq;
+
+            if (!bb::isAttackedBySlider(
+                ksq,
+                bishops,
+                rooks,
+                queens,
+                occupied
+            ))
+            {
+                // It's enough that one pawn can capture.
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    // Clears m_epSquare when it is set but isEpPossible() says the side to
+    // move cannot capture on it.
+    inline void Position::nullifyEpSquareIfNotPossible()
+    {
+        if (m_epSquare == Square::none())
+        {
+            return;
+        }
+
+        if (!isEpPossible(m_epSquare, m_sideToMove))
+        {
+            m_epSquare = Square::none();
+        }
+    }
+
+    namespace uci
+    {
+        [[nodiscard]] inline std::string moveToUci(const Position& pos, const Move& move);
+        [[nodiscard]] inline Move uciToMove(const Position& pos, std::string_view sv);
+
+        // Formats `move` as UCI text: origin square, destination square
+        // and, for promotions, the lowercase symbol of the promoted piece.
+        // For castling moves the destination written is the king's
+        // destination square for `pos.sideToMove()`, not `move.to`.
+        [[nodiscard]] inline std::string moveToUci(const Position& pos, const Move& move)
+        {
+            std::string s;
+
+            parser_bits::appendSquareToString(move.from, s);
+
+            if (move.type == MoveType::Castle)
+            {
+                const CastleType castleType = CastlingTraits::moveCastlingType(move);
+
+                const Square kingDestination = CastlingTraits::kingDestination[pos.sideToMove()][castleType];
+                parser_bits::appendSquareToString(kingDestination, s);
+            }
+            else
+            {
+                parser_bits::appendSquareToString(move.to, s);
+
+                if (move.type == MoveType::Promotion)
+                {
+                    // lowercase piece symbol
+                    s += EnumTraits<PieceType>::toChar(move.promotedPiece.type(), Color::Black);
+                }
+            }
+
+            return s;
+        }
+
+        // Parses UCI move text `sv` in the context of `pos`.
+        //
+        // `sv` must hold at least four characters (two squares); a fifth
+        // character is read as the promotion piece. The move kind is
+        // derived from the position:
+        //  - a king moving more than one file is a castling move,
+        //  - a pawn landing on the current en passant square is an
+        //    en passant capture,
+        //  - everything else is a normal move.
+        // The text is not validated beyond the length assertion.
+        [[nodiscard]] inline Move uciToMove(const Position& pos, std::string_view sv)
+        {
+            // Both squares are read straight from the buffer, so shorter
+            // input would be an out-of-bounds read.
+            assert(sv.size() >= 4);
+
+            const Square from = parser_bits::parseSquare(sv.data());
+            const Square to = parser_bits::parseSquare(sv.data() + 2);
+
+            if (sv.size() == 5)
+            {
+                const PieceType promotedPieceType = *fromChar<PieceType>(sv[4]);
+                return Move::promotion(from, to, Piece(promotedPieceType, pos.sideToMove()));
+            }
+
+            const PieceType movedPieceType = pos.pieceAt(from).type();
+
+            if (
+                movedPieceType == PieceType::King
+                && std::abs(from.file() - to.file()) > 1
+                )
+            {
+                // uci king destinations are on files C or G.
+                const CastleType castleType =
+                    (to.file() == fileG)
+                    ? CastleType::Short
+                    : CastleType::Long;
+
+                return Move::castle(castleType, pos.sideToMove());
+            }
+
+            // Only a pawn can capture en passant. Any other piece that
+            // happens to land on the ep square makes an ordinary move and
+            // must not be tagged as en passant.
+            if (movedPieceType == PieceType::Pawn && pos.epSquare() == to)
+            {
+                return Move::enPassant(from, to);
+            }
+
+            return Move::normal(from, to);
+        }
+    }
+}
+
+namespace binpack
+{
+    constexpr std::size_t KiB = 1024; // bytes in one kibibyte
+    constexpr std::size_t MiB = (1024*KiB); // bytes in one mebibyte
+    constexpr std::size_t GiB = (1024*MiB); // bytes in one gibibyte
+
+    constexpr std::size_t suggestedChunkSize = MiB; // NOTE(review): presumably the chunk size writers aim for - confirm at the use sites
+    constexpr std::size_t maxMovelistSize = 10*KiB; // a safe upper bound
+    constexpr std::size_t maxChunkSize = 100*MiB; // to prevent malformed files from causing huge allocations
+
+    using namespace std::literals;
+
+    namespace nodchip
+    {
+        // This namespace contains modified code from https://github.com/nodchip/Stockfish
+        // which is released under GPL v3 license https://www.gnu.org/licenses/gpl-3.0.html
+
+        using namespace std;
+
+        // A move packed into 16 bits. Layout, from the lowest bit up:
+        //   bits  0-5   destination square
+        //   bits  6-11  origin square
+        //   bits 12-13  promotion index (promoted piece type - Knight)
+        //   bits 14-15  move flag: 0 normal, 1 promotion, 2 en passant,
+        //               3 castle
+        struct StockfishMove
+        {
+            // Encodes `move` into the 16-bit layout described above. The
+            // promotion index is 0 for anything that is not a promotion.
+            [[nodiscard]] static StockfishMove fromMove(chess::Move move)
+            {
+                StockfishMove sfm;
+
+                sfm.m_raw = 0;
+
+                unsigned moveFlag = 0;
+                if (move.type == chess::MoveType::Promotion) moveFlag = 1;
+                else if (move.type == chess::MoveType::EnPassant) moveFlag = 2;
+                else if (move.type == chess::MoveType::Castle) moveFlag = 3;
+
+                unsigned promotionIndex = 0;
+                if (move.type == chess::MoveType::Promotion)
+                {
+                    promotionIndex = static_cast<int>(move.promotedPiece.type()) - static_cast<int>(chess::PieceType::Knight);
+                }
+
+                // Fields are pushed in from the most significant one down.
+                sfm.m_raw |= static_cast<std::uint16_t>(moveFlag);
+                sfm.m_raw <<= 2;
+                sfm.m_raw |= static_cast<std::uint16_t>(promotionIndex);
+                sfm.m_raw <<= 6;
+                sfm.m_raw |= static_cast<int>(move.from);
+                sfm.m_raw <<= 6;
+                sfm.m_raw |= static_cast<int>(move.to);
+
+                return sfm;
+            }
+
+            // Decodes the packed value back into a chess::Move. For a
+            // promotion the piece colour is derived from the destination
+            // rank: rank 8 means White, anything else Black.
+            [[nodiscard]] chess::Move toMove() const
+            {
+                const chess::Square to = static_cast<chess::Square>(m_raw & 0b111111);
+                const chess::Square from = static_cast<chess::Square>((m_raw & (0b111111 << 6)) >> 6);
+
+                const unsigned promotionIndex = (m_raw & (0b11 << 12)) >> 12;
+                const chess::PieceType promotionType = static_cast<chess::PieceType>(static_cast<int>(chess::PieceType::Knight) + promotionIndex);
+
+                const unsigned moveFlag = (m_raw & (0b11 << 14)) >> 14;
+                chess::MoveType type = chess::MoveType::Normal;
+                if (moveFlag == 1) type = chess::MoveType::Promotion;
+                else if (moveFlag == 2) type = chess::MoveType::EnPassant;
+                else if (moveFlag == 3) type = chess::MoveType::Castle;
+
+                if (type == chess::MoveType::Promotion)
+                {
+                    const chess::Color stm = to.rank() == chess::rank8 ? chess::Color::White : chess::Color::Black;
+                    return chess::Move{from, to, type, chess::Piece(promotionType, stm)};
+                }
+
+                return chess::Move{from, to, type};
+            }
+
+            // Formats the move as origin square + destination square,
+            // followed by the lowercase promoted-piece symbol when - and
+            // only when - the move flag marks a promotion.
+            [[nodiscard]] std::string toString() const
+            {
+                const chess::Square to = static_cast<chess::Square>(m_raw & 0b111111);
+                const chess::Square from = static_cast<chess::Square>((m_raw & (0b111111 << 6)) >> 6);
+
+                const unsigned promotionIndex = (m_raw & (0b11 << 12)) >> 12;
+                const chess::PieceType promotionType = static_cast<chess::PieceType>(static_cast<int>(chess::PieceType::Knight) + promotionIndex);
+
+                const unsigned moveFlag = (m_raw & (0b11 << 14)) >> 14;
+
+                std::string r;
+                chess::parser_bits::appendSquareToString(from, r);
+                chess::parser_bits::appendSquareToString(to, r);
+                // The promotion index is a 2-bit offset from Knight, so the
+                // decoded piece type can never be PieceType::None; the move
+                // flag is the only reliable promotion marker.
+                if (moveFlag == 1)
+                {
+                    r += chess::EnumTraits<chess::PieceType>::toChar(promotionType, chess::Color::Black);
+                }
+
+                return r;
+            }
+
+        private:
+            std::uint16_t m_raw;
+        };
+        static_assert(sizeof(StockfishMove) == sizeof(std::uint16_t));
+
+        struct PackedSfen
+        {
+            uint8_t data[32]; // 256-bit buffer for one packed position; SfenPacker::pack zeroes and fills 32 bytes
+        };
+
+        struct PackedSfenValue
+        {
+            // The packed position.
+            PackedSfen sfen;
+
+            // Evaluation value returned from Learner::search()
+            int16_t score;
+
+            // First move of the PV.
+            // Used when measuring the move match rate against the teacher.
+            StockfishMove move;
+
+            // Ply of this position (presumably counted from the start of the game - confirm).
+            uint16_t gamePly;
+
+            // 1 if the side to move in this position ultimately wins the game, -1 if it loses,
+            // 0 if the game is drawn.
+            // Draws are produced by the teacher position generation command gensfen
+            // and are only written if LEARN_GENSFEN_DRAW_RESULT is enabled.
+            int8_t game_result;
+
+            // Files with teacher positions are exchanged between people and machines.
+            // Without this byte the struct size would not be fixed, so pad it to 40 bytes in any environment.
+            uint8_t padding;
+
+            // 32 + 2 + 2 + 2 + 1 + 1 = 40bytes
+        };
+        static_assert(sizeof(PackedSfenValue) == 40);
+        // Bit-level cursor over a caller-owned byte buffer.
+        //
+        // Used for encoding and decoding packed positions. Bit k of the
+        // stream lives in byte k / 8 at bit position k % 8.
+        struct BitStream
+        {
+            // Attaches the stream to `data_` and rewinds the cursor.
+            // Writing only ever sets bits, so the memory is assumed to be
+            // cleared to 0 beforehand.
+            void  set_data(uint8_t* data_)
+            {
+                data = data_;
+                bit_cursor = 0;
+            }
+
+            // The pointer passed to set_data().
+            uint8_t* get_data() const { return data; }
+
+            // Index of the next bit to be read or written.
+            int get_cursor() const { return bit_cursor; }
+
+            // Rewinds the cursor to the first bit.
+            void reset() { bit_cursor = 0; }
+
+            // Writes one bit: 1 if b is non-zero, 0 otherwise.
+            void write_one_bit(int b)
+            {
+                const int position = bit_cursor++;
+                if (b != 0)
+                {
+                    data[position / 8] |= 1 << (position & 7);
+                }
+            }
+
+            // Reads one bit and advances the cursor.
+            int read_one_bit()
+            {
+                const int position = bit_cursor++;
+                return (data[position / 8] >> (position & 7)) & 1;
+            }
+
+            // Writes the n lowest bits of d, lowest bit first.
+            void write_n_bit(int d, int n)
+            {
+                int i = 0;
+                while (i < n)
+                {
+                    write_one_bit(d & (1 << i));
+                    ++i;
+                }
+            }
+
+            // Reads n bits; the inverse of write_n_bit().
+            int read_n_bit(int n)
+            {
+                int result = 0;
+                for (int i = 0; i < n; ++i)
+                {
+                    if (read_one_bit())
+                    {
+                        result |= (1 << i);
+                    }
+                }
+
+                return result;
+            }
+
+        private:
+            // Next bit position to read/write.
+            int bit_cursor;
+
+            // Buffer the bits are stored in; not owned.
+            uint8_t* data;
+        };
+
+
+        // Huffman coding
+        // * is simplified from mini encoding to make conversion easier.
+        //
+        // Huffman Encoding
+        //
+        // Empty  xxxxxxx0
+        // Pawn   xxxx0001 + 1 bit (Color)
+        // Knight xxxx0011 + 1 bit (Color)
+        // Bishop xxxx0101 + 1 bit (Color)
+        // Rook   xxxx0111 + 1 bit (Color)
+        // Queen  xxxx1001 + 1 bit (Color)
+        //
+        // Worst case:
+        // - side to move        1 bit
+        // - 32 empty squares    32 bits
+        // - 30 pieces           150 bits
+        // - 2 kings             12 bits
+        // - castling rights     4 bits
+        // - ep square           7 bits
+        // - rule50              7 bits
+        // - full move           16 bits
+        // - TOTAL               229 bits < 256 bits
+
+        struct HuffmanedPiece
+        {
+            int code; // bit pattern of the code; written lowest bit first by BitStream::write_n_bit
+            int bits; // number of bits in the code
+        };
+
+        // NOTE: Indexed by static_cast<int>(chess::PieceType). Order adjusted for this library because originally NO_PIECE had index 0
+        constexpr HuffmanedPiece huffman_table[] =
+        {
+            {0b0001,4}, // PAWN     1
+            {0b0011,4}, // KNIGHT   3
+            {0b0101,4}, // BISHOP   5
+            {0b0111,4}, // ROOK     7
+            {0b1001,4}, // QUEEN    9
+            {-1,-1},    // KING - unused, kings are stored as squares and never Huffman coded
+            {0b0000,1}, // NO_PIECE 0
+        };
+
+        // Packs a chess::Position into a Huffman-coded bit stream and reads
+        // board pieces back from such a stream.
+        // A packed position is stored in 256 bits (32 bytes).
+        //
+        // Stream layout, in the order written by pack():
+        // Side to move (White = 0, Black = 1) (1bit)
+        // White King Position (6 bits)
+        // Black King Position (6 bits)
+        // Huffman Encoding of the board
+        // Castling availability (1 bit x 4)
+        // En passant square (1 or 1 + 6 bits)
+        // Rule 50 (6 bits)
+        // Full move number (8 bits low + 8 bits high), then bit 6 of rule 50 (1 bit)
+        //
+        // TODO(someone): Rename SFEN to FEN.
+        //
+        struct SfenPacker
+        {
+            // Packs `pos` into the 32 bytes `data` points to; `data` must be set by the caller first.
+            void pack(const chess::Position& pos)
+            {
+                memset(data, 0, 32 /* 256bit */);
+                stream.set_data(data);
+
+                // Turn:
+                // side to move.
+                stream.write_one_bit((int)(pos.sideToMove()));
+
+                // King squares come first:
+                // White king and black king, 6 bits for each.
+                stream.write_n_bit(static_cast<int>(pos.kingSquare(chess::Color::White)), 6);
+                stream.write_n_bit(static_cast<int>(pos.kingSquare(chess::Color::Black)), 6);
+
+                // Write the pieces on the board other than the kings.
+                for (chess::Rank r = chess::rank8; r >= chess::rank1; --r)
+                {
+                    for (chess::File f = chess::fileA; f <= chess::fileH; ++f)
+                    {
+                        chess::Piece pc = pos.pieceAt(chess::Square(f, r));
+                        if (pc.type() == chess::PieceType::King)
+                            continue;
+                        write_board_piece_to_stream(pc);
+                    }
+                }
+
+                // TODO(someone): Support chess960.
+                auto cr = pos.castlingRights();
+                stream.write_one_bit(contains(cr, chess::CastlingRights::WhiteKingSide));
+                stream.write_one_bit(contains(cr, chess::CastlingRights::WhiteQueenSide));
+                stream.write_one_bit(contains(cr, chess::CastlingRights::BlackKingSide));
+                stream.write_one_bit(contains(cr, chess::CastlingRights::BlackQueenSide));
+
+                if (pos.epSquare() == chess::Square::none()) {
+                    stream.write_one_bit(0);
+                }
+                else {
+                    stream.write_one_bit(1);
+                    stream.write_n_bit(static_cast<int>(pos.epSquare()), 6);
+                }
+
+                stream.write_n_bit(pos.rule50Counter(), 6);
+
+                stream.write_n_bit(pos.fullMove(), 8);
+
+                // Write the high 8 bits of the full move number. This is a fix
+                // for the limited range of the 8-bit counter written above.
+                // This is backwards compatible.
+                stream.write_n_bit(pos.fullMove() >> 8, 8);
+
+                // Write the highest bit of rule50 at the end. This is a backwards
+                // compatible fix for rule50 having only 6 bits stored.
+                // This bit is just ignored by the old parsers.
+                stream.write_n_bit(pos.rule50Counter() >> 6, 1);
+
+                assert(stream.get_cursor() <= 256);
+            }
+
+            // Destination buffer for pack() (256bit = 32bytes).
+            // Not owned; pack() zeroes it before writing.
+            uint8_t *data; // uint8_t[32];
+
+            BitStream stream;
+
+            // Output the board pieces to stream.
+            void write_board_piece_to_stream(chess::Piece pc)
+            {
+                // piece type
+                chess::PieceType pr = pc.type();
+                auto c = huffman_table[static_cast<int>(pr)];
+                stream.write_n_bit(c.code, c.bits);
+
+                if (pc == chess::Piece::none())
+                    return;
+
+                // one extra bit for the piece colour
+                stream.write_one_bit(static_cast<int>(pc.color()));
+            }
+
+            // Read one board piece from stream
+            [[nodiscard]] chess::Piece read_board_piece_from_stream()
+            {
+                int pr = static_cast<int>(chess::PieceType::None);
+                int code = 0, bits = 0;
+                while (true)
+                {
+                    code |= stream.read_one_bit() << bits;
+                    ++bits;
+
+                    assert(bits <= 6);
+
+                    for (pr = static_cast<int>(chess::PieceType::Pawn); pr <= static_cast<int>(chess::PieceType::None); ++pr)
+                        if (huffman_table[pr].code == code
+                            && huffman_table[pr].bits == bits)
+                            goto Found;
+                }
+            Found:;
+                if (pr == static_cast<int>(chess::PieceType::None))
+                    return chess::Piece::none();
+
+                // one extra bit for the piece colour
+                chess::Color c = (chess::Color)stream.read_one_bit();
+
+                return chess::Piece(static_cast<chess::PieceType>(pr), c);
+            }
+        };
+
+
+        // Decodes a PackedSfen into a chess::Position by reading the bit
+        // stream in the same order SfenPacker::pack() writes it. The input
+        // is not validated beyond the debug-only assertions.
+        [[nodiscard]] inline chess::Position pos_from_packed_sfen(const PackedSfen& sfen)
+        {
+            SfenPacker packer;
+            auto& stream = packer.stream;
+            stream.set_data(const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(&sfen)));
+
+            chess::Position pos{};
+
+            // Active color
+            pos.setSideToMove((chess::Color)stream.read_one_bit());
+
+            // Kings first: white king square, then black king square.
+            pos.place(chess::Piece(chess::PieceType::King, chess::Color::White), static_cast<chess::Square>(stream.read_n_bit(6)));
+            pos.place(chess::Piece(chess::PieceType::King, chess::Color::Black), static_cast<chess::Square>(stream.read_n_bit(6)));
+
+            // Piece placement
+            for (chess::Rank r = chess::rank8; r >= chess::rank1; --r)
+            {
+                for (chess::File f = chess::fileA; f <= chess::fileH; ++f)
+                {
+                    auto sq = chess::Square(f, r);
+
+                    // Squares that already hold a king have no entry in the stream.
+                    chess::Piece pc;
+                    if (pos.pieceAt(sq).type() != chess::PieceType::King)
+                    {
+                        assert(pos.pieceAt(sq) == chess::Piece::none());
+                        pc = packer.read_board_piece_from_stream();
+                    }
+                    else
+                    {
+                        pc = pos.pieceAt(sq);
+                    }
+
+                    // There may be no pieces, so skip in that case.
+                    if (pc == chess::Piece::none())
+                        continue;
+
+                    if (pc.type() != chess::PieceType::King)
+                    {
+                        pos.place(pc, sq);
+                    }
+
+                    assert(stream.get_cursor() <= 256);
+                }
+            }
+
+            // Castling availability.
+            chess::CastlingRights cr = chess::CastlingRights::None;
+            if (stream.read_one_bit()) {
+                cr |= chess::CastlingRights::WhiteKingSide;
+            }
+            if (stream.read_one_bit()) {
+                cr |= chess::CastlingRights::WhiteQueenSide;
+            }
+            if (stream.read_one_bit()) {
+                cr |= chess::CastlingRights::BlackKingSide;
+            }
+            if (stream.read_one_bit()) {
+                cr |= chess::CastlingRights::BlackQueenSide;
+            }
+            pos.setCastlingRights(cr);
+
+            // En passant square: a flag bit, followed by the 6-bit square only when the flag is set.
+            if (stream.read_one_bit()) {
+                chess::Square ep_square = static_cast<chess::Square>(stream.read_n_bit(6));
+                pos.setEpSquare(ep_square);
+            }
+
+            // Halfmove clock
+            std::uint8_t rule50 = stream.read_n_bit(6);
+
+            // Fullmove number
+            std::uint16_t fullmove = stream.read_n_bit(8);
+
+            // Fullmove number, high bits
+            // This was added as a fix for fullmove clock
+            // overflowing at 256. This change is backwards compatible.
+            fullmove |= stream.read_n_bit(8) << 8;
+
+            // Read the highest bit of rule50. This was added as a fix for rule50
+            // counter having only 6 bits stored.
+            // In older entries this will just be a zero bit.
+            rule50 |= stream.read_n_bit(1) << 6;
+
+            pos.setFullMove(fullmove);
+            pos.setRule50Counter(rule50);
+
+            assert(stream.get_cursor() <= 256);
+
+            return pos;
+        }
+    }
+
+    // A file made of consecutive chunks, accessed through a std::fstream.
+    // Every chunk is an 8-byte header - the magic "BINP" followed by the
+    // payload size as a little-endian 32-bit integer - and then the payload.
+    struct CompressedTrainingDataFile
+    {
+        struct Header
+        {
+            std::uint32_t chunkSize; // payload size in bytes, header excluded
+        };
+
+        // Opens `path` in binary read/write mode, combined with `om`
+        // (append by default).
+        CompressedTrainingDataFile(std::string path, std::ios_base::openmode om = std::ios_base::app) :
+            m_path(std::move(path)),
+            m_file(m_path, std::ios_base::binary | std::ios_base::in | std::ios_base::out | om)
+        {
+            // Necessary for MAC because app mode makes it put the reading
+            // head at the end.
+            m_file.seekg(0);
+        }
+
+        // Writes one chunk: a header carrying `size`, then `size` bytes
+        // taken from `data`.
+        void append(const char* data, std::uint32_t size)
+        {
+            writeChunkHeader({size});
+            m_file.write(data, size);
+        }
+
+        // Returns true when the stream is in a good state and at least one
+        // more byte can be read from it.
+        [[nodiscard]] bool hasNextChunk()
+        {
+            if (!m_file)
+            {
+                return false;
+            }
+
+            // peek() sets eofbit when there is nothing left to read.
+            m_file.peek();
+            return !m_file.eof();
+        }
+
+        // Reads the next chunk header and returns the payload that follows
+        // it. Header problems are only caught by assertions, i.e. in debug
+        // builds.
+        [[nodiscard]] std::vector<unsigned char> readNextChunk()
+        {
+            auto size = readChunkHeader().chunkSize;
+            std::vector<unsigned char> data(size);
+            m_file.read(reinterpret_cast<char*>(data.data()), size);
+            return data;
+        }
+
+    private:
+        std::string m_path;
+        std::fstream m_file;
+
+        // Writes the magic "BINP" and the chunk size, least significant
+        // byte first.
+        void writeChunkHeader(Header h)
+        {
+            unsigned char header[8];
+            header[0] = 'B';
+            header[1] = 'I';
+            header[2] = 'N';
+            header[3] = 'P';
+            header[4] = static_cast<unsigned char>(h.chunkSize & 0xFF);
+            header[5] = static_cast<unsigned char>((h.chunkSize >> 8) & 0xFF);
+            header[6] = static_cast<unsigned char>((h.chunkSize >> 16) & 0xFF);
+            header[7] = static_cast<unsigned char>((h.chunkSize >> 24) & 0xFF);
+            m_file.write(reinterpret_cast<const char*>(header), 8);
+        }
+
+        // Reads and decodes one chunk header. A wrong magic or a size above
+        // maxChunkSize trips an assertion.
+        [[nodiscard]] Header readChunkHeader()
+        {
+            // Zero-initialized so that a short or failed read never leaves
+            // indeterminate bytes to be inspected below.
+            unsigned char header[8] = {};
+            m_file.read(reinterpret_cast<char*>(header), 8);
+            if (header[0] != 'B' || header[1] != 'I' || header[2] != 'N' || header[3] != 'P')
+            {
+                assert(false);
+                // throw std::runtime_error("Invalid binpack file or chunk.");
+            }
+
+            // Widen to an unsigned 32-bit type before shifting: an unsigned
+            // char promotes to int, and shifting a byte >= 0x80 left by 24
+            // would overflow a signed int.
+            const std::uint32_t size =
+                static_cast<std::uint32_t>(header[4])
+                | (static_cast<std::uint32_t>(header[5]) << 8)
+                | (static_cast<std::uint32_t>(header[6]) << 16)
+                | (static_cast<std::uint32_t>(header[7]) << 24);
+
+            if (size > maxChunkSize)
+            {
+                assert(false);
+                // throw std::runtime_error("Chunks size larger than supported. Malformed file?");
+            }
+
+            return { size };
+        }
+    };
+
+    // Maps a signed 16-bit value onto an unsigned one so that values of
+    // small magnitude get small codes: 0, -1, 1, -2, 2, ... become
+    // 0, 1, 2, 3, 4, ... The sign ends up in the lowest bit.
+    // Inverse of unsignedToSigned().
+    [[nodiscard]] inline std::uint16_t signedToUnsigned(std::int16_t a)
+    {
+        std::uint16_t bits = 0;
+        std::memcpy(&bits, &a, sizeof(bits));
+
+        // For negative inputs flip every bit except the sign bit.
+        const bool isNegative = (bits & 0x8000) != 0;
+        if (isNegative)
+        {
+            bits ^= 0x7FFF;
+        }
+
+        // Rotate left by one: the sign bit moves to bit 0.
+        return static_cast<std::uint16_t>((bits << 1) | (bits >> 15));
+    }
+
+    // Inverse of signedToUnsigned(): 0, 1, 2, 3, 4, ... become
+    // 0, -1, 1, -2, 2, ...
+    [[nodiscard]] inline std::int16_t unsignedToSigned(std::uint16_t r)
+    {
+        // Rotate right by one: bit 0 moves back into the sign bit.
+        std::uint16_t bits = static_cast<std::uint16_t>((r << 15) | (r >> 1));
+
+        // For negative values flip every bit except the sign bit.
+        const bool isNegative = (bits & 0x8000) != 0;
+        if (isNegative)
+        {
+            bits ^= 0x7FFF;
+        }
+
+        std::int16_t value = 0;
+        std::memcpy(&value, &bits, sizeof(value));
+        return value;
+    }
+
+    struct TrainingDataEntry
+    {
+        chess::Position pos; // position this entry describes
+        chess::Move move; // move stored for pos; isValid() checks that it is legal there
+        std::int16_t score; // evaluation stored for the position
+        std::uint16_t ply; // game ply of the position
+        std::int16_t result; // game result; flips sign between consecutive entries (see isContinuation)
+
+        [[nodiscard]] bool isValid() const
+        {
+            return pos.isMoveLegal(move);
+        }
+    };
+
+    // Converts a nodchip-format record into a TrainingDataEntry: the
+    // position and the move are decoded, the remaining fields are copied.
+    [[nodiscard]] inline TrainingDataEntry packedSfenValueToTrainingDataEntry(const nodchip::PackedSfenValue& psv)
+    {
+        TrainingDataEntry entry;
+
+        // Plain scalar fields.
+        entry.score = psv.score;
+        entry.ply = psv.gamePly;
+        entry.result = psv.game_result;
+
+        // Fields that need decoding.
+        entry.move = psv.move.toMove();
+        entry.pos = nodchip::pos_from_packed_sfen(psv.sfen);
+
+        return entry;
+    }
+
+    // Converts a TrainingDataEntry into a nodchip-format record: the
+    // position is packed into the record's sfen buffer, the move is
+    // re-encoded and the remaining fields are copied.
+    [[nodiscard]] inline nodchip::PackedSfenValue trainingDataEntryToPackedSfenValue(const TrainingDataEntry& plain)
+    {
+        nodchip::PackedSfenValue packed;
+
+        packed.score = plain.score;
+        packed.gamePly = plain.ply;
+        packed.game_result = plain.result;
+        packed.padding = 0xff; // for consistency with the .bin format.
+        packed.move = nodchip::StockfishMove::fromMove(plain.move);
+
+        // The packer writes straight into the record's sfen bytes.
+        nodchip::SfenPacker packer;
+        packer.data = reinterpret_cast<uint8_t*>(&packed.sfen);
+        packer.pack(plain.pos);
+
+        return packed;
+    }
+
+    // Returns true when `rhs` directly follows `lhs` in the same game:
+    // the result is negated, the ply is one higher, and playing lhs.move
+    // on lhs.pos gives rhs.pos. The cheap checks run first.
+    [[nodiscard]] inline bool isContinuation(const TrainingDataEntry& lhs, const TrainingDataEntry& rhs)
+    {
+        if (lhs.result != -rhs.result)
+        {
+            return false;
+        }
+
+        if (lhs.ply + 1 != rhs.ply)
+        {
+            return false;
+        }
+
+        return lhs.pos.afterMove(lhs.move) == rhs.pos;
+    }
+
+    struct PackedTrainingDataEntry
+    {
+        unsigned char bytes[32]; // raw 32-byte record; NOTE(review): field layout is defined by the packing code, not visible here
+    };
+
+    // Returns chess::util::usedBits(value - 1), except that 0 is handled
+    // separately and yields 0, so value - 1 never wraps around.
+    [[nodiscard]] inline std::size_t usedBitsSafe(std::size_t value)
+    {
+        if (value != 0)
+        {
+            return chess::util::usedBits(value - 1);
+        }
+
+        return 0;
+    }
+
+    static constexpr std::size_t scoreVleBlockSize = 4; // NOTE(review): presumably the payload bits per block when scores are variable-length encoded - confirm at the extractVle16 call sites
+
+    // Decoder for the bit-packed list of (move, score) pairs that follows a
+    // packed stem entry. It keeps a running copy of the current entry and
+    // advances it by one ply on every nextEntry() call.
+    struct PackedMoveScoreListReader
+    {
+        // Most recently produced entry (initially the stem entry).
+        TrainingDataEntry entry;
+        // Number of plies stored in the movetext.
+        std::uint16_t numPlies;
+        // Start of the packed bitstream; the reader only stores the pointer.
+        unsigned char* movetext;
+
+        // The score predictor starts at the negated stem score, matching
+        // what PackedMoveScoreList::clear sets up on the encoding side.
+        PackedMoveScoreListReader(const TrainingDataEntry& entry_, unsigned char* movetext_, std::uint16_t numPlies_) :
+            entry(entry_),
+            numPlies(numPlies_),
+            movetext(movetext_),
+            m_lastScore(-entry_.score)
+        {
+
+        }
+
+        // Returns the next `count` bits of the stream, taking the not yet
+        // consumed high-order bits of the current byte first and continuing
+        // into the next byte if needed. A count of 0 reads nothing.
+        // NOTE(review): assumes count <= 8; nothing here checks that, nor
+        // that the read stays inside the movetext buffer.
+        [[nodiscard]] std::uint8_t extractBitsLE8(std::size_t count)
+        {
+            if (count == 0) return 0;
+
+            // The previous byte is exhausted; move on to the next one lazily.
+            if (m_readBitsLeft == 0)
+            {
+                m_readOffset += 1;
+                m_readBitsLeft = 8;
+            }
+
+            // Shift out the bits already consumed, then keep the top `count` bits.
+            const std::uint8_t byte = movetext[m_readOffset] << (8 - m_readBitsLeft);
+            std::uint8_t bits = byte >> (8 - count);
+
+            if (count > m_readBitsLeft)
+            {
+                // The value straddles a byte boundary: fetch the missing low
+                // bits from the top of the following byte.
+                const auto spillCount = count - m_readBitsLeft;
+                bits |= movetext[m_readOffset + 1] >> (8 - spillCount);
+
+                m_readBitsLeft += 8;
+                m_readOffset += 1;
+            }
+
+            m_readBitsLeft -= count;
+
+            return bits;
+        }
+
+        // Reads a variable-length value made of (blockSize + 1)-bit blocks.
+        // The low blockSize bits of each block are payload, stored least
+        // significant block first; the extra top bit says whether another
+        // block follows.
+        [[nodiscard]] std::uint16_t extractVle16(std::size_t blockSize)
+        {
+            auto mask = (1 << blockSize) - 1;
+            std::uint16_t v = 0;
+            std::size_t offset = 0;
+            for(;;)
+            {
+                std::uint16_t block = extractBitsLE8(blockSize + 1);
+                v |= ((block & mask) << offset);
+                // Continuation bit clear: this was the last block.
+                if (!(block >> blockSize))
+                {
+                    break;
+                }
+
+                offset += blockSize;
+            }
+            return v;
+        }
+
+        // Plays the current move on the stored position, decodes the next
+        // move and score, bumps the ply, negates the result and returns a
+        // copy of the updated entry.
+        [[nodiscard]] TrainingDataEntry nextEntry()
+        {
+            entry.pos.doMove(entry.move);
+            auto [move, score] = nextMoveScore(entry.pos);
+            entry.move = move;
+            entry.score = score;
+            entry.ply += 1;
+            entry.result = -entry.result;
+            return entry;
+        }
+
+        // True while fewer than numPlies plies have been decoded.
+        [[nodiscard]] bool hasNext() const
+        {
+            return m_numReadPlies < numPlies;
+        }
+
+        // Decodes one (move, score) pair for `pos`. The move is stored as a
+        // piece index followed by a move index; the width of each field is
+        // derived from `pos`, so `pos` must be the position the pair was
+        // encoded for.
+        [[nodiscard]] std::pair<chess::Move, std::int16_t> nextMoveScore(const chess::Position& pos)
+        {
+            chess::Move move;
+            std::int16_t score;
+
+            const chess::Color sideToMove = pos.sideToMove();
+            const chess::Bitboard ourPieces = pos.piecesBB(sideToMove);
+            const chess::Bitboard theirPieces = pos.piecesBB(!sideToMove);
+            const chess::Bitboard occupied = ourPieces | theirPieces;
+
+            // pieceId picks the origin square out of the side to move's pieces.
+            const auto pieceId = extractBitsLE8(usedBitsSafe(ourPieces.count()));
+            const auto from = chess::Square(chess::nthSetBitIndex(ourPieces.bits(), pieceId));
+
+            const auto pt = pos.pieceAt(from).type();
+            switch (pt)
+            {
+            case chess::PieceType::Pawn:
+            {
+                // A pawn standing on promotionRank promotes with its next move.
+                const chess::Rank promotionRank = pos.sideToMove() == chess::Color::White ? chess::rank7 : chess::rank2;
+                const chess::Rank startRank = pos.sideToMove() == chess::Color::White ? chess::rank2 : chess::rank7;
+                const auto forward = sideToMove == chess::Color::White ? chess::FlatSquareOffset(0, 1) : chess::FlatSquareOffset(0, -1);
+
+                const chess::Square epSquare = pos.epSquare();
+
+                // Capture targets: enemy pieces plus the en passant square, if any.
+                chess::Bitboard attackTargets = theirPieces;
+                if (epSquare != chess::Square::none())
+                {
+                    attackTargets |= epSquare;
+                }
+
+                chess::Bitboard destinations = chess::bb::pawnAttacks(chess::Bitboard::square(from), sideToMove) & attackTargets;
+
+                // Single push, and double push from the start rank, when unobstructed.
+                const chess::Square sqForward = from + forward;
+                if (!occupied.isSet(sqForward))
+                {
+                    destinations |= sqForward;
+                    if (
+                        from.rank() == startRank
+                        && !occupied.isSet(sqForward + forward)
+                        )
+                    {
+                        destinations |= sqForward + forward;
+                    }
+                }
+
+                const auto destinationsCount = destinations.count();
+                if (from.rank() == promotionRank)
+                {
+                    // Promotions: moveId = destination index * 4 + offset of
+                    // the promoted piece type from Knight.
+                    const auto moveId = extractBitsLE8(usedBitsSafe(destinationsCount * 4ull));
+                    const chess::Piece promotedPiece = chess::Piece(
+                        chess::fromOrdinal<chess::PieceType>(ordinal(chess::PieceType::Knight) + (moveId % 4ull)),
+                        sideToMove
+                    );
+                    const auto to = chess::Square(chess::nthSetBitIndex(destinations.bits(), moveId / 4ull));
+
+                    move = chess::Move::promotion(from, to, promotedPiece);
+                    break;
+                }
+                else
+                {
+                    auto moveId = extractBitsLE8(usedBitsSafe(destinationsCount));
+                    const auto to = chess::Square(chess::nthSetBitIndex(destinations.bits(), moveId));
+                    // A pawn move onto the en passant square is an en passant capture.
+                    if (to == epSquare)
+                    {
+                        move = chess::Move::enPassant(from, to);
+                        break;
+                    }
+                    else
+                    {
+                        move = chess::Move::normal(from, to);
+                        break;
+                    }
+                }
+            }
+            case chess::PieceType::King:
+            {
+                const chess::CastlingRights ourCastlingRightsMask =
+                    sideToMove == chess::Color::White
+                    ? chess::CastlingRights::White
+                    : chess::CastlingRights::Black;
+
+                const chess::CastlingRights castlingRights = pos.castlingRights();
+
+                const chess::Bitboard attacks = chess::bb::pseudoAttacks<chess::PieceType::King>(from) & ~ourPieces;
+                const std::size_t attacksSize = attacks.count();
+                const std::size_t numCastlings = chess::intrin::popcount(ordinal(castlingRights & ourCastlingRightsMask));
+
+                // Ids [0, attacksSize) are ordinary king steps; ids from
+                // attacksSize upwards are the available castling moves.
+                const auto moveId = extractBitsLE8(usedBitsSafe(attacksSize + numCastlings));
+
+                if (moveId >= attacksSize)
+                {
+                    const std::size_t idx = moveId - attacksSize;
+
+                    // The first castling id means long castling only if the
+                    // long right is present; every other case is short.
+                    const chess::CastleType castleType =
+                        idx == 0
+                        && chess::contains(castlingRights, chess::CastlingTraits::castlingRights[sideToMove][chess::CastleType::Long])
+                        ? chess::CastleType::Long
+                        : chess::CastleType::Short;
+
+                    move = chess::Move::castle(castleType, sideToMove);
+                    break;
+                }
+                else
+                {
+                    auto to = chess::Square(chess::nthSetBitIndex(attacks.bits(), moveId));
+                    move = chess::Move::normal(from, to);
+                    break;
+                }
+                // Not reached: both branches above break.
+                break;
+            }
+            default:
+            {
+                // All other piece types: index into the attack set for the
+                // current occupancy, excluding squares held by our own pieces.
+                const chess::Bitboard attacks = chess::bb::attacks(pt, from, occupied) & ~ourPieces;
+                const auto moveId = extractBitsLE8(usedBitsSafe(attacks.count()));
+                auto to = chess::Square(chess::nthSetBitIndex(attacks.bits(), moveId));
+                move = chess::Move::normal(from, to);
+                break;
+            }
+            }
+
+            // The score is stored as a delta against the negated previous score.
+            score = m_lastScore + unsignedToSigned(extractVle16(scoreVleBlockSize));
+            m_lastScore = -score;
+
+            ++m_numReadPlies;
+
+            return {move, score};
+        }
+
+        // Number of movetext bytes touched so far; a partially read byte
+        // counts as a whole byte.
+        [[nodiscard]] std::size_t numReadBytes()
+        {
+            return m_readOffset + (m_readBitsLeft != 8);
+        }
+
+    private:
+        // Unread bits remaining in movetext[m_readOffset].
+        std::size_t m_readBitsLeft = 8;
+        // Index of the byte currently being read.
+        std::size_t m_readOffset = 0;
+        // Negated score of the previous ply; base for the next score delta.
+        std::int16_t m_lastScore = 0;
+        // Plies decoded so far.
+        std::uint16_t m_numReadPlies = 0;
+    };
+
+    // Encoder for the bit-packed list of (move, score) pairs that follows a
+    // packed stem entry; PackedMoveScoreListReader is the matching decoder.
+    struct PackedMoveScoreList
+    {
+        // Number of (move, score) pairs added since the last clear().
+        std::uint16_t numPlies = 0;
+        // The packed bitstream.
+        std::vector<unsigned char> movetext;
+
+        // Starts a new, empty list following stem entry `e`. The score
+        // predictor is seeded with the negated stem score.
+        void clear(const TrainingDataEntry& e)
+        {
+            numPlies = 0;
+            movetext.clear();
+            m_bitsLeft = 0;
+            m_lastScore = -e.score;
+        }
+
+        // Appends `count` bits to the stream, filling each byte from its
+        // high-order end. A count of 0 writes nothing.
+        // NOTE(review): assumes count <= 8 and that `bits` has no bits set
+        // above the low `count` ones; neither is checked here.
+        void addBitsLE8(std::uint8_t bits, std::size_t count)
+        {
+            if (count == 0) return;
+
+            if (m_bitsLeft == 0)
+            {
+                // No partially filled byte: start a new one.
+                movetext.emplace_back(bits << (8 - count));
+                m_bitsLeft = 8;
+            }
+            else if (count <= m_bitsLeft)
+            {
+                // Everything fits into the free low bits of the last byte.
+                movetext.back() |= bits << (m_bitsLeft - count);
+            }
+            else
+            {
+                // Split: the high part completes the last byte, the low
+                // spillCount bits go to the top of a fresh byte.
+                const auto spillCount = count - m_bitsLeft;
+                movetext.back() |= bits >> spillCount;
+                movetext.emplace_back(bits << (8 - spillCount));
+                m_bitsLeft += 8;
+            }
+
+            m_bitsLeft -= count;
+        }
+
+        // Appends `v` as a sequence of (blockSize + 1)-bit blocks, least
+        // significant block first. The top bit of each block is set when
+        // more blocks follow.
+        void addBitsVle16(std::uint16_t v, std::size_t blockSize)
+        {
+            auto mask = (1 << blockSize) - 1;
+            for(;;)
+            {
+                // (v > mask) is the continuation flag: more payload remains.
+                std::uint8_t block = (v & mask) | ((v > mask) << blockSize);
+                addBitsLE8(block, blockSize + 1);
+                v >>= blockSize;
+                if (v == 0) break;
+            }
+        }
+
+
+        // Appends one (move, score) pair, where `move` is played in `pos`.
+        // The move is written as a piece index followed by a move index,
+        // each using only as many bits as the number of alternatives needs;
+        // the score is written as a variable-length delta.
+        void addMoveScore(const chess::Position& pos, chess::Move move, std::int16_t score)
+        {
+            const chess::Color sideToMove = pos.sideToMove();
+            const chess::Bitboard ourPieces = pos.piecesBB(sideToMove);
+            const chess::Bitboard theirPieces = pos.piecesBB(!sideToMove);
+            const chess::Bitboard occupied = ourPieces | theirPieces;
+
+            // Index of the moving piece: how many of our pieces lie before move.from.
+            const std::uint8_t pieceId = (pos.piecesBB(sideToMove) & chess::bb::before(move.from)).count();
+            std::size_t numMoves = 0;
+            int moveId = 0;
+            const auto pt = pos.pieceAt(move.from).type();
+            switch (pt)
+            {
+            case chess::PieceType::Pawn:
+            {
+                // A pawn standing on secondToLastRank promotes with its next move.
+                const chess::Rank secondToLastRank = pos.sideToMove() == chess::Color::White ? chess::rank7 : chess::rank2;
+                const chess::Rank startRank = pos.sideToMove() == chess::Color::White ? chess::rank2 : chess::rank7;
+                const auto forward = sideToMove == chess::Color::White ? chess::FlatSquareOffset(0, 1) : chess::FlatSquareOffset(0, -1);
+
+                const chess::Square epSquare = pos.epSquare();
+
+                // Capture targets: enemy pieces plus the en passant square, if any.
+                chess::Bitboard attackTargets = theirPieces;
+                if (epSquare != chess::Square::none())
+                {
+                    attackTargets |= epSquare;
+                }
+
+                chess::Bitboard destinations = chess::bb::pawnAttacks(chess::Bitboard::square(move.from), sideToMove) & attackTargets;
+
+                // Single push, and double push from the start rank, when unobstructed.
+                const chess::Square sqForward = move.from + forward;
+                if (!occupied.isSet(sqForward))
+                {
+                    destinations |= sqForward;
+
+                    if (
+                        move.from.rank() == startRank
+                        && !occupied.isSet(sqForward + forward)
+                        )
+                    {
+                        destinations |= sqForward + forward;
+                    }
+                }
+
+                moveId = (destinations & chess::bb::before(move.to)).count();
+                numMoves = destinations.count();
+                if (move.from.rank() == secondToLastRank)
+                {
+                    // Promotions: four ids per destination, ordered by the
+                    // promoted piece type's offset from Knight.
+                    const auto promotionIndex = (ordinal(move.promotedPiece.type()) - ordinal(chess::PieceType::Knight));
+                    moveId = moveId * 4 + promotionIndex;
+                    numMoves *= 4;
+                }
+
+                break;
+            }
+            case chess::PieceType::King:
+            {
+                const chess::CastlingRights ourCastlingRightsMask =
+                    sideToMove == chess::Color::White
+                    ? chess::CastlingRights::White
+                    : chess::CastlingRights::Black;
+
+                const chess::CastlingRights castlingRights = pos.castlingRights();
+
+                const chess::Bitboard attacks = chess::bb::pseudoAttacks<chess::PieceType::King>(move.from) & ~ourPieces;
+                const auto attacksSize = attacks.count();
+                const auto numCastlingRights = chess::intrin::popcount(ordinal(castlingRights & ourCastlingRightsMask));
+
+                // Alternatives: ordinary king steps followed by one id per castling right.
+                numMoves += attacksSize;
+                numMoves += numCastlingRights;
+
+                if (move.type == chess::MoveType::Castle)
+                {
+                    const auto longCastlingRights = chess::CastlingTraits::castlingRights[sideToMove][chess::CastleType::Long];
+
+                    // Start one below the first castling id; the increments
+                    // below yield attacksSize for the first available
+                    // castling and attacksSize + 1 for short castling when
+                    // the long right is also present.
+                    moveId = attacksSize - 1;
+
+                    if (chess::contains(castlingRights, longCastlingRights))
+                    {
+                        // We have to add one no matter if it's the used one or not.
+                        moveId += 1;
+                    }
+
+                    if (chess::CastlingTraits::moveCastlingType(move) == chess::CastleType::Short)
+                    {
+                        moveId += 1;
+                    }
+                }
+                else
+                {
+                    moveId = (attacks & chess::bb::before(move.to)).count();
+                }
+                break;
+            }
+            default:
+            {
+                // All other piece types: index into the attack set for the
+                // current occupancy, excluding squares held by our own pieces.
+                const chess::Bitboard attacks = chess::bb::attacks(pt, move.from, occupied) & ~ourPieces;
+
+                moveId = (attacks & chess::bb::before(move.to)).count();
+                numMoves = attacks.count();
+            }
+            }
+
+            const std::size_t numPieces = ourPieces.count();
+            addBitsLE8(pieceId, usedBitsSafe(numPieces));
+            addBitsLE8(moveId, usedBitsSafe(numMoves));
+
+            // The score is stored as a delta against the negated previous score.
+            std::uint16_t scoreDelta = signedToUnsigned(score - m_lastScore);
+            addBitsVle16(scoreDelta, scoreVleBlockSize);
+            m_lastScore = -score;
+
+            ++numPlies;
+        }
+
+    private:
+        // Free bits remaining in movetext.back(); 0 when a new byte is needed.
+        std::size_t m_bitsLeft = 0;
+        // Negated score of the previous ply; base for the next score delta.
+        std::int16_t m_lastScore = 0;
+    };
+
+
+    // Serializes a TrainingDataEntry into the fixed 32-byte packed layout:
+    // the compressed position, the compressed move, then three big-endian
+    // 16-bit words:
+    //   - the score, mapped through signedToUnsigned,
+    //   - the ply in the low 14 bits with the result (mapped through
+    //     signedToUnsigned) in the top 2 bits,
+    //   - the position's rule50 counter.
+    // unpackEntry is the inverse. A ply that does not fit into 14 bits is
+    // truncated to its low 14 bits.
+    [[nodiscard]] inline PackedTrainingDataEntry packEntry(const TrainingDataEntry& plain)
+    {
+        PackedTrainingDataEntry packed;
+
+        auto compressedPos = plain.pos.compress();
+        auto compressedMove = plain.move.compress();
+
+        // The two compressed blobs plus the three 16-bit words fill the entry exactly.
+        static_assert(sizeof(compressedPos) + sizeof(compressedMove) + 6 == sizeof(PackedTrainingDataEntry));
+
+        std::size_t offset = 0;
+        compressedPos.writeToBigEndian(packed.bytes);
+        offset += sizeof(compressedPos);
+        compressedMove.writeToBigEndian(packed.bytes + offset);
+        offset += sizeof(compressedMove);
+        // unpackEntry takes the ply from the low 14 bits and the result from
+        // the top 2. Mask the ply so that an oversized value cannot spill
+        // into, and corrupt, the result bits.
+        std::uint16_t pr = (plain.ply & 0x3FFF) | (signedToUnsigned(plain.result) << 14);
+        packed.bytes[offset++] = signedToUnsigned(plain.score) >> 8;
+        packed.bytes[offset++] = signedToUnsigned(plain.score);
+        packed.bytes[offset++] = pr >> 8;
+        packed.bytes[offset++] = pr;
+        packed.bytes[offset++] = plain.pos.rule50Counter() >> 8;
+        packed.bytes[offset++] = plain.pos.rule50Counter();
+
+        return packed;
+    }
+
+    // Rebuilds a TrainingDataEntry from its 32-byte packed form: compressed
+    // position, compressed move, then big-endian 16-bit words holding the
+    // score, ply (low 14 bits) with result (top 2 bits), and the rule50
+    // counter. The decoded ply and rule50 counter are also written into the
+    // position.
+    [[nodiscard]] inline TrainingDataEntry unpackEntry(const PackedTrainingDataEntry& packed)
+    {
+        TrainingDataEntry entry;
+
+        auto compressedPos = chess::CompressedPosition::readFromBigEndian(packed.bytes);
+        entry.pos = compressedPos.decompress();
+
+        // `cursor` walks over the remaining fields one by one.
+        const unsigned char* cursor = packed.bytes + sizeof(compressedPos);
+
+        auto compressedMove = chess::CompressedMove::readFromBigEndian(cursor);
+        entry.move = compressedMove.decompress();
+        cursor += sizeof(compressedMove);
+
+        entry.score = unsignedToSigned((cursor[0] << 8) | cursor[1]);
+        cursor += 2;
+
+        std::uint16_t plyAndResult = (cursor[0] << 8) | cursor[1];
+        cursor += 2;
+        entry.ply = plyAndResult & 0x3FFF;
+        entry.pos.setPly(entry.ply);
+        entry.result = unsignedToSigned(plyAndResult >> 14);
+
+        entry.pos.setRule50Counter((cursor[0] << 8) | cursor[1]);
+
+        return entry;
+    }
+
+    // Streams TrainingDataEntry values into a binpack file. An entry that
+    // continues the previous one (see isContinuation) is appended to a
+    // compact move/score list; any other entry starts a new chain with a
+    // full packed entry. Output is buffered and handed to the file once the
+    // buffer holds at least chunkSize bytes, and again on destruction.
+    struct CompressedTrainingDataEntryWriter
+    {
+        static constexpr std::size_t chunkSize = suggestedChunkSize;
+
+        // Opens `path` with mode `om` (append by default). The buffer is
+        // sized chunkSize + maxMovelistSize: the flush check runs only after
+        // a movelist has been written, so the buffer can go past chunkSize.
+        CompressedTrainingDataEntryWriter(std::string path, std::ios_base::openmode om = std::ios_base::app) :
+            m_outputFile(path, om),
+            m_lastEntry{},
+            m_movelist{},
+            m_packedSize(0),
+            m_packedEntries(chunkSize + maxMovelistSize),
+            m_isFirst(true)
+        {
+            m_lastEntry.ply = 0xFFFF; // so it's never a continuation
+            m_lastEntry.result = 0x7FFF;
+        }
+
+        // Adds one entry, either extending the current chain or closing it
+        // and starting a new one.
+        void addTrainingDataEntry(const TrainingDataEntry& e)
+        {
+            bool isCont = isContinuation(m_lastEntry, e);
+            if (isCont)
+            {
+                // add to movelist
+                m_movelist.addMoveScore(e.pos, e.move, e.score);
+            }
+            else
+            {
+                // Close the previous chain by writing out its movelist
+                // (there is none before the very first entry).
+                if (!m_isFirst)
+                {
+                    writeMovelist();
+                }
+
+                // Flush only between chains, so a chunk never splits a chain.
+                if (m_packedSize >= chunkSize)
+                {
+                    m_outputFile.append(m_packedEntries.data(), m_packedSize);
+                    m_packedSize = 0;
+                }
+
+                // Start the new chain with the full packed entry.
+                auto packed = packEntry(e);
+                std::memcpy(m_packedEntries.data() + m_packedSize, &packed, sizeof(PackedTrainingDataEntry));
+                m_packedSize += sizeof(PackedTrainingDataEntry);
+
+                m_movelist.clear(e);
+
+                m_isFirst = false;
+            }
+
+            m_lastEntry = e;
+        }
+
+        // Closes the chain in progress and writes any buffered data.
+        ~CompressedTrainingDataEntryWriter()
+        {
+            if (m_packedSize > 0)
+            {
+                if (!m_isFirst)
+                {
+                    writeMovelist();
+                }
+
+                m_outputFile.append(m_packedEntries.data(), m_packedSize);
+                m_packedSize = 0;
+            }
+        }
+
+    private:
+        CompressedTrainingDataFile m_outputFile;
+        // Previously added entry; used to detect continuations.
+        TrainingDataEntry m_lastEntry;
+        // Move/score list of the chain currently being built.
+        PackedMoveScoreList m_movelist;
+        // Number of valid bytes in m_packedEntries.
+        std::size_t m_packedSize;
+        // Output buffer for the chunk being assembled.
+        std::vector<char> m_packedEntries;
+        // True until the first entry has been added.
+        bool m_isFirst;
+
+        // Appends the current movelist to the buffer: a big-endian 16-bit
+        // ply count followed by the movetext bytes (none if the count is 0).
+        // NOTE(review): nothing here checks that the movetext fits into the
+        // maxMovelistSize slack reserved beyond chunkSize - confirm that the
+        // bound holds for the longest possible chain.
+        void writeMovelist()
+        {
+            m_packedEntries[m_packedSize++] = m_movelist.numPlies >> 8;
+            m_packedEntries[m_packedSize++] = m_movelist.numPlies;
+            if (m_movelist.numPlies > 0)
+            {
+                std::memcpy(m_packedEntries.data() + m_packedSize, m_movelist.movetext.data(), m_movelist.movetext.size());
+                m_packedSize += m_movelist.movetext.size();
+            }
+        };
+    };
+
+    // Reads TrainingDataEntry values back from a binpack file, one chunk at
+    // a time. Each chain is stored as a full packed entry followed by a
+    // 16-bit ply count and, if that is non-zero, a packed move/score list.
+    struct CompressedTrainingDataEntryReader
+    {
+        static constexpr std::size_t chunkSize = suggestedChunkSize;
+
+        // Opens `path` and loads the first chunk, if there is one.
+        // NOTE(review): the default open mode is std::ios_base::app even
+        // though this is a reader; how CompressedTrainingDataFile interprets
+        // it is not visible here - confirm.
+        CompressedTrainingDataEntryReader(std::string path, std::ios_base::openmode om = std::ios_base::app) :
+            m_inputFile(path, om),
+            m_chunk(),
+            m_movelistReader(std::nullopt),
+            m_offset(0),
+            m_isEnd(false)
+        {
+            if (!m_inputFile.hasNextChunk())
+            {
+                m_isEnd = true;
+            }
+            else
+            {
+                m_chunk = m_inputFile.readNextChunk();
+            }
+        }
+
+        // True while next() can produce another entry.
+        [[nodiscard]] bool hasNext()
+        {
+            return !m_isEnd;
+        }
+
+        // Returns the next entry. Must only be called while hasNext() is
+        // true; the chunk contents are not bounds-checked here.
+        [[nodiscard]] TrainingDataEntry next()
+        {
+            // In the middle of a chain: decode the next ply from the movelist.
+            if (m_movelistReader.has_value())
+            {
+                const auto e = m_movelistReader->nextEntry();
+
+                if (!m_movelistReader->hasNext())
+                {
+                    // Chain finished: skip the movetext bytes it occupied.
+                    m_offset += m_movelistReader->numReadBytes();
+                    m_movelistReader.reset();
+
+                    fetchNextChunkIfNeeded();
+                }
+
+                return e;
+            }
+
+            // Start of a chain: a full packed entry ...
+            PackedTrainingDataEntry packed;
+            std::memcpy(&packed, m_chunk.data() + m_offset, sizeof(PackedTrainingDataEntry));
+            m_offset += sizeof(PackedTrainingDataEntry);
+
+            // ... followed by the big-endian ply count of its movelist.
+            const std::uint16_t numPlies = (m_chunk[m_offset] << 8) | m_chunk[m_offset + 1];
+            m_offset += 2;
+
+            const auto e = unpackEntry(packed);
+
+            if (numPlies > 0)
+            {
+                // The movelist reader points directly into m_chunk, so the
+                // chunk must stay untouched until the reader is reset.
+                m_movelistReader.emplace(e, reinterpret_cast<unsigned char*>(m_chunk.data()) + m_offset, numPlies);
+            }
+            else
+            {
+                fetchNextChunkIfNeeded();
+            }
+
+            return e;
+        }
+
+    private:
+        CompressedTrainingDataFile m_inputFile;
+        // Bytes of the chunk currently being decoded.
+        std::vector<unsigned char> m_chunk;
+        // Active movelist decoder; empty between chains.
+        std::optional<PackedMoveScoreListReader> m_movelistReader;
+        // Read position within m_chunk.
+        std::size_t m_offset;
+        // Set once the last chunk has been consumed.
+        bool m_isEnd;
+
+        // If the current chunk has no room left for another packed entry
+        // plus its 2-byte ply count, loads the next chunk or, when there is
+        // none, marks the end of the data.
+        void fetchNextChunkIfNeeded()
+        {
+            if (m_offset + sizeof(PackedTrainingDataEntry) + 2 > m_chunk.size())
+            {
+                if (m_inputFile.hasNextChunk())
+                {
+                    m_chunk = m_inputFile.readNextChunk();
+                    m_offset = 0;
+                }
+                else
+                {
+                    m_isEnd = true;
+                }
+            }
+        }
+    };
+
+    // Appends the plain-text form of one entry to `buffer`: a "key value"
+    // line each for fen, move, score, ply and result, then a closing "e"
+    // line.
+    inline void emitPlainEntry(std::string& buffer, const TrainingDataEntry& plain)
+    {
+        // Appends one "<key><value>\n" line; `key` carries its own trailing space.
+        const auto appendLine = [&buffer](const char* key, const auto& value)
+        {
+            buffer += key;
+            buffer += value;
+            buffer += '\n';
+        };
+
+        appendLine("fen ", plain.pos.fen());
+        appendLine("move ", chess::uci::moveToUci(plain.pos, plain.move));
+        appendLine("score ", std::to_string(plain.score));
+        appendLine("ply ", std::to_string(plain.ply));
+        appendLine("result ", std::to_string(plain.result));
+
+        // Entry terminator.
+        buffer += "e\n";
+    }
+
+    // Appends the .bin-format image of `plain` - the raw bytes of its
+    // PackedSfenValue - to `buffer`.
+    inline void emitBinEntry(std::vector<char>& buffer, const TrainingDataEntry& plain)
+    {
+        const auto record = trainingDataEntryToPackedSfenValue(plain);
+        const char* const first = reinterpret_cast<const char*>(&record);
+        const char* const last = first + sizeof(record);
+        buffer.insert(buffer.end(), first, last);
+    }
+
+    // Converts a plain-text training file into the binpack format.
+    //
+    // The input is a sequence of "key value" lines (fen, move, score, ply,
+    // result); an "e" token ends the current entry. Fields that an entry
+    // does not set keep the value from the previous entry.
+    //
+    // inputPath  - text file to read.
+    // outputPath - binpack file to write, opened with mode `om`.
+    // validate   - when true, stop at the first entry whose move is not
+    //              legal in its position; the offending entry is reported
+    //              on stderr.
+    inline void convertPlainToBinpack(std::string inputPath, std::string outputPath, std::ios_base::openmode om, bool validate)
+    {
+        constexpr std::size_t reportEveryNPositions = 100'000;
+
+        std::cout << "Converting " << inputPath << " to " << outputPath << '\n';
+
+        // Open the input before creating the writer, so that an unreadable
+        // input is reported and the output file is left untouched.
+        std::ifstream inputFile(inputPath);
+        if (!inputFile)
+        {
+            std::cerr << "Could not open input file " << inputPath << '\n';
+            return;
+        }
+        const auto base = inputFile.tellg();
+
+        CompressedTrainingDataEntryWriter writer(outputPath, om);
+        TrainingDataEntry e;
+
+        std::string key;
+        std::string value;
+        std::string move;
+
+        std::size_t numProcessedPositions = 0;
+
+        for(;;)
+        {
+            inputFile >> key;
+            if (!inputFile)
+            {
+                break;
+            }
+
+            if (key == "e"sv)
+            {
+                // The move can only be parsed once the position is known,
+                // so it is resolved at the end of the entry.
+                e.move = chess::uci::uciToMove(e.pos, move);
+                if (validate && !e.isValid())
+                {
+                    std::cerr << "Illegal move " << chess::uci::moveToUci(e.pos, e.move) << " for position " << e.pos.fen() << '\n';
+                    return;
+                }
+
+                writer.addTrainingDataEntry(e);
+
+                ++numProcessedPositions;
+                if (numProcessedPositions % reportEveryNPositions == 0)
+                {
+                    // Query the stream position only when it is reported.
+                    const auto cur = inputFile.tellg();
+                    std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+                }
+
+                continue;
+            }
+
+            // Everything after the key, up to the end of the line, is the value.
+            inputFile >> std::ws;
+            std::getline(inputFile, value, '\n');
+
+            if (key == "fen"sv) e.pos = chess::Position::fromFen(value.c_str());
+            if (key == "move"sv) move = value;
+            if (key == "score"sv) e.score = std::stoi(value);
+            if (key == "ply"sv) e.ply = std::stoi(value);
+            if (key == "result"sv) e.result = std::stoi(value);
+        }
+
+        std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
+    }
+
+    // Converts a binpack file into the plain-text format.
+    //
+    // inputPath  - binpack file to read.
+    // outputPath - text file to write, opened with mode `om`.
+    // validate   - when true, stop at the first entry whose move is not
+    //              legal in its position; text still held in the buffer at
+    //              that point is not written.
+    inline void convertBinpackToPlain(std::string inputPath, std::string outputPath, std::ios_base::openmode om, bool validate)
+    {
+        constexpr std::size_t bufferSize = MiB;
+
+        std::cout << "Converting " << inputPath << " to " << outputPath << '\n';
+
+        CompressedTrainingDataEntryReader reader(inputPath);
+        std::ofstream outputFile(outputPath, om);
+        const auto base = outputFile.tellp();
+
+        std::string buffer;
+        buffer.reserve(bufferSize * 2);
+        std::size_t numProcessedPositions = 0;
+
+        // Writes the buffered text to the file and prints a progress line.
+        const auto writeOut = [&]()
+        {
+            outputFile << buffer;
+
+            const auto cur = outputFile.tellp();
+            std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+        };
+
+        while (reader.hasNext())
+        {
+            auto entry = reader.next();
+            if (validate && !entry.isValid())
+            {
+                std::cerr << "Illegal move " << chess::uci::moveToUci(entry.pos, entry.move) << " for position " << entry.pos.fen() << '\n';
+                return;
+            }
+
+            emitPlainEntry(buffer, entry);
+            ++numProcessedPositions;
+
+            // Keep accumulating until the buffer passes the threshold.
+            if (buffer.size() <= bufferSize)
+            {
+                continue;
+            }
+
+            writeOut();
+            buffer.clear();
+        }
+
+        // Whatever is left after the last full buffer.
+        if (!buffer.empty())
+        {
+            writeOut();
+        }
+
+        std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
+    }
+
+
+    // Converts a .bin file - a sequence of fixed-size
+    // nodchip::PackedSfenValue records - into the binpack format.
+    //
+    // inputPath  - .bin file to read.
+    // outputPath - binpack file to write, opened with mode `om`.
+    // validate   - when true, stop at the first entry whose move is not
+    //              legal in its position; the offending entry is reported
+    //              on stderr.
+    inline void convertBinToBinpack(std::string inputPath, std::string outputPath, std::ios_base::openmode om, bool validate)
+    {
+        constexpr std::size_t reportEveryNPositions = 100'000;
+
+        std::cout << "Converting " << inputPath << " to " << outputPath << '\n';
+
+        // Open the input before creating the writer, so that an unreadable
+        // input is reported and the output file is left untouched.
+        std::ifstream inputFile(inputPath, std::ios_base::binary);
+        if (!inputFile)
+        {
+            std::cerr << "Could not open input file " << inputPath << '\n';
+            return;
+        }
+        const auto base = inputFile.tellg();
+
+        CompressedTrainingDataEntryWriter writer(outputPath, om);
+        std::size_t numProcessedPositions = 0;
+
+        nodchip::PackedSfenValue psv;
+        for(;;)
+        {
+            inputFile.read(reinterpret_cast<char*>(&psv), sizeof(psv));
+            // Anything short of a full record means end of input (or a
+            // truncated trailing record, which is dropped).
+            if (inputFile.gcount() != static_cast<std::streamsize>(sizeof(psv)))
+            {
+                break;
+            }
+
+            auto e = packedSfenValueToTrainingDataEntry(psv);
+            if (validate && !e.isValid())
+            {
+                std::cerr << "Illegal move " << chess::uci::moveToUci(e.pos, e.move) << " for position " << e.pos.fen() << '\n';
+                std::cerr << static_cast<int>(e.move.type) << '\n';
+                return;
+            }
+
+            writer.addTrainingDataEntry(e);
+
+            ++numProcessedPositions;
+            if (numProcessedPositions % reportEveryNPositions == 0)
+            {
+                // Query the stream position only when it is reported.
+                const auto cur = inputFile.tellg();
+                std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+            }
+        }
+
+        std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
+    }
+
+    // Converts a binpack file into the .bin format.
+    //
+    // inputPath  - binpack file to read.
+    // outputPath - .bin file to write, opened in binary mode combined with
+    //              `om`.
+    // validate   - when true, stop at the first entry whose move is not
+    //              legal in its position; records still held in the buffer
+    //              at that point are not written.
+    inline void convertBinpackToBin(std::string inputPath, std::string outputPath, std::ios_base::openmode om, bool validate)
+    {
+        constexpr std::size_t bufferSize = MiB;
+
+        std::cout << "Converting " << inputPath << " to " << outputPath << '\n';
+
+        CompressedTrainingDataEntryReader reader(inputPath);
+        std::ofstream outputFile(outputPath, std::ios_base::binary | om);
+        const auto base = outputFile.tellp();
+
+        std::vector<char> buffer;
+        buffer.reserve(bufferSize * 2);
+        std::size_t numProcessedPositions = 0;
+
+        // Writes the buffered records to the file and prints a progress line.
+        const auto writeOut = [&]()
+        {
+            outputFile.write(buffer.data(), buffer.size());
+
+            const auto cur = outputFile.tellp();
+            std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+        };
+
+        while (reader.hasNext())
+        {
+            auto entry = reader.next();
+            if (validate && !entry.isValid())
+            {
+                std::cerr << "Illegal move " << chess::uci::moveToUci(entry.pos, entry.move) << " for position " << entry.pos.fen() << '\n';
+                return;
+            }
+
+            emitBinEntry(buffer, entry);
+            ++numProcessedPositions;
+
+            // Keep accumulating until the buffer passes the threshold.
+            if (buffer.size() <= bufferSize)
+            {
+                continue;
+            }
+
+            writeOut();
+            buffer.clear();
+        }
+
+        // Whatever is left after the last full buffer.
+        if (!buffer.empty())
+        {
+            writeOut();
+        }
+
+        std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
+    }
+
+    // Converts a .bin file - a sequence of fixed-size
+    // nodchip::PackedSfenValue records - into the plain-text format.
+    //
+    // inputPath  - .bin file to read.
+    // outputPath - text file to write, opened with mode `om`.
+    // validate   - when true, stop at the first entry whose move is not
+    //              legal in its position; text still held in the buffer at
+    //              that point is not written.
+    inline void convertBinToPlain(std::string inputPath, std::string outputPath, std::ios_base::openmode om, bool validate)
+    {
+        constexpr std::size_t bufferSize = MiB;
+
+        std::cout << "Converting " << inputPath << " to " << outputPath << '\n';
+
+        // Open the input before the output, so that an unreadable input is
+        // reported and the output file is left untouched.
+        std::ifstream inputFile(inputPath, std::ios_base::binary);
+        if (!inputFile)
+        {
+            std::cerr << "Could not open input file " << inputPath << '\n';
+            return;
+        }
+        std::size_t numProcessedPositions = 0;
+
+        std::ofstream outputFile(outputPath, om);
+        // Progress is reported as bytes written, so the baseline has to come
+        // from the output stream as well.
+        const auto base = outputFile.tellp();
+        std::string buffer;
+        buffer.reserve(bufferSize * 2);
+
+        nodchip::PackedSfenValue psv;
+        for(;;)
+        {
+            inputFile.read(reinterpret_cast<char*>(&psv), sizeof(psv));
+            // Anything short of a full record means end of input (or a
+            // truncated trailing record, which is dropped).
+            if (inputFile.gcount() != static_cast<std::streamsize>(sizeof(psv)))
+            {
+                break;
+            }
+
+            auto e = packedSfenValueToTrainingDataEntry(psv);
+            if (validate && !e.isValid())
+            {
+                std::cerr << "Illegal move " << chess::uci::moveToUci(e.pos, e.move) << " for position " << e.pos.fen() << '\n';
+                return;
+            }
+
+            emitPlainEntry(buffer, e);
+
+            ++numProcessedPositions;
+
+            if (buffer.size() > bufferSize)
+            {
+                outputFile << buffer;
+                buffer.clear();
+
+                const auto cur = outputFile.tellp();
+                std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+            }
+        }
+
+        // Whatever is left after the last full buffer.
+        if (!buffer.empty())
+        {
+            outputFile << buffer;
+
+            const auto cur = outputFile.tellp();
+            std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+        }
+
+        std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
+    }
+
+    // Converts a plain-text training file into the .bin format.
+    //
+    // The input is a sequence of "key value" lines (fen, move, score, ply,
+    // result); an "e" token ends the current entry. Fields that an entry
+    // does not set keep the value from the previous entry.
+    //
+    // inputPath  - text file to read.
+    // outputPath - .bin file to write, opened in binary mode combined with
+    //              `om`.
+    // validate   - when true, stop at the first entry whose move is not
+    //              legal in its position; records still held in the buffer
+    //              at that point are not written.
+    inline void convertPlainToBin(std::string inputPath, std::string outputPath, std::ios_base::openmode om, bool validate)
+    {
+        constexpr std::size_t bufferSize = MiB;
+
+        std::cout << "Converting " << inputPath << " to " << outputPath << '\n';
+
+        // Open the input before the output, so that an unreadable input is
+        // reported and the output file is left untouched.
+        std::ifstream inputFile(inputPath);
+        if (!inputFile)
+        {
+            std::cerr << "Could not open input file " << inputPath << '\n';
+            return;
+        }
+
+        std::ofstream outputFile(outputPath, std::ios_base::binary | om);
+        // Progress is reported as bytes written, so the baseline has to come
+        // from the output stream as well.
+        const auto base = outputFile.tellp();
+        std::vector<char> buffer;
+        buffer.reserve(bufferSize * 2);
+
+        TrainingDataEntry e;
+
+        std::string key;
+        std::string value;
+        std::string move;
+
+        std::size_t numProcessedPositions = 0;
+
+        for(;;)
+        {
+            inputFile >> key;
+            if (!inputFile)
+            {
+                break;
+            }
+
+            if (key == "e"sv)
+            {
+                // The move can only be parsed once the position is known,
+                // so it is resolved at the end of the entry.
+                e.move = chess::uci::uciToMove(e.pos, move);
+                if (validate && !e.isValid())
+                {
+                    std::cerr << "Illegal move " << chess::uci::moveToUci(e.pos, e.move) << " for position " << e.pos.fen() << '\n';
+                    return;
+                }
+
+                emitBinEntry(buffer, e);
+
+                ++numProcessedPositions;
+
+                if (buffer.size() > bufferSize)
+                {
+                    outputFile.write(buffer.data(), buffer.size());
+                    buffer.clear();
+
+                    const auto cur = outputFile.tellp();
+                    std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+                }
+
+                continue;
+            }
+
+            // Everything after the key, up to the end of the line, is the value.
+            inputFile >> std::ws;
+            std::getline(inputFile, value, '\n');
+
+            if (key == "fen"sv) e.pos = chess::Position::fromFen(value.c_str());
+            if (key == "move"sv) move = value;
+            if (key == "score"sv) e.score = std::stoi(value);
+            if (key == "ply"sv) e.ply = std::stoi(value);
+            if (key == "result"sv) e.result = std::stoi(value);
+        }
+
+        // Whatever is left after the last full buffer.
+        if (!buffer.empty())
+        {
+            outputFile.write(buffer.data(), buffer.size());
+
+            const auto cur = outputFile.tellp();
+            std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+        }
+
+        std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
+    }
+}
diff --git a/src/extra/sfen_packer.cpp b/src/extra/sfen_packer.cpp
deleted file mode 100644
index ac789ce8..00000000
--- a/src/extra/sfen_packer.cpp
+++ /dev/null
@@ -1,429 +0,0 @@
-﻿#if defined (EVAL_LEARN)
-
-#include "../misc.h"
-#include "../position.h"
-
-#include <sstream>
-#include <fstream>
-#include <cstring> // std::memset()
-
-using namespace std;
-
-// -----------------------------------
-// stage compression/decompression
-// -----------------------------------
-
-// Class that handles bitstream
-// useful when doing aspect encoding
-struct BitStream
-{
-  // Set the memory to store the data in advance.
-  // Assume that memory is cleared to 0.
-  void  set_data(uint8_t* data_) { data = data_; reset(); }
-
-  // Get the pointer passed in set_data().
-  uint8_t* get_data() const { return data; }
-
-  // Get the cursor.
-  int get_cursor() const { return bit_cursor; }
-
-  // reset the cursor
-  void reset() { bit_cursor = 0; }
-
-  // Write 1bit to the stream.
-  // If b is non-zero, write out 1. If 0, write 0.
-  void write_one_bit(int b)
-  {
-    if (b)
-      data[bit_cursor / 8] |= 1 << (bit_cursor & 7);
-
-    ++bit_cursor;
-  }
-
-  // Get 1 bit from the stream.
-  int read_one_bit()
-  {
-    int b = (data[bit_cursor / 8] >> (bit_cursor & 7)) & 1;
-    ++bit_cursor;
-
-    return b;
-  }
-
-  // write n bits of data
-  // Data shall be written out from the lower order of d.
-  void write_n_bit(int d, int n)
-  {
-    for (int i = 0; i <n; ++i)
-      write_one_bit(d & (1 << i));
-  }
-
-  // read n bits of data
-  // Reverse conversion of write_n_bit().
-  int read_n_bit(int n)
-  {
-    int result = 0;
-    for (int i = 0; i < n; ++i)
-      result |= read_one_bit() ? (1 << i) : 0;
-
-    return result;
-  }
-
-private:
-  // Next bit position to read/write.
-  int bit_cursor;
-
-  // data entity
-  uint8_t* data;
-};
-
-
-// Huffman coding
-// * is simplified from mini encoding to make conversion easier.
-//
-// 1 box on the board (other than NO_PIECE) = 2 to 6 bits (+ 1-bit flag + 1-bit forward and backward)
-// 1 piece of hand piece = 1-5bit (+ 1-bit flag + 1bit ahead and behind)
-//
-// empty xxxxx0 + 0 (none)
-// step xxxx01 + 2 xxxx0 + 2
-// incense xx0011 + 2 xx001 + 2
-// Katsura xx1011 + 2 xx101 + 2
-// silver xx0111 + 2 xx011 + 2
-// Gold x01111 + 1 x0111 + 1 // Gold is valid and has no flags.
-// corner 011111 + 2 01111 + 2
-// Fly 111111 + 2 11111 + 2
-//
-// Assuming all pieces are on the board,
-// Sky 81-40 pieces = 41 boxes = 41bit
-// Walk 4bit*18 pieces = 72bit
-// Incense 6bit*4 pieces = 24bit
-// Katsura 6bit*4 pieces = 24bit
-// Silver 6bit*4 pieces = 24bit
-// Gold 6bit* 4 pieces = 24bit
-// corner 8bit* 2 pieces = 16bit
-// Fly 8bit* 2 pieces = 16bit
-// -------
-// 241bit + 1bit (turn) + 7bit × 2 (King's position after) = 256bit
-//
-// When the piece on the board moves to the hand piece, the piece on the board becomes empty, so the box on the board can be expressed with 1 bit,
-// Since the hand piece can be expressed by 1 bit less than the piece on the board, the total number of bits does not change in the end.
-// Therefore, in this expression, any aspect can be expressed by this bit number.
-// It is a hand piece and no flag is required, but if you include this, the bit number of the piece on the board will be -1
-// Since the total number of bits can be fixed, we will include this as well.
-
-// Huffman Encoding
-//
-// Empty  xxxxxxx0
-// Pawn   xxxxx001 + 1 bit (Side to move)
-// Knight xxxxx011 + 1 bit (Side to move)
-// Bishop xxxxx101 + 1 bit (Side to move)
-// Rook   xxxxx111 + 1 bit (Side to move)
-
-struct HuffmanedPiece
-{
-  int code; // how it will be coded
-  int bits; // How many bits do you have
-};
-
-HuffmanedPiece huffman_table[] =
-{
-  {0b0000,1}, // NO_PIECE
-  {0b0001,4}, // PAWN
-  {0b0011,4}, // KNIGHT
-  {0b0101,4}, // BISHOP
-  {0b0111,4}, // ROOK
-  {0b1001,4}, // QUEEN
-};
-
-// Class for compressing/decompressing sfen
-// sfen can be packed to 256bit (32bytes) by Huffman coding.
-// This is proven by mini. The above is Huffman coding.
-//
-// Internal format = 1-bit turn + 7-bit king position *2 + piece on board (Huffman coding) + hand piece (Huffman coding)
-// Side to move (White = 0, Black = 1) (1bit)
-// White King Position (6 bits)
-// Black King Position (6 bits)
-// Huffman Encoding of the board
-// Castling availability (1 bit x 4)
-// En passant square (1 or 1 + 6 bits)
-// Rule 50 (6 bits)
-// Game play (8 bits)
-//
-// TODO(someone): Rename SFEN to FEN.
-//
-struct SfenPacker
-{
-  // Pack sfen and store in data[32].
-  void pack(const Position& pos)
-  {
-// cout << pos;
-
-    memset(data, 0, 32 /* 256bit */);
-    stream.set_data(data);
-
-    // turn
-    // Side to move.
-    stream.write_one_bit((int)(pos.side_to_move()));
-
-    // 7-bit positions for leading and trailing balls
-    // White king and black king, 6 bits for each.
-    for(auto c: Colors)
-      stream.write_n_bit(pos.king_square(c), 6);
-
-    // Write the pieces on the board other than the kings.
-    for (Rank r = RANK_8; r >= RANK_1; --r)
-    {
-      for (File f = FILE_A; f <= FILE_H; ++f)
-      {
-        Piece pc = pos.piece_on(make_square(f, r));
-        if (type_of(pc) == KING)
-          continue;
-        write_board_piece_to_stream(pc);
-      }
-    }
-
-    // TODO(someone): Support chess960.
-    stream.write_one_bit(pos.can_castle(WHITE_OO));
-    stream.write_one_bit(pos.can_castle(WHITE_OOO));
-    stream.write_one_bit(pos.can_castle(BLACK_OO));
-    stream.write_one_bit(pos.can_castle(BLACK_OOO));
-
-    if (pos.ep_square() == SQ_NONE) {
-      stream.write_one_bit(0);
-    }
-    else {
-      stream.write_one_bit(1);
-      stream.write_n_bit(static_cast<int>(pos.ep_square()), 6);
-    }
-
-    stream.write_n_bit(pos.state()->rule50, 6);
-
-    stream.write_n_bit(1 + (pos.game_ply()-(pos.side_to_move() == BLACK)) / 2, 8);
-
-    assert(stream.get_cursor() <= 256);
-  }
-
-  // sfen packed by pack() (256bit = 32bytes)
-  // Or sfen to decode with unpack()
-  uint8_t *data; // uint8_t[32];
-
-//private:
-  // Position::set_from_packed_sfen(uint8_t data[32]) I want to use these functions, so the line is bad, but I want to keep it public.
-
-  BitStream stream;
-
-  // Output the board pieces to stream.
-  void write_board_piece_to_stream(Piece pc)
-  {
-    // piece type
-    PieceType pr = type_of(pc);
-    auto c = huffman_table[pr];
-    stream.write_n_bit(c.code, c.bits);
- 
-    if (pc == NO_PIECE)
-      return;
-
-    // first and second flag
-    stream.write_one_bit(color_of(pc));
-  }
-
-  // Read one board piece from stream
-  Piece read_board_piece_from_stream()
-  {
-    PieceType pr = NO_PIECE_TYPE;
-    int code = 0, bits = 0;
-    while (true)
-    {
-      code |= stream.read_one_bit() << bits;
-      ++bits;
-
-      assert(bits <= 6);
-
-      for (pr = NO_PIECE_TYPE; pr <KING; ++pr)
-        if (huffman_table[pr].code == code
-          && huffman_table[pr].bits == bits)
-          goto Found;
-    }
-  Found:;
-    if (pr == NO_PIECE_TYPE)
-      return NO_PIECE;
-
-    // first and second flag
-    Color c = (Color)stream.read_one_bit();
-    
-    return make_piece(c, pr);
-  }
-};
-
-
-// -----------------------------------
-// Add to Position class
-// -----------------------------------
-
-// Add a function that directly unpacks for speed. It's pretty tough.
-// Write it by combining packer::unpack() and Position::set().
-// If there is a problem with the passed phase and there is an error, non-zero is returned.
-int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thread* th, bool mirror)
-{
-	SfenPacker packer;
-	auto& stream = packer.stream;
-	stream.set_data((uint8_t*)&sfen);
-
-	std::memset(this, 0, sizeof(Position));
-	std::memset(si, 0, sizeof(StateInfo));
-  std::fill_n(&pieceList[0][0], sizeof(pieceList) / sizeof(Square), SQ_NONE);
-  st = si;
-
-	// Active color
-	sideToMove = (Color)stream.read_one_bit();
-
-  pieceList[W_KING][0] = SQUARE_NB;
-  pieceList[B_KING][0] = SQUARE_NB;
-
-	// First the position of the ball
-	if (mirror)
-	{
-		for (auto c : Colors)
-			board[flip_file((Square)stream.read_n_bit(6))] = make_piece(c, KING);
-	}
-	else
-	{
-		for (auto c : Colors)
-			board[stream.read_n_bit(6)] = make_piece(c, KING);
-	}
-
-  // Piece placement
-  for (Rank r = RANK_8; r >= RANK_1; --r)
-  {
-    for (File f = FILE_A; f <= FILE_H; ++f)
-    {
-      auto sq = make_square(f, r);
-      if (mirror) {
-        sq = flip_file(sq);
-      }
-
-      // it seems there are already balls
-      Piece pc;
-      if (type_of(board[sq]) != KING)
-      {
-        assert(board[sq] == NO_PIECE);
-        pc = packer.read_board_piece_from_stream();
-      }
-      else
-      {
-        pc = board[sq];
-        board[sq] = NO_PIECE; // put_piece() will catch ASSERT unless you remove it all.
-      }
-
-      // There may be no pieces, so skip in that case.
-      if (pc == NO_PIECE)
-        continue;
-
-      put_piece(Piece(pc), sq);
-
-      //cout << sq << ' ' << board[sq] << ' ' << stream.get_cursor() << endl;
-
-      if (stream.get_cursor()> 256)
-        return 1;
-      //assert(stream.get_cursor() <= 256);
-
-    }
-  }
-
-  // Castling availability.
-  // TODO(someone): Support chess960.
-  st->castlingRights = 0;
-  if (stream.read_one_bit()) {
-    Square rsq;
-    for (rsq = relative_square(WHITE, SQ_H1); piece_on(rsq) != W_ROOK; --rsq) {}
-    set_castling_right(WHITE, rsq);
-  }
-  if (stream.read_one_bit()) {
-    Square rsq;
-    for (rsq = relative_square(WHITE, SQ_A1); piece_on(rsq) != W_ROOK; ++rsq) {}
-    set_castling_right(WHITE, rsq);
-  }
-  if (stream.read_one_bit()) {
-    Square rsq;
-    for (rsq = relative_square(BLACK, SQ_H1); piece_on(rsq) != B_ROOK; --rsq) {}
-    set_castling_right(BLACK, rsq);
-  }
-  if (stream.read_one_bit()) {
-    Square rsq;
-    for (rsq = relative_square(BLACK, SQ_A1); piece_on(rsq) != B_ROOK; ++rsq) {}
-    set_castling_right(BLACK, rsq);
-  }
-
-  // En passant square. Ignore if no pawn capture is possible
-  if (stream.read_one_bit()) {
-    Square ep_square = static_cast<Square>(stream.read_n_bit(6));
-    if (mirror) {
-      ep_square = flip_file(ep_square);
-    }
-    st->epSquare = ep_square;
-
-    if (!(attackers_to(st->epSquare) & pieces(sideToMove, PAWN))
-      || !(pieces(~sideToMove, PAWN) & (st->epSquare + pawn_push(~sideToMove))))
-      st->epSquare = SQ_NONE;
-  }
-  else {
-    st->epSquare = SQ_NONE;
-  }
-
-  // Halfmove clock
-  st->rule50 = static_cast<Square>(stream.read_n_bit(6));
-
-  // Fullmove number
-  gamePly = static_cast<Square>(stream.read_n_bit(8));
-  // Convert from fullmove starting from 1 to gamePly starting from 0,
-  // handle also common incorrect FEN with fullmove = 0.
-  gamePly = std::max(2 * (gamePly - 1), 0) + (sideToMove == BLACK);
-
-  assert(stream.get_cursor() <= 256);
-
-  chess960 = false;
-  thisThread = th;
-set_state(st);
-
-  //std::cout << *this << std::endl;
-
-  assert(pos_is_ok());
-
-	return 0;
-}
-
-// Give the board, hand piece, and turn, and return the sfen.
-//std::string Position::sfen_from_rawdata(Piece board[81], Hand hands[2], Color turn, int gamePly_)
-//{
-// // Copy it to an internal structure and call sfen() if the conversion process depends only on it
-// // Maybe it will be converted normally...
-//  Position pos;
-//
-//  memcpy(pos.board, board, sizeof(Piece) * 81);
-//  memcpy(pos.hand, hands, sizeof(Hand) * 2);
-//  pos.sideToMove = turn;
-//  pos.gamePly = gamePly_;
-//
-//  return pos.sfen();
-//
-// // Implementation of ↑ is beautiful, but slow.
-// // This is a bottleneck when learning a large amount of game records, so write a function to unpack directly.
-//}
-
-// Get the packed sfen. Returns to the buffer specified in the argument.
-void Position::sfen_pack(PackedSfen& sfen)
-{
-  SfenPacker sp;
-  sp.data = (uint8_t*)&sfen;
-  sp.pack(*this);
-}
-
-//// Unpack the packed sfen. Returns an sfen string.
-//std::string Position::sfen_unpack(const PackedSfen& sfen)
-//{
-// SfenPacker sp;
-// sp.data = (uint8_t*)&sfen;
-// return sp.unpack();
-//}
-
-
-#endif // USE_SFEN_PACKER
diff --git a/src/extra/stockfish_blas.cpp b/src/extra/stockfish_blas.cpp
new file mode 100644
index 00000000..70b258bc
--- /dev/null
+++ b/src/extra/stockfish_blas.cpp
@@ -0,0 +1,1291 @@
+#include "stockfish_blas.h"
+#include "thread.h"
+
+#include <algorithm>
+#include <atomic>
+#include <chrono>
+#include <cmath>
+#include <cstring>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <vector>
+
+#if defined(USE_SSE2)
+#include <xmmintrin.h>
+#endif
+
+#if defined (USE_SSE3)
+#include <pmmintrin.h>
+#endif
+
+#if defined(USE_BLAS)
+#include <cblas.h>
+#endif
+
+namespace Blas {
+    // Copies N contiguous floats from X to Y; a non-positive N is a no-op.
+    void scopy(const int N, const float * SF_BLAS_RESTRICT X, float * SF_BLAS_RESTRICT Y)
+    {
+        // Guard first: a negative N would wrap to a huge size_t below, and
+        // memcpy must not be handed null pointers even for a zero length.
+        if (N <= 0) return;
+        std::memcpy(Y, X, sizeof(float) * static_cast<std::size_t>(N));
+    }
+
+    // Strided copy: for every i in [0, N), Y[i * incY] = X[i * incX].
+    void scopy(
+        const int N,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    )
+    {
+        // Unit strides on both sides: hand off to the contiguous overload.
+        if (incX == 1 && incY == 1)
+        {
+            scopy(N, X, Y);
+            return;
+        }
+
+        // General case: step both cursors by their own stride per element.
+        for (int left = N; left > 0; --left, X += incX, Y += incY)
+        {
+            *Y = *X;
+        }
+    }
+
+    // Pool-taking overload: the ThreadPool is ignored; the copy runs inline.
+    void scopy(
+        ThreadPool& /* unused */,
+        const int N,
+        const float * SF_BLAS_RESTRICT X, float * SF_BLAS_RESTRICT Y
+    )
+    {
+        scopy(N, X, Y);
+    }
+
+    // Pool-taking overload: the ThreadPool is ignored; the strided copy runs inline.
+    void scopy(
+        ThreadPool& /* unused */, const int N,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    )
+    {
+        scopy(N, X, incX, Y, incY);
+    }
+
+    // Scales the N contiguous floats at X by alpha, in place.
+    //
+    // When USE_SSE2 is defined the bulk of the array is handled 32 floats
+    // (eight 4-wide vectors) per iteration with unaligned loads and stores,
+    // and the scalar loop at the bottom finishes the remainder. Without
+    // USE_SSE2 the scalar loop processes the whole array. Either way every
+    // element is multiplied by alpha exactly once.
+    void sscal(
+        const int N,
+        const float alpha,
+        float * SF_BLAS_RESTRICT X
+    )
+    {
+        int pos = 0;
+
+#if defined (USE_SSE2)
+
+        // alpha broadcast to all four lanes.
+        const __m128 factor = _mm_set1_ps(alpha);
+
+        // One pass covers X[pos] .. X[pos + 31].
+        for (const int vec_end = N - 31; pos < vec_end; pos += 32)
+        {
+            float* const chunk = X + pos;
+
+            // Load and scale all eight vectors first ...
+            const __m128 v0 = _mm_mul_ps(_mm_loadu_ps(chunk +  0), factor);
+            const __m128 v1 = _mm_mul_ps(_mm_loadu_ps(chunk +  4), factor);
+            const __m128 v2 = _mm_mul_ps(_mm_loadu_ps(chunk +  8), factor);
+            const __m128 v3 = _mm_mul_ps(_mm_loadu_ps(chunk + 12), factor);
+            const __m128 v4 = _mm_mul_ps(_mm_loadu_ps(chunk + 16), factor);
+            const __m128 v5 = _mm_mul_ps(_mm_loadu_ps(chunk + 20), factor);
+            const __m128 v6 = _mm_mul_ps(_mm_loadu_ps(chunk + 24), factor);
+            const __m128 v7 = _mm_mul_ps(_mm_loadu_ps(chunk + 28), factor);
+
+            // ... then write them back to where they came from.
+            _mm_storeu_ps(chunk +  0, v0);
+            _mm_storeu_ps(chunk +  4, v1);
+            _mm_storeu_ps(chunk +  8, v2);
+            _mm_storeu_ps(chunk + 12, v3);
+            _mm_storeu_ps(chunk + 16, v4);
+            _mm_storeu_ps(chunk + 20, v5);
+            _mm_storeu_ps(chunk + 24, v6);
+            _mm_storeu_ps(chunk + 28, v7);
+        }
+
+#endif
+
+        // Scalar path: the tail after the vector loop, or everything when
+        // the vector loop is not compiled in.
+        for (; pos < N; ++pos)
+        {
+            X[pos] *= alpha;
+        }
+    }
+
+    // Strided in-place scale: X[i * incX] *= alpha for every i in [0, N).
+    void sscal(
+        const int N,
+        const float alpha,
+        float * SF_BLAS_RESTRICT X, const int incX
+    )
+    {
+        // Contiguous data goes through the unit-stride overload.
+        if (incX == 1)
+        {
+            sscal(N, alpha, X);
+            return;
+        }
+
+        for (int left = N; left > 0; --left, X += incX)
+        {
+            *X *= alpha;
+        }
+    }
+
+    // Pool-taking overload: the ThreadPool is ignored; scaling runs inline.
+    void sscal(
+        ThreadPool& /* unused */, const int N,
+        const float alpha,
+        float * SF_BLAS_RESTRICT X
+    )
+    {
+        sscal(N, alpha, X);
+    }
+
+    // Pool-taking overload: the ThreadPool is ignored; strided scaling runs inline.
+    void sscal(
+        ThreadPool& /* unused */, const int N,
+        const float alpha,
+        float *X, const int incX
+    )
+    {
+        sscal(N, alpha, X, incX);
+    }
+
+    // Contiguous AXPY: Y[i] += X[i] * alpha for every i in [0, N).
+    void saxpy(
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    )
+    {
+        // alpha == 1 is special-cased so that its loop is a plain add with
+        // no multiplication at all.
+        if (alpha == 1.0f)
+        {
+            for (int k = 0; k < N; ++k)
+                Y[k] += X[k];
+
+            return;
+        }
+
+        for (int k = 0; k < N; ++k)
+        {
+            Y[k] += X[k] * alpha;
+        }
+    }
+
+    // Strided AXPY: Y[i * incY] += X[i * incX] * alpha for every i in [0, N).
+    void saxpy(
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    )
+    {
+        // Unit strides on both vectors: use the contiguous overload.
+        if (incX == 1 && incY == 1)
+        {
+            saxpy(N, alpha, X, Y);
+            return;
+        }
+
+        // Unlike the contiguous overload, alpha == 1 gets no fast path here.
+        for (int left = N; left > 0; --left, Y += incY, X += incX)
+        {
+            *Y += *X * alpha;
+        }
+    }
+
+    // Pool-taking overload: the ThreadPool is ignored; AXPY runs inline.
+    void saxpy(
+        ThreadPool& /* unused */, const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    )
+    {
+        saxpy(N, alpha, X, Y);
+    }
+
+    // Pool-taking overload: the ThreadPool is ignored; strided AXPY runs inline.
+    void saxpy(
+        ThreadPool& /* unused */, const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    )
+    {
+        saxpy(N, alpha, X, incX, Y, incY);
+    }
+
+#if defined (USE_SSE3)
+    // Returns { hsum(a), hsum(b), hsum(c), hsum(d) }, where hsum adds up the
+    // four lanes of its argument.
+    inline __m128 m128_hadd_ps(__m128 a, __m128 b, __m128 c, __m128 d)
+    {
+        return _mm_hadd_ps(_mm_hadd_ps(a, b), _mm_hadd_ps(c, d));
+    }
+#endif
+
+#if defined (USE_SSE2)
+
+    // Transposes one 4x4 tile: B[c * ldb + r] = A[r * lda + c] for r, c in [0, 4).
+    inline void transpose4x4_sse2(
+        const float* SF_BLAS_RESTRICT A, const int lda,
+        float* SF_BLAS_RESTRICT B, const int ldb
+    )
+    {
+        __m128 r0 = _mm_loadu_ps(A);
+        __m128 r1 = _mm_loadu_ps(A + lda);
+        __m128 r2 = _mm_loadu_ps(A + 2 * lda);
+        __m128 r3 = _mm_loadu_ps(A + 3 * lda);
+        // In-register transpose; afterwards rN holds source column N.
+        _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
+        _mm_storeu_ps(B, r0);
+        _mm_storeu_ps(B + ldb, r1);
+        _mm_storeu_ps(B + 2 * ldb, r2);
+        _mm_storeu_ps(B + 3 * ldb, r3);
+    }
+
+    // Writes the transpose of the N x M row-major matrix A (row stride lda)
+    // into B (row stride ldb): B[m * ldb + n] = A[n * lda + m].
+    // The matrix is walked in 16x16 blocks; inside a block, full 4x4 tiles
+    // use the SSE kernel and the ragged edges are copied element by element.
+    void transpose_sse2(
+        const int N, const int M,
+        const float* SF_BLAS_RESTRICT A, const int lda,
+        float* SF_BLAS_RESTRICT B, const int ldb
+    )
+    {
+        static constexpr int tile = 16;
+
+        for (int row0 = 0; row0 < N; row0 += tile)
+        {
+            for (int col0 = 0; col0 < M; col0 += tile)
+            {
+                const int row_end = std::min(row0 + tile, N);
+                const int col_end = std::min(col0 + tile, M);
+
+                // Bands of four complete source rows.
+                int r = row0;
+                for (; r < row_end - 3; r += 4)
+                {
+                    int c = col0;
+                    for (; c < col_end - 3; c += 4)
+                    {
+                        transpose4x4_sse2(&A[r * lda + c], lda, &B[c * ldb + r], ldb);
+                    }
+
+                    // Columns that do not fill a 4x4 tile: copy a 4-row strip each.
+                    for (; c < col_end; ++c)
+                    {
+                        for (int d = 0; d < 4; ++d)
+                            B[c * ldb + r + d] = A[(r + d) * lda + c];
+                    }
+                }
+
+                // Fewer than four rows left in this block: plain copy.
+                for (; r < row_end; ++r)
+                {
+                    for (int c = col0; c < col_end; ++c)
+                        B[c * ldb + r] = A[r * lda + c];
+                }
+            }
+        }
+    }
+#endif
+
+    // Writes the transpose of the N x M row-major matrix A (row stride lda)
+    // into B (row stride ldb), i.e. B[c * ldb + r] = A[r * lda + c].
+    // Dispatches at compile time: the tiled SSE routine when USE_SSE2 is
+    // defined, a straightforward double loop otherwise.
+    void transpose(
+        const int N, const int M,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        float* SF_BLAS_RESTRICT B, const int ldb
+    )
+    {
+#if defined (USE_SSE2)
+
+        transpose_sse2(N, M, A, lda, B, ldb);
+
+#else
+
+        for (int src_row = 0; src_row < N; ++src_row)
+        {
+            for (int src_col = 0; src_col < M; ++src_col)
+            {
+                B[src_col * ldb + src_row] = A[src_row * lda + src_col];
+            }
+        }
+
+#endif
+    }
+
+    void sgemm_row_major_transpose_right(
+        ThreadPool& thread_pool,
+        const int M, const int N, const int K, // A is read as M x K, B as N x K, C as M x N
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    )
+    {
+        // Row-major update C[m][n] = beta * C[m][n] + alpha * sum_k A[m][k] * B[n][k], split across thread_pool.
+#if defined(USE_SSE3)
+
+        const __m128 alpha4 = _mm_set1_ps(alpha);
+        const __m128 beta4 = _mm_set1_ps(beta);
+
+        std::atomic<int> m_atomic = 0; // next unclaimed (even) row index of C
+        thread_pool.execute_with_workers( // NOTE(review): presumably runs the lambda on the pool's workers - confirm in thread.h
+            [
+                M, N, K,
+                alpha, alpha4,
+                A, lda,
+                B, ldb,
+                beta, beta4,
+                C, ldc,
+                &m_atomic
+            ](Thread&) {
+                for (;;)
+                {
+                    const int m = m_atomic.fetch_add(2); // claim rows m and m + 1
+                    if (m >= M - 1)
+                        break;
+                    // Four columns of C at a time while at least four remain.
+                    int n = 0;
+                    for (; n < N - 3; n += 4)
+                    {
+                        // sumRC: R = row offset within the pair (0/1), C = column offset (0..3).
+                        __m128 sum00 = _mm_setzero_ps();
+                        __m128 sum01 = _mm_setzero_ps();
+                        __m128 sum02 = _mm_setzero_ps();
+                        __m128 sum03 = _mm_setzero_ps();
+                        __m128 sum10 = _mm_setzero_ps();
+                        __m128 sum11 = _mm_setzero_ps();
+                        __m128 sum12 = _mm_setzero_ps();
+                        __m128 sum13 = _mm_setzero_ps();
+
+                        // Horizontal sum of elements in sum[m][n] corresponds to
+                        // the final element in the C.
+
+                        int k = 0;
+                        for (; k < K - 3; k += 4)
+                        {
+                            const __m128 a0 = _mm_loadu_ps(&A[(m+0)*lda+k+0]);
+                            const __m128 a1 = _mm_loadu_ps(&A[(m+1)*lda+k+0]);
+
+                            const __m128 b0 = _mm_loadu_ps(&B[(n+0)*ldb+k+0]);
+                            const __m128 b1 = _mm_loadu_ps(&B[(n+1)*ldb+k+0]);
+                            const __m128 b2 = _mm_loadu_ps(&B[(n+2)*ldb+k+0]);
+                            const __m128 b3 = _mm_loadu_ps(&B[(n+3)*ldb+k+0]);
+
+                            sum00 = _mm_add_ps(sum00, _mm_mul_ps(a0, b0));
+                            sum01 = _mm_add_ps(sum01, _mm_mul_ps(a0, b1));
+                            sum02 = _mm_add_ps(sum02, _mm_mul_ps(a0, b2));
+                            sum03 = _mm_add_ps(sum03, _mm_mul_ps(a0, b3));
+                            sum10 = _mm_add_ps(sum10, _mm_mul_ps(a1, b0));
+                            sum11 = _mm_add_ps(sum11, _mm_mul_ps(a1, b1));
+                            sum12 = _mm_add_ps(sum12, _mm_mul_ps(a1, b2));
+                            sum13 = _mm_add_ps(sum13, _mm_mul_ps(a1, b3));
+                        }
+                        // Scalar tail for the k values left over when K is not a multiple of 4.
+                        for(; k < K; k += 1)
+                        {
+                            const float a0 = A[(m+0)*lda+k+0];
+                            const float a1 = A[(m+1)*lda+k+0];
+
+                            const float b0 = B[(n+0)*ldb+k+0];
+                            const float b1 = B[(n+1)*ldb+k+0];
+                            const float b2 = B[(n+2)*ldb+k+0];
+                            const float b3 = B[(n+3)*ldb+k+0];
+
+                            // All four lanes of each accumulator are added together by
+                            // the horizontal sum below, so a tail product can simply be
+                            // added into lane 0; the other lanes are left unmodified.
+                            sum00 = _mm_add_ss(sum00, _mm_set_ss(a0 * b0));
+                            sum01 = _mm_add_ss(sum01, _mm_set_ss(a0 * b1));
+                            sum02 = _mm_add_ss(sum02, _mm_set_ss(a0 * b2));
+                            sum03 = _mm_add_ss(sum03, _mm_set_ss(a0 * b3));
+                            sum10 = _mm_add_ss(sum10, _mm_set_ss(a1 * b0));
+                            sum11 = _mm_add_ss(sum11, _mm_set_ss(a1 * b1));
+                            sum12 = _mm_add_ss(sum12, _mm_set_ss(a1 * b2));
+                            sum13 = _mm_add_ss(sum13, _mm_set_ss(a1 * b3));
+                        }
+                        // Collapse every accumulator to one float: s0/s1 hold the four dot products of rows m and m + 1.
+                        __m128 s0 = m128_hadd_ps(sum00, sum01, sum02, sum03);
+                        __m128 s1 = m128_hadd_ps(sum10, sum11, sum12, sum13);
+                        s0 = _mm_mul_ps(s0, alpha4);
+                        s1 = _mm_mul_ps(s1, alpha4);
+                        // Blend with the existing values of C: c = c * beta + s (s is already scaled by alpha).
+                        __m128 c0 = _mm_loadu_ps(&C[(m+0)*ldc+(n+0)]);
+                        __m128 c1 = _mm_loadu_ps(&C[(m+1)*ldc+(n+0)]);
+                        c0 = _mm_mul_ps(c0, beta4);
+                        c1 = _mm_mul_ps(c1, beta4);
+
+                        c0 = _mm_add_ps(c0, s0);
+                        c1 = _mm_add_ps(c1, s1);
+
+                        _mm_storeu_ps(&C[(m+0)*ldc+(n+0)], c0);
+                        _mm_storeu_ps(&C[(m+1)*ldc+(n+0)], c1);
+                    }
+                    // Columns left over when N is not a multiple of 4, still for both rows, in scalar code.
+                    for(; n < N; n += 1)
+                    {
+                        float sum0 = 0.0f;
+                        float sum1 = 0.0f;
+
+                        for (int k = 0; k < K; ++k)
+                        {
+                            const float a0 = A[(m+0)*lda+k+0];
+                            const float a1 = A[(m+1)*lda+k+0];
+
+                            const float b0 = B[(n+0)*ldb+k+0];
+
+                            sum0 += a0 * b0;
+                            sum1 += a1 * b0;
+                        }
+
+                        C[(m+0)*ldc+(n+0)] = C[(m+0)*ldc+(n+0)] * beta + sum0 * alpha;
+                        C[(m+1)*ldc+(n+0)] = C[(m+1)*ldc+(n+0)] * beta + sum1 * alpha;
+                    }
+                }
+            }
+        );
+        // When M is odd the last row is never claimed by the lambda above; it is computed here by the caller.
+        int m = M - (M % 2);
+        for (; m < M; m += 1)
+        {
+            for (int n = 0; n < N; n += 1)
+            {
+                float sum = 0.0f;
+
+                for (int k = 0; k < K; k += 1)
+                {
+                    sum += A[m*lda + k] * B[n*ldb + k];
+                }
+
+                C[m*ldc + n] = C[m*ldc + n] * beta + sum * alpha;
+            }
+        }
+
+        thread_pool.wait_for_workers_finished(); // NOTE(review): presumably blocks until the workers are done; m_atomic is captured by reference and must stay alive until then
+
+#else
+        // Fallback without SSE3. NOTE(review): presumably the lambda is invoked once per m in [0, M) - confirm in thread.h
+        thread_pool.for_each_index_with_workers(
+            0, M,
+            [&](Thread&, int m) {
+                for (int n = 0; n < N; n += 1)
+                {
+                    float sum = 0.0f;
+
+                    for (int k = 0; k < K; k += 1)
+                    {
+                        sum += A[m*lda + k] * B[n*ldb + k];
+                    }
+
+                    C[m*ldc + n] = C[m*ldc + n] * beta + sum * alpha;
+                }
+            }
+        );
+        thread_pool.wait_for_workers_finished();
+
+#endif
+    }
+
+    void sgemm_row_major_transpose_right(
+        const int M, const int N, const int K, // A is read as M x K, B as N x K, C as M x N
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    )
+    {
+        // Row-major update C[m][n] = beta * C[m][n] + alpha * sum_k A[m][k] * B[n][k], computed on the calling thread.
+#if defined(USE_SSE3)
+
+        const __m128 alpha4 = _mm_set1_ps(alpha);
+        const __m128 beta4 = _mm_set1_ps(beta);
+        // Main loop: two rows of C per iteration (rows m and m + 1).
+        int m = 0;
+        for (; m < M - 1; m += 2)
+        {
+            int n = 0;
+            for (; n < N - 3; n += 4)
+            {
+                // sumRC: R = row offset within the pair (0/1), C = column offset (0..3).
+                __m128 sum00 = _mm_setzero_ps();
+                __m128 sum01 = _mm_setzero_ps();
+                __m128 sum02 = _mm_setzero_ps();
+                __m128 sum03 = _mm_setzero_ps();
+                __m128 sum10 = _mm_setzero_ps();
+                __m128 sum11 = _mm_setzero_ps();
+                __m128 sum12 = _mm_setzero_ps();
+                __m128 sum13 = _mm_setzero_ps();
+
+                // Horizontal sum of elements in sum[m][n] corresponds to
+                // the final element in the C.
+
+                int k = 0;
+                for (; k < K - 3; k += 4)
+                {
+                    const __m128 a0 = _mm_loadu_ps(&A[(m+0)*lda+k+0]);
+                    const __m128 a1 = _mm_loadu_ps(&A[(m+1)*lda+k+0]);
+
+                    const __m128 b0 = _mm_loadu_ps(&B[(n+0)*ldb+k+0]);
+                    const __m128 b1 = _mm_loadu_ps(&B[(n+1)*ldb+k+0]);
+                    const __m128 b2 = _mm_loadu_ps(&B[(n+2)*ldb+k+0]);
+                    const __m128 b3 = _mm_loadu_ps(&B[(n+3)*ldb+k+0]);
+
+                    sum00 = _mm_add_ps(sum00, _mm_mul_ps(a0, b0));
+                    sum01 = _mm_add_ps(sum01, _mm_mul_ps(a0, b1));
+                    sum02 = _mm_add_ps(sum02, _mm_mul_ps(a0, b2));
+                    sum03 = _mm_add_ps(sum03, _mm_mul_ps(a0, b3));
+                    sum10 = _mm_add_ps(sum10, _mm_mul_ps(a1, b0));
+                    sum11 = _mm_add_ps(sum11, _mm_mul_ps(a1, b1));
+                    sum12 = _mm_add_ps(sum12, _mm_mul_ps(a1, b2));
+                    sum13 = _mm_add_ps(sum13, _mm_mul_ps(a1, b3));
+                }
+                // Scalar tail for the k values left over when K is not a multiple of 4.
+                for(; k < K; k += 1)
+                {
+                    const float a0 = A[(m+0)*lda+k+0];
+                    const float a1 = A[(m+1)*lda+k+0];
+
+                    const float b0 = B[(n+0)*ldb+k+0];
+                    const float b1 = B[(n+1)*ldb+k+0];
+                    const float b2 = B[(n+2)*ldb+k+0];
+                    const float b3 = B[(n+3)*ldb+k+0];
+
+                    // All four lanes of each accumulator are added together by
+                    // the horizontal sum below, so a tail product can simply be
+                    // added into lane 0; the other lanes are left unmodified.
+                    sum00 = _mm_add_ss(sum00, _mm_set_ss(a0 * b0));
+                    sum01 = _mm_add_ss(sum01, _mm_set_ss(a0 * b1));
+                    sum02 = _mm_add_ss(sum02, _mm_set_ss(a0 * b2));
+                    sum03 = _mm_add_ss(sum03, _mm_set_ss(a0 * b3));
+                    sum10 = _mm_add_ss(sum10, _mm_set_ss(a1 * b0));
+                    sum11 = _mm_add_ss(sum11, _mm_set_ss(a1 * b1));
+                    sum12 = _mm_add_ss(sum12, _mm_set_ss(a1 * b2));
+                    sum13 = _mm_add_ss(sum13, _mm_set_ss(a1 * b3));
+                }
+                // Collapse every accumulator to one float: s0/s1 hold the four dot products of rows m and m + 1.
+                __m128 s0 = m128_hadd_ps(sum00, sum01, sum02, sum03);
+                __m128 s1 = m128_hadd_ps(sum10, sum11, sum12, sum13);
+                s0 = _mm_mul_ps(s0, alpha4);
+                s1 = _mm_mul_ps(s1, alpha4);
+                // Blend with the existing values of C: c = c * beta + s (s is already scaled by alpha).
+                __m128 c0 = _mm_loadu_ps(&C[(m+0)*ldc+(n+0)]);
+                __m128 c1 = _mm_loadu_ps(&C[(m+1)*ldc+(n+0)]);
+                c0 = _mm_mul_ps(c0, beta4);
+                c1 = _mm_mul_ps(c1, beta4);
+
+                c0 = _mm_add_ps(c0, s0);
+                c1 = _mm_add_ps(c1, s1);
+
+                _mm_storeu_ps(&C[(m+0)*ldc+(n+0)], c0);
+                _mm_storeu_ps(&C[(m+1)*ldc+(n+0)], c1);
+            }
+            // Columns left over when N is not a multiple of 4, still for both rows, in scalar code.
+            for(; n < N; n += 1)
+            {
+                float sum0 = 0.0f;
+                float sum1 = 0.0f;
+
+                for (int k = 0; k < K; ++k)
+                {
+                    const float a0 = A[(m+0)*lda+k+0];
+                    const float a1 = A[(m+1)*lda+k+0];
+
+                    const float b0 = B[(n+0)*ldb+k+0];
+
+                    sum0 += a0 * b0;
+                    sum1 += a1 * b0;
+                }
+
+                C[(m+0)*ldc+(n+0)] = C[(m+0)*ldc+(n+0)] * beta + sum0 * alpha;
+                C[(m+1)*ldc+(n+0)] = C[(m+1)*ldc+(n+0)] * beta + sum1 * alpha;
+            }
+        }
+        // When M is odd one row is left after the paired loop; m already points at it.
+        for (; m < M; m += 1)
+        {
+            for (int n = 0; n < N; n += 1)
+            {
+                float sum = 0.0f;
+
+                for (int k = 0; k < K; k += 1)
+                {
+                    sum += A[m*lda + k] * B[n*ldb + k];
+                }
+
+                C[m*ldc + n] = C[m*ldc + n] * beta + sum * alpha;
+            }
+        }
+
+#else
+        // Fallback without SSE3: one scalar dot product per element of C.
+        for (int m = 0; m < M; m += 1)
+        {
+            for (int n = 0; n < N; n += 1)
+            {
+                float sum = 0.0f;
+
+                for (int k = 0; k < K; k += 1)
+                {
+                    sum += A[m*lda + k] * B[n*ldb + k];
+                }
+
+                C[m*ldc + n] = C[m*ldc + n] * beta + sum * alpha;
+            }
+        }
+
+#endif
+    }
+
+    // Returns a per-thread scratch buffer of at least requested_size floats
+    // taken from slot idx (slots 0 and 1 exist; idx is not range-checked).
+    // A slot is only reallocated when a request exceeds its current size,
+    // so the returned pointer is valid until the next call to this function
+    // from the same thread with the same idx.
+    // This is an unsafe function: use it with caution and only within this
+    // translation unit. There are just enough slots for the functions here.
+    float* get_thread_local_temporary_storage(int requested_size, int idx)
+    {
+        static constexpr int MAX_NUM_BUFFERS = 2;
+
+        static thread_local std::unique_ptr<float[]> buffers[MAX_NUM_BUFFERS];
+        static thread_local int capacities[MAX_NUM_BUFFERS] = {0};
+
+        std::unique_ptr<float[]>& buffer = buffers[idx];
+        int& capacity = capacities[idx];
+        if (capacity < requested_size)
+        {
+            buffer = std::make_unique<float[]>(requested_size);
+            capacity = requested_size;
+        }
+
+        return buffer.get();
+    }
+
+    // Row-major C = alpha * A * B + beta * C with neither operand transposed
+    // (A is M x K, B is K x N). B is copied, transposed, into thread-local
+    // scratch slot 1 and the product is then delegated to the thread-pool
+    // overload of sgemm_row_major_transpose_right.
+    void sgemm_row_major_transpose_none(
+        ThreadPool& thread_pool,
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    )
+    {
+        // sgemm_row_major keeps a transposed A in slot 0, so B uses slot 1.
+        constexpr static int scratch_slot = 1;
+
+        // NOTE(review): transpose() is defined outside this block; it is
+        // expected to store B^T as an N x K matrix with leading dimension K.
+        const int ldb_tr = K;
+        float* const B_tr = get_thread_local_temporary_storage(K * N, scratch_slot);
+        transpose(K, N, B, ldb, B_tr, ldb_tr);
+
+        sgemm_row_major_transpose_right(
+            thread_pool,
+            M, N, K,
+            alpha, A, lda, B_tr, ldb_tr,
+            beta, C, ldc
+        );
+    }
+
+    // Row-major C = alpha * A * B + beta * C with neither operand transposed
+    // (A is M x K, B is K x N), without a thread pool. B is copied,
+    // transposed, into thread-local scratch slot 1 and the product is then
+    // delegated to sgemm_row_major_transpose_right.
+    void sgemm_row_major_transpose_none(
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    )
+    {
+        // sgemm_row_major keeps a transposed A in slot 0, so B uses slot 1.
+        constexpr static int scratch_slot = 1;
+
+        // NOTE(review): transpose() is defined outside this block; it is
+        // expected to store B^T as an N x K matrix with leading dimension K.
+        const int ldb_tr = K;
+        float* const B_tr = get_thread_local_temporary_storage(K * N, scratch_slot);
+        transpose(K, N, B, ldb, B_tr, ldb_tr);
+
+        sgemm_row_major_transpose_right(
+            M, N, K,
+            alpha, A, lda, B_tr, ldb_tr,
+            beta, C, ldc
+        );
+    }
+
+    // Row-major SGEMM dispatcher, thread-pool overload:
+    //     C = alpha * op(A) * op(B) + beta * C
+    // where op(X) is X or its transpose as selected by TransA / TransB, so
+    // that op(A) is M x K and op(B) is K x N.
+    // A transposed A is first copied into thread-local scratch slot 0 with
+    // leading dimension K. A transposed B is passed through unchanged to
+    // sgemm_row_major_transpose_right. Flag combinations other than the
+    // three tested below end up on the "no transpositions" path.
+    void sgemm_row_major(
+        ThreadPool& thread_pool,
+        MatrixTranspose TransA, MatrixTranspose TransB,
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    )
+    {
+        constexpr static int scratch_slot = 0;
+
+        const bool a_transposed = (TransA == MatrixTranspose::Trans);
+        const bool b_transposed = (TransB == MatrixTranspose::Trans);
+        const bool a_plain = (TransA == MatrixTranspose::NoTrans);
+        const bool b_plain = (TransB == MatrixTranspose::NoTrans);
+
+        // Transposes A into the scratch slot (destination leading dimension
+        // K) and returns the scratch pointer.
+        const auto make_transposed_a = [&]() -> const float*
+        {
+            float* const A_tr = get_thread_local_temporary_storage(K * M, scratch_slot);
+            transpose(K, M, A, lda, A_tr, K);
+            return A_tr;
+        };
+
+        if (a_transposed && b_transposed)
+        {
+            const float* const A_tr = make_transposed_a();
+            sgemm_row_major_transpose_right(
+                thread_pool,
+                M, N, K,
+                alpha, A_tr, K, B, ldb,
+                beta, C, ldc
+            );
+            return;
+        }
+
+        if (a_plain && b_transposed)
+        {
+            sgemm_row_major_transpose_right(
+                thread_pool,
+                M, N, K,
+                alpha, A, lda, B, ldb,
+                beta, C, ldc
+            );
+            return;
+        }
+
+        if (a_transposed && b_plain)
+        {
+            const float* const A_tr = make_transposed_a();
+            sgemm_row_major_transpose_none(
+                thread_pool,
+                M, N, K,
+                alpha, A_tr, K, B, ldb,
+                beta, C, ldc
+            );
+            return;
+        }
+
+        // No transpositions (and every combination not matched above).
+        sgemm_row_major_transpose_none(
+            thread_pool,
+            M, N, K,
+            alpha, A, lda, B, ldb,
+            beta, C, ldc
+        );
+    }
+
+    // Row-major SGEMM dispatcher, overload without a thread pool:
+    //     C = alpha * op(A) * op(B) + beta * C
+    // where op(X) is X or its transpose as selected by TransA / TransB, so
+    // that op(A) is M x K and op(B) is K x N.
+    // A transposed A is first copied into thread-local scratch slot 0 with
+    // leading dimension K. A transposed B is passed through unchanged to
+    // sgemm_row_major_transpose_right. Flag combinations other than the
+    // three tested below end up on the "no transpositions" path.
+    void sgemm_row_major(
+        MatrixTranspose TransA, MatrixTranspose TransB,
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    )
+    {
+        constexpr static int scratch_slot = 0;
+
+        const bool a_transposed = (TransA == MatrixTranspose::Trans);
+        const bool b_transposed = (TransB == MatrixTranspose::Trans);
+        const bool a_plain = (TransA == MatrixTranspose::NoTrans);
+        const bool b_plain = (TransB == MatrixTranspose::NoTrans);
+
+        // Transposes A into the scratch slot (destination leading dimension
+        // K) and returns the scratch pointer.
+        const auto make_transposed_a = [&]() -> const float*
+        {
+            float* const A_tr = get_thread_local_temporary_storage(K * M, scratch_slot);
+            transpose(K, M, A, lda, A_tr, K);
+            return A_tr;
+        };
+
+        if (a_transposed && b_transposed)
+        {
+            const float* const A_tr = make_transposed_a();
+            sgemm_row_major_transpose_right(
+                M, N, K,
+                alpha, A_tr, K, B, ldb,
+                beta, C, ldc
+            );
+            return;
+        }
+
+        if (a_plain && b_transposed)
+        {
+            sgemm_row_major_transpose_right(
+                M, N, K,
+                alpha, A, lda, B, ldb,
+                beta, C, ldc
+            );
+            return;
+        }
+
+        if (a_transposed && b_plain)
+        {
+            const float* const A_tr = make_transposed_a();
+            sgemm_row_major_transpose_none(
+                M, N, K,
+                alpha, A_tr, K, B, ldb,
+                beta, C, ldc
+            );
+            return;
+        }
+
+        // No transpositions (and every combination not matched above).
+        sgemm_row_major_transpose_none(
+            M, N, K,
+            alpha, A, lda, B, ldb,
+            beta, C, ldc
+        );
+    }
+
+    // BLAS-style SGEMM entry point, thread-pool overload:
+    //     C = alpha * op(A) * op(B) + beta * C
+    // Row-major input is forwarded unchanged. Any other layout is handled as
+    // column-major by computing the transposed product with the row-major
+    // routine: a column-major matrix is the row-major image of its transpose,
+    // so C^T = op(B)^T * op(A)^T is obtained by swapping A with B (together
+    // with their transpose flags and leading dimensions) and M with N.
+    void sgemm(
+        ThreadPool& thread_pool,
+        MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    )
+    {
+        if (layout != MatrixLayout::RowMajor)
+        {
+            sgemm_row_major(
+                thread_pool,
+                TransB, TransA,
+                N, M, K,
+                alpha, B, ldb, A, lda,
+                beta, C, ldc
+            );
+            return;
+        }
+
+        sgemm_row_major(
+            thread_pool,
+            TransA, TransB,
+            M, N, K,
+            alpha, A, lda, B, ldb,
+            beta, C, ldc
+        );
+    }
+
+
+    // BLAS-style SGEMM entry point, overload without a thread pool:
+    //     C = alpha * op(A) * op(B) + beta * C
+    // Row-major input is forwarded unchanged. Any other layout is handled as
+    // column-major by computing the transposed product with the row-major
+    // routine: a column-major matrix is the row-major image of its transpose,
+    // so C^T = op(B)^T * op(A)^T is obtained by swapping A with B (together
+    // with their transpose flags and leading dimensions) and M with N.
+    void sgemm(
+        MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    )
+    {
+        if (layout != MatrixLayout::RowMajor)
+        {
+            sgemm_row_major(
+                TransB, TransA,
+                N, M, K,
+                alpha, B, ldb, A, lda,
+                beta, C, ldc
+            );
+            return;
+        }
+
+        sgemm_row_major(
+            TransA, TransB,
+            M, N, K,
+            alpha, A, lda, B, ldb,
+            beta, C, ldc
+        );
+    }
+
+    // Returns rows * cols floats drawn uniformly from [-1, 1).
+    // The engine is default-constructed (fixed seed), so every call
+    // produces the same sequence of values.
+    std::vector<float> generate_random_matrix(int rows, int cols)
+    {
+        std::mt19937_64 engine;
+        std::uniform_real_distribution<float> unit_range(-1.0, 1.0);
+
+        std::vector<float> values(rows * cols);
+        for (size_t i = 0; i < values.size(); ++i)
+            values[i] = unit_range(engine);
+
+        return values;
+    }
+
+    std::vector<float> generate_zero_matrix(int rows, int cols)
+    {
+        return std::vector<float>(rows * cols); // value-initialised: all 0.0f
+    }
+
+    // Returns sum|ref[i] - our[i]| / sum|ref[i]| over all elements of ref;
+    // our must hold at least ref.size() elements. An exact match yields 0
+    // even when ref is empty or all zeros (the bare quotient would be NaN).
+    float matrix_relative_error(
+        const std::vector<float>& ref, const std::vector<float>& our
+    )
+    {
+        double ref_sum = 0.0, diff_sum = 0.0;
+        for (size_t i = 0; i < ref.size(); ++i)
+        {
+            ref_sum += std::abs(ref[i]);
+            diff_sum += std::abs(ref[i] - our[i]);
+        }
+
+        return diff_sum == 0.0 ? 0.0f : static_cast<float>(diff_sum / ref_sum);
+    }
+
+    // Euclidean norm of v. Each square is formed in float, the running
+    // total is kept in double and the root is narrowed to float on return.
+    float norm(const std::vector<float>& v)
+    {
+        double squares_total = 0.0;
+        for (size_t i = 0; i < v.size(); ++i)
+        {
+            const float element = v[i];
+            squares_total += element * element;
+        }
+
+        return std::sqrt(squares_total);
+    }
+
+#if defined (USE_BLAS)
+
+    CBLAS_LAYOUT matrix_layout_to_blas_layout(MatrixLayout layout)
+    {
+        switch (layout) // anything unrecognised maps to CBLAS_LAYOUT(-1)
+        {
+            case MatrixLayout::RowMajor: return CblasRowMajor;
+            case MatrixLayout::ColMajor: return CblasColMajor;
+            default: return static_cast<CBLAS_LAYOUT>(-1);
+        }
+    }
+
+    // Name of a MatrixLayout value as printed by the SGEMM test and
+    // benchmark output: "RowMajor" or "ColMajor", and "INVALID" for any
+    // other value.
+    const char* matrix_layout_to_string(MatrixLayout layout)
+    {
+        return layout == MatrixLayout::RowMajor ? "RowMajor"
+             : layout == MatrixLayout::ColMajor ? "ColMajor"
+             : "INVALID";
+    }
+
+    CBLAS_TRANSPOSE matrix_transpose_to_blas_transpose(MatrixTranspose tr)
+    {
+        switch (tr) // anything unrecognised maps to CBLAS_TRANSPOSE(-1)
+        {
+            case MatrixTranspose::NoTrans: return CblasNoTrans;
+            case MatrixTranspose::Trans: return CblasTrans;
+            default: return static_cast<CBLAS_TRANSPOSE>(-1);
+        }
+    }
+
+    // Name of a MatrixTranspose value as printed by the SGEMM test and
+    // benchmark output: "NoTrans" or "Trans", and "INVALID" for any other
+    // value.
+    const char* matrix_transpose_to_string(MatrixTranspose tr)
+    {
+        return tr == MatrixTranspose::NoTrans ? "NoTrans"
+             : tr == MatrixTranspose::Trans ? "Trans"
+             : "INVALID";
+    }
+
+    // Runs one SGEMM configuration through cblas_sgemm and through sgemm
+    // (alpha = beta = 1) from identical starting C matrices and prints the
+    // operand norms, both result norms and their relative error. Buffers
+    // hold twice the logical extent per dimension; lda/ldb/ldc match that.
+    void test_sgemm(
+        ThreadPool& thread_pool,
+        MatrixLayout layout, MatrixTranspose trA, MatrixTranspose trB,
+        int M, int N, int K
+    )
+    {
+        const bool row_major = (layout == MatrixLayout::RowMajor);
+        const bool a_plain = (trA == MatrixTranspose::NoTrans);
+        const bool b_plain = (trB == MatrixTranspose::NoTrans);
+
+        const std::vector<float> A = generate_random_matrix(M * 2, K * 2);
+        const std::vector<float> B = generate_random_matrix(K * 2, N * 2);
+        std::vector<float> C_ref = generate_random_matrix(M * 2, N * 2);
+        std::vector<float> C_our = C_ref;
+
+        std::cout << matrix_layout_to_string(layout) << ' ';
+        std::cout << matrix_transpose_to_string(trA) << ' ';
+        std::cout << matrix_transpose_to_string(trB) << '\n';
+
+        std::cout << "A norm: " << norm(A) << '\n'
+                  << "B norm: " << norm(B) << '\n'
+                  << "C norm: " << norm(C_ref) << '\n';
+
+        const int lda = 2 * ((a_plain == row_major) ? K : M);
+        const int ldb = 2 * ((b_plain == row_major) ? N : K);
+        const int ldc = 2 * (row_major ? N : M);
+
+        cblas_sgemm(
+            matrix_layout_to_blas_layout(layout),
+            matrix_transpose_to_blas_transpose(trA),
+            matrix_transpose_to_blas_transpose(trB),
+            M, N, K,
+            1.0, A.data(), lda, B.data(), ldb,
+            1.0, C_ref.data(), ldc
+        );
+
+        sgemm(
+            thread_pool,
+            layout, trA, trB,
+            M, N, K,
+            1.0, A.data(), lda, B.data(), ldb,
+            1.0, C_our.data(), ldc
+        );
+
+        std::cout << "C_ref norm: " << norm(C_ref) << '\n';
+        std::cout << "C_our norm: " << norm(C_our) << '\n';
+        std::cout << "Relative error: " << matrix_relative_error(C_ref, C_our) << '\n';
+        std::cout << '\n';
+    }
+
+    // Runs the seven-argument test_sgemm for all eight combinations of
+    // layout, trA and trB on a fixed 57 x 127 x 31 (M x N x K) problem.
+    void test_sgemm(ThreadPool& thread_pool)
+    {
+        constexpr int M = 57;
+        constexpr int N = 127;
+        constexpr int K = 31;
+
+        const MatrixLayout layouts[] = { MatrixLayout::RowMajor, MatrixLayout::ColMajor };
+        const MatrixTranspose transposes[] = { MatrixTranspose::NoTrans, MatrixTranspose::Trans };
+
+        std::cout << "SGEMM test:\n";
+
+        // The layout varies slowest and trB fastest.
+        for (const MatrixLayout layout : layouts)
+        {
+            for (const MatrixTranspose trA : transposes)
+            {
+                for (const MatrixTranspose trB : transposes)
+                {
+                    test_sgemm(thread_pool, layout, trA, trB, M, N, K);
+                }
+            }
+        }
+    }
+
+    // Times num_iters back-to-back calls of cblas_sgemm and of sgemm on the
+    // same operands (alpha = 1, beta = -0.5), each accumulating into its own
+    // copy of C, then prints the operand norms, both result norms, their
+    // relative error and the two total times in nanoseconds.
+    // Timing uses std::chrono::steady_clock because it is monotonic, while
+    // high_resolution_clock may alias the adjustable system clock.
+    void bench_sgemm(
+        ThreadPool& thread_pool,
+        MatrixLayout layout, MatrixTranspose trA, MatrixTranspose trB,
+        int M, int N, int K
+    )
+    {
+        using Clock = std::chrono::steady_clock;
+        constexpr int num_iters = 1000;
+
+        auto A = generate_random_matrix(M * 2, K * 2);
+        auto B = generate_random_matrix(K * 2, N * 2);
+        auto C_ref = generate_random_matrix(M * 2, N * 2);
+        auto C_our = C_ref;
+
+        std::cout
+            << matrix_layout_to_string(layout) << ' '
+            << matrix_transpose_to_string(trA) << ' '
+            << matrix_transpose_to_string(trB) << '\n';
+
+        std::cout << "A norm: " << norm(A) << '\n';
+        std::cout << "B norm: " << norm(B) << '\n';
+        std::cout << "C norm: " << norm(C_ref) << '\n';
+
+        const bool row_major = (layout == MatrixLayout::RowMajor);
+        const int lda = ((trA == MatrixTranspose::NoTrans) == row_major) ? K * 2 : M * 2;
+        const int ldb = ((trB == MatrixTranspose::NoTrans) == row_major) ? N * 2 : K * 2;
+        const int ldc = row_major ? N * 2 : M * 2;
+
+        const auto t0_ref = Clock::now();
+        for (int i = 0; i < num_iters; ++i)
+        {
+            cblas_sgemm(
+                matrix_layout_to_blas_layout(layout),
+                matrix_transpose_to_blas_transpose(trA),
+                matrix_transpose_to_blas_transpose(trB),
+                M, N, K,
+                1.0, A.data(), lda, B.data(), ldb,
+                -0.5, C_ref.data(), ldc
+            );
+        }
+        const auto diff_ref = Clock::now() - t0_ref;
+
+        const auto t0_our = Clock::now();
+        for (int i = 0; i < num_iters; ++i)
+        {
+            sgemm(
+                thread_pool,
+                layout, trA, trB,
+                M, N, K,
+                1.0, A.data(), lda, B.data(), ldb,
+                -0.5, C_our.data(), ldc
+            );
+        }
+        const auto diff_our = Clock::now() - t0_our;
+
+        std::cout << "C_ref norm: " << norm(C_ref) << '\n';
+        std::cout << "C_our norm: " << norm(C_our) << '\n';
+        std::cout << "Relative error: " << matrix_relative_error(C_ref, C_our) << '\n';
+        std::cout << "Ref time: " << std::chrono::duration_cast<std::chrono::nanoseconds>(diff_ref).count() << " [ns]\n";
+        std::cout << "Our time: " << std::chrono::duration_cast<std::chrono::nanoseconds>(diff_our).count() << " [ns]\n";
+
+        std::cout << '\n';
+    }
+
+    // Runs the seven-argument bench_sgemm for all eight combinations of
+    // layout, trA and trB on a fixed 107 x 213 x 57 (M x N x K) problem.
+    void bench_sgemm(ThreadPool& thread_pool)
+    {
+        constexpr int M = 107;
+        constexpr int N = 213;
+        constexpr int K = 57;
+
+        const MatrixLayout layouts[] = { MatrixLayout::RowMajor, MatrixLayout::ColMajor };
+        const MatrixTranspose transposes[] = { MatrixTranspose::NoTrans, MatrixTranspose::Trans };
+
+        std::cout << "SGEMM benchmark:\n";
+
+        // The layout varies slowest and trB fastest.
+        for (const MatrixLayout layout : layouts)
+        {
+            for (const MatrixTranspose trA : transposes)
+            {
+                for (const MatrixTranspose trB : transposes)
+                {
+                    bench_sgemm(thread_pool, layout, trA, trB, M, N, K);
+                }
+            }
+        }
+    }
+
+#endif
+
+    void print_arch() // Reports which SIMD implementation was compiled in.
+    {
+#if !defined (USE_SSE3) && !defined (USE_SSE2)
+        std::cout << "Using the base implementation.\n";
+#elif defined (USE_SSE3)
+        std::cout << "Using the sse3 implementation.\n";
+#else // USE_SSE2 without USE_SSE3
+        std::cout << "Using the sse2 implementation.\n";
+#endif
+    }
+
+    // Self-test entry point: with USE_BLAS it prints the active SIMD
+    // implementation and runs test_sgemm; otherwise it only prints a notice.
+    void test(ThreadPool& thread_pool)
+    {
+#if !defined (USE_BLAS)
+        (void)thread_pool; // unused in this configuration
+        std::cout << "Blas tests are only runnable when USE_BLAS is defined.\n";
+#else
+        print_arch();
+        test_sgemm(thread_pool);
+#endif
+    }
+
+    // Benchmark entry point: with USE_BLAS it prints the active SIMD
+    // implementation and runs bench_sgemm; otherwise it only prints a notice.
+    void bench(ThreadPool& thread_pool)
+    {
+#if !defined (USE_BLAS)
+        (void)thread_pool; // unused in this configuration
+        std::cout << "Blas benchmarks are only runnable when USE_BLAS is defined.\n";
+#else
+        print_arch();
+        bench_sgemm(thread_pool);
+#endif
+    }
+}
\ No newline at end of file
diff --git a/src/extra/stockfish_blas.h b/src/extra/stockfish_blas.h
new file mode 100644
index 00000000..f551bbf2
--- /dev/null
+++ b/src/extra/stockfish_blas.h
@@ -0,0 +1,140 @@
+#ifndef _STOCKFISH_BLAS_H_
+#define _STOCKFISH_BLAS_H_
+
+struct ThreadPool;
+
+#if defined (_MSC_VER) // SF_BLAS_RESTRICT: the compiler's no-alias pointer qualifier
+#define SF_BLAS_RESTRICT __restrict
+#elif defined (__INTEL_COMPILER)
+#define SF_BLAS_RESTRICT restrict
+#elif defined (__clang__) || defined (__GNUC__)
+#define SF_BLAS_RESTRICT __restrict__
+#else // unknown compiler: expand to nothing, dropping the qualifier is always safe
+#define SF_BLAS_RESTRICT
+#endif
+
+namespace Blas {
+
+    // Storage order of the matrices handed to sgemm. The numeric values are
+    // the same as those of CBLAS's CblasRowMajor / CblasColMajor
+    // enumerators.
+    enum struct MatrixLayout { RowMajor = 101, ColMajor = 102 };
+
+    // Whether an sgemm operand is used as stored or transposed. The numeric
+    // values are the same as those of CBLAS's CblasNoTrans / CblasTrans
+    // enumerators.
+    enum struct MatrixTranspose { NoTrans = 111, Trans = 112 };
+
+    // scopy - vector copy, named after the BLAS level-1 routine.
+    // NOTE(review): only declarations are visible in this header. Presumably
+    // Y := X over N elements, incX / incY being the element strides (unit
+    // stride when omitted) and the ThreadPool overloads spreading the work
+    // over the pool's threads - confirm against stockfish_blas.cpp.
+    void scopy(
+        const int N,
+        const float * SF_BLAS_RESTRICT X, float * SF_BLAS_RESTRICT Y
+    );
+
+    void scopy(
+        const int N,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    );
+
+    void scopy(
+        ThreadPool& thread_pool, const int N,
+        const float * SF_BLAS_RESTRICT X, float * SF_BLAS_RESTRICT Y
+    );
+
+    void scopy(
+        ThreadPool& thread_pool, const int N,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    );
+
+    // sscal - in-place scaling, named after the BLAS level-1 routine.
+    // NOTE(review): presumably X := alpha * X over N elements with element
+    // stride incX (unit stride when omitted); the definitions are not
+    // visible in this header - confirm against stockfish_blas.cpp.
+    void sscal(
+        const int N, const float alpha,
+        float * SF_BLAS_RESTRICT X
+    );
+
+    void sscal(
+        const int N, const float alpha,
+        float * SF_BLAS_RESTRICT X, const int incX
+    );
+
+    void sscal(
+        ThreadPool& thread_pool, const int N, const float alpha,
+        float * SF_BLAS_RESTRICT X
+    );
+
+    void sscal(
+        ThreadPool& thread_pool, const int N, const float alpha,
+        float * SF_BLAS_RESTRICT X, const int incX
+    );
+
+    // saxpy - scaled vector addition, named after the BLAS level-1 routine.
+    // NOTE(review): presumably Y := alpha * X + Y over N elements, with
+    // incX / incY as element strides (unit stride when omitted); the
+    // definitions are not visible in this header - confirm against
+    // stockfish_blas.cpp.
+    void saxpy(
+        const int N, const float alpha,
+        const float * SF_BLAS_RESTRICT X, float * SF_BLAS_RESTRICT Y
+    );
+
+    void saxpy(
+        const int N, const float alpha,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    );
+
+    void saxpy(
+        ThreadPool& thread_pool, const int N, const float alpha,
+        const float * SF_BLAS_RESTRICT X, float * SF_BLAS_RESTRICT Y
+    );
+
+    void saxpy(
+        ThreadPool& thread_pool, const int N, const float alpha,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    );
+
+    // sgemm - single-precision general matrix multiply,
+    //     C = alpha * op(A) * op(B) + beta * C,
+    // where op(X) is X or its transpose as selected by TransA / TransB,
+    // op(A) is M x K, op(B) is K x N and C is M x N. lda, ldb and ldc are
+    // the leading dimensions of the stored matrices in the given layout.
+    // The first overload is additionally handed a ThreadPool; the second
+    // one takes none.
+    void sgemm(
+        ThreadPool& thread_pool,
+        MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
+        const int M, const int N, const int K,
+        const float alpha, const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta, float * SF_BLAS_RESTRICT C, const int ldc
+    );
+
+    void sgemm(
+        MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
+        const int M, const int N, const int K,
+        const float alpha, const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta, float * SF_BLAS_RESTRICT C, const int ldc
+    );
+
+    // Compares sgemm against cblas_sgemm for every layout / transpose
+    // combination and prints the results. Without USE_BLAS it only prints
+    // a notice.
+    void test(ThreadPool& thread_pool);
+
+    // Times sgemm against cblas_sgemm and prints the results. Without
+    // USE_BLAS it only prints a notice.
+    void bench(ThreadPool& thread_pool);
+}
+
+#endif
diff --git a/src/incbin/UNLICENCE b/src/incbin/UNLICENCE
new file mode 100644
index 00000000..32484ab5
--- /dev/null
+++ b/src/incbin/UNLICENCE
@@ -0,0 +1,26 @@
+The file "incbin.h" is free and unencumbered software released into
+the public domain by Dale Weiler, see:
+   <https://github.com/graphitemaster/incbin>
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org/>
diff --git a/src/incbin/incbin.h b/src/incbin/incbin.h
new file mode 100755
index 00000000..c19684d7
--- /dev/null
+++ b/src/incbin/incbin.h
@@ -0,0 +1,368 @@
+/**
+ * @file incbin.h
+ * @author Dale Weiler
+ * @brief Utility for including binary files
+ *
+ * Facilities for including binary files into the current translation unit and
+ * making use of them externally in other translation units.
+ */
+#ifndef INCBIN_HDR
+#define INCBIN_HDR
+#include <limits.h>
+#if   defined(__AVX512BW__) || \
+      defined(__AVX512CD__) || \
+      defined(__AVX512DQ__) || \
+      defined(__AVX512ER__) || \
+      defined(__AVX512PF__) || \
+      defined(__AVX512VL__) || \
+      defined(__AVX512F__)
+# define INCBIN_ALIGNMENT_INDEX 6 /* any AVX-512 subset: 1 << 6 = 64 */
+#elif defined(__AVX__)      || \
+      defined(__AVX2__)
+# define INCBIN_ALIGNMENT_INDEX 5 /* AVX / AVX2: 1 << 5 = 32 */
+#elif defined(__SSE__)      || \
+      defined(__SSE2__)     || \
+      defined(__SSE3__)     || \
+      defined(__SSSE3__)    || \
+      defined(__SSE4_1__)   || \
+      defined(__SSE4_2__)   || \
+      defined(__neon__)
+# define INCBIN_ALIGNMENT_INDEX 4 /* SSE family or __neon__: 1 << 4 = 16 */
+#elif ULONG_MAX != 0xffffffffu
+# define INCBIN_ALIGNMENT_INDEX 3 /* unsigned long is not 32 bits wide: 1 << 3 = 8 */
+# else
+# define INCBIN_ALIGNMENT_INDEX 2 /* 32-bit unsigned long: 1 << 2 = 4 */
+#endif
+
+/* Lookup table of (1 << n) where `n' is `INCBIN_ALIGNMENT_INDEX'; one entry per index 0..6 */
+#define INCBIN_ALIGN_SHIFT_0 1
+#define INCBIN_ALIGN_SHIFT_1 2
+#define INCBIN_ALIGN_SHIFT_2 4
+#define INCBIN_ALIGN_SHIFT_3 8
+#define INCBIN_ALIGN_SHIFT_4 16
+#define INCBIN_ALIGN_SHIFT_5 32
+#define INCBIN_ALIGN_SHIFT_6 64
+
+/* Actual alignment value: pastes to INCBIN_ALIGN_SHIFT_<INCBIN_ALIGNMENT_INDEX>, i.e. a table entry above */
+#define INCBIN_ALIGNMENT \
+    INCBIN_CONCATENATE( \
+        INCBIN_CONCATENATE(INCBIN_ALIGN_SHIFT, _), \
+        INCBIN_ALIGNMENT_INDEX)
+
+/* Stringize: INCBIN_STRINGIZE macro-expands X first, INCBIN_STR quotes it verbatim */
+#define INCBIN_STR(X) \
+    #X
+#define INCBIN_STRINGIZE(X) \
+    INCBIN_STR(X)
+/* Concatenate: INCBIN_CONCATENATE macro-expands X and Y first, INCBIN_CAT pastes them verbatim */
+#define INCBIN_CAT(X, Y) \
+    X ## Y
+#define INCBIN_CONCATENATE(X, Y) \
+    INCBIN_CAT(X, Y)
+/* Deferred macro expansion: INCBIN_INVOKE calls macro N with already-expanded arguments */
+#define INCBIN_EVAL(X) \
+    X
+#define INCBIN_INVOKE(N, ...) \
+    INCBIN_EVAL(N(__VA_ARGS__))
+
+/* INCBIN_MACRO is the assembler directive that pulls in the file; Green Hills uses a different one */
+#if defined(__ghs__)
+#  if (__ghs_asm == 2)
+#    define INCBIN_MACRO ".file"
+/* Or consider the ".myrawdata" entry in the ld file */
+#  else
+#    define INCBIN_MACRO "\tINCBIN"
+#  endif
+#else
+#  define INCBIN_MACRO ".incbin" /* every toolchain other than Green Hills */
+#endif
+
+#ifndef _MSC_VER /* INCBIN_ALIGN: alignment specifier for the C declarations; attribute syntax unless MSVC */
+#  define INCBIN_ALIGN \
+    __attribute__((aligned(INCBIN_ALIGNMENT)))
+#else
+#  define INCBIN_ALIGN __declspec(align(INCBIN_ALIGNMENT))
+#endif
+
+#if defined(__arm__) || /* GNU C and RealView */ \
+    defined(__arm) || /* Diab */ \
+    defined(_ARM) /* ImageCraft */
+#  define INCBIN_ARM /* tested below to pick ARM-specific `.align' and `.type' syntax */
+#endif
+
+#ifdef __GNUC__
+/* Utilize .balign where supported; its operand is the alignment itself */
+#  define INCBIN_ALIGN_HOST ".balign " INCBIN_STRINGIZE(INCBIN_ALIGNMENT) "\n"
+#  define INCBIN_ALIGN_BYTE ".balign 1\n"
+#elif defined(INCBIN_ARM)
+/*
+ * On arm assemblers, the alignment value is calculated as (1 << n) where `n' is
+ * the shift count. This is the value passed to `.align'
+ */
+#  define INCBIN_ALIGN_HOST ".align " INCBIN_STRINGIZE(INCBIN_ALIGNMENT_INDEX) "\n"
+#  define INCBIN_ALIGN_BYTE ".align 0\n"
+#else
+/* We assume other inline assemblers treat `.align' as `.balign' */
+#  define INCBIN_ALIGN_HOST ".align " INCBIN_STRINGIZE(INCBIN_ALIGNMENT) "\n"
+#  define INCBIN_ALIGN_BYTE ".align 1\n"
+#endif
+
+/* INCBIN_EXTERNAL prefixes the symbol declarations (C linkage under C++); INCBIN_CONST is used by incbin.c generated files */
+#if defined(__cplusplus)
+#  define INCBIN_EXTERNAL extern "C"
+#  define INCBIN_CONST    extern const
+#else
+#  define INCBIN_EXTERNAL extern
+#  define INCBIN_CONST    const
+#endif
+
+/**
+ * @brief Optionally override the linker section into which data is emitted.
+ *
+ * @warning If you use this facility, you'll have to deal with platform-specific linker output
+ * section naming on your own
+ *
+ * Overriding the default linker output section, e.g. for esp8266/Arduino:
+ * @code
+ * #define INCBIN_OUTPUT_SECTION ".irom.text"
+ * #include "incbin.h"
+ * INCBIN(Foo, "foo.txt");
+ * // Data is emitted into program memory that never gets copied to RAM
+ * @endcode
+ */
+#if !defined(INCBIN_OUTPUT_SECTION) /* a definition made before inclusion takes precedence */
+#  if defined(__APPLE__)
+#    define INCBIN_OUTPUT_SECTION         ".const_data"
+#  else
+#    define INCBIN_OUTPUT_SECTION         ".rodata"
+#  endif
+#endif
+
+#if defined(__APPLE__)
+/* The directives are different for Apple branded compilers */
+#  define INCBIN_SECTION         INCBIN_OUTPUT_SECTION "\n"
+#  define INCBIN_GLOBAL(NAME)    ".globl " INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME "\n"
+#  define INCBIN_INT             ".long "
+#  define INCBIN_MANGLE          "_"
+#  define INCBIN_BYTE            ".byte "
+#  define INCBIN_TYPE(...) /* expands to nothing: no `.type' directive is emitted here */
+#else
+#  define INCBIN_SECTION         ".section " INCBIN_OUTPUT_SECTION "\n"
+#  define INCBIN_GLOBAL(NAME)    ".global " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME "\n"
+#  if defined(__ghs__)
+#    define INCBIN_INT           ".word "
+#  else
+#    define INCBIN_INT           ".int "
+#  endif
+#  if defined(__USER_LABEL_PREFIX__)
+#    define INCBIN_MANGLE        INCBIN_STRINGIZE(__USER_LABEL_PREFIX__)
+#  else
+#    define INCBIN_MANGLE        ""
+#  endif
+#  if defined(INCBIN_ARM)
+/* On arm assemblers, `@' is used as a line comment token */
+#    define INCBIN_TYPE(NAME)    ".type " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME ", %object\n"
+#  elif defined(__MINGW32__) || defined(__MINGW64__)
+/* Mingw doesn't support this directive either */
+#    define INCBIN_TYPE(NAME) /* expands to nothing */
+#  else
+/* It's safe to use `@' on other architectures */
+#    define INCBIN_TYPE(NAME)    ".type " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME ", @object\n"
+#  endif
+#  define INCBIN_BYTE            ".byte "
+#endif
+
+/* List of style types used for symbol names; the numbers select the INCBIN_STYLE_<n>_* suffix tables */
+#define INCBIN_STYLE_CAMEL 0 /* suffixes Data, End, Size */
+#define INCBIN_STYLE_SNAKE 1 /* suffixes _data, _end, _size */
+
+/**
+ * @brief Specify the prefix to use for symbol names.
+ *
+ * By default this is `g', producing symbols of the form:
+ * @code
+ * #include "incbin.h"
+ * INCBIN(Foo, "foo.txt");
+ *
+ * // Now you have the following symbols:
+ * // const unsigned char gFooData[];
+ * // const unsigned char *const gFooEnd;
+ * // const unsigned int gFooSize;
+ * @endcode
+ *
+ * If, however, you specify a prefix before including, e.g.:
+ * @code
+ * #define INCBIN_PREFIX incbin
+ * #include "incbin.h"
+ * INCBIN(Foo, "foo.txt");
+ *
+ * // Now you have the following symbols instead:
+ * // const unsigned char incbinFooData[];
+ * // const unsigned char *const incbinFooEnd;
+ * // const unsigned int incbinFooSize;
+ * @endcode
+ */
+#if !defined(INCBIN_PREFIX) /* a prefix defined before inclusion takes precedence */
+#  define INCBIN_PREFIX g
+#endif
+
+/**
+ * @brief Specify the style used for symbol names.
+ *
+ * Possible options are
+ * - INCBIN_STYLE_CAMEL "CamelCase"
+ * - INCBIN_STYLE_SNAKE "snake_case"
+ *
+ * Default option is *INCBIN_STYLE_CAMEL* producing symbols of the form:
+ * @code
+ * #include "incbin.h"
+ * INCBIN(Foo, "foo.txt");
+ *
+ * // Now you have the following symbols:
+ * // const unsigned char <prefix>FooData[];
+ * // const unsigned char *const <prefix>FooEnd;
+ * // const unsigned int <prefix>FooSize;
+ * @endcode
+ *
+ * If, however, you specify a style before including, e.g.:
+ * @code
+ * #define INCBIN_STYLE INCBIN_STYLE_SNAKE
+ * #include "incbin.h"
+ * INCBIN(foo, "foo.txt");
+ *
+ * // Now you have the following symbols:
+ * // const unsigned char <prefix>foo_data[];
+ * // const unsigned char *const <prefix>foo_end;
+ * // const unsigned int <prefix>foo_size;
+ * @endcode
+ */
+#if !defined(INCBIN_STYLE) /* a style defined before inclusion takes precedence */
+#  define INCBIN_STYLE INCBIN_STYLE_CAMEL
+#endif
+
+/* Style lookup tables: INCBIN_STYLE_<style number>_<TYPE> is the suffix appended to the symbol name */
+#define INCBIN_STYLE_0_DATA Data
+#define INCBIN_STYLE_0_END End
+#define INCBIN_STYLE_0_SIZE Size
+#define INCBIN_STYLE_1_DATA _data
+#define INCBIN_STYLE_1_END _end
+#define INCBIN_STYLE_1_SIZE _size
+
+/* Style lookup: returning identifier, e.g. INCBIN_STYLE_IDENT(DATA) is Data for style 0 and _data for style 1 */
+#define INCBIN_STYLE_IDENT(TYPE) \
+    INCBIN_CONCATENATE( \
+        INCBIN_STYLE_, \
+        INCBIN_CONCATENATE( \
+            INCBIN_EVAL(INCBIN_STYLE), \
+            INCBIN_CONCATENATE(_, TYPE)))
+
+/* Style lookup: returning string literal (the identifier above, stringized) */
+#define INCBIN_STYLE_STRING(TYPE) \
+    INCBIN_STRINGIZE( \
+        INCBIN_STYLE_IDENT(TYPE)) \
+
+/* Generate the global labels: emits INCBIN_GLOBAL followed by INCBIN_TYPE for the
+ * symbol formed by concatenating NAME with the style suffix selected by TYPE. */
+#define INCBIN_GLOBAL_LABELS(NAME, TYPE) \
+    INCBIN_INVOKE( \
+        INCBIN_GLOBAL, \
+        INCBIN_CONCATENATE( \
+            NAME, \
+            INCBIN_INVOKE( \
+                INCBIN_STYLE_IDENT, \
+                TYPE))) \
+    INCBIN_INVOKE( \
+        INCBIN_TYPE, \
+        INCBIN_CONCATENATE( \
+            NAME, \
+            INCBIN_INVOKE( \
+                INCBIN_STYLE_IDENT, \
+                TYPE)))
+
+/**
+ * @brief Externally reference binary data included in another translation unit.
+ *
+ * Produces three external symbols that reference the binary data included in
+ * another translation unit.
+ *
+ * The symbol names are a concatenation of `INCBIN_PREFIX' before *NAME*; with
+ * "Data", as well as "End" and "Size" after. An example is provided below.
+ *
+ * @param NAME The name given for the binary data
+ *
+ * @code
+ * INCBIN_EXTERN(Foo);
+ *
+ * // Now you have the following symbols:
+ * // extern const unsigned char <prefix>FooData[];
+ * // extern const unsigned char *const <prefix>FooEnd;
+ * // extern const unsigned int <prefix>FooSize;
+ * @endcode
+ *
+ * @warning The identifiers may be different if INCBIN_STYLE is not default
+ * @note The expansion ends without a semicolon; the one written after the
+ *       macro invocation terminates the last declaration.
+ */
+#define INCBIN_EXTERN(NAME) \
+    INCBIN_EXTERNAL const INCBIN_ALIGN unsigned char \
+        INCBIN_CONCATENATE( \
+            INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
+            INCBIN_STYLE_IDENT(DATA))[]; \
+    INCBIN_EXTERNAL const INCBIN_ALIGN unsigned char *const \
+    INCBIN_CONCATENATE( \
+        INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
+        INCBIN_STYLE_IDENT(END)); \
+    INCBIN_EXTERNAL const unsigned int \
+        INCBIN_CONCATENATE( \
+            INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
+            INCBIN_STYLE_IDENT(SIZE))
+
+/**
+ * @brief Include a binary file into the current translation unit.
+ *
+ * Includes a binary file into the current translation unit, producing three symbols
+ * for objects that encode the data, the end of the data, and its size respectively.
+ *
+ * The symbol names are a concatenation of `INCBIN_PREFIX' before *NAME*; with
+ * "Data", as well as "End" and "Size" after. An example is provided below.
+ *
+ * @param NAME The name to associate with this binary data (as an identifier.)
+ * @param FILENAME The file to include (as a string literal.)
+ *
+ * @code
+ * INCBIN(Icon, "icon.png");
+ *
+ * // Now you have the following symbols:
+ * // const unsigned char <prefix>IconData[];
+ * // const unsigned char *const <prefix>IconEnd;
+ * // const unsigned int <prefix>IconSize;
+ * @endcode
+ *
+ * @warning This must be used in global scope
+ * @warning The identifiers may be different if INCBIN_STYLE is not default
+ *
+ * To externally reference the data included by this in another translation unit
+ * please @see INCBIN_EXTERN.
+ */
+#ifdef _MSC_VER
+/* MSVC: no assembler text is emitted; INCBIN only declares the three symbols
+ * through INCBIN_EXTERN, and FILENAME is not used by the expansion.
+ * NOTE(review): the symbol definitions presumably come from a separate build
+ * step -- confirm. */
+#define INCBIN(NAME, FILENAME) \
+    INCBIN_EXTERN(NAME)
+#else
+/* Other compilers: emit one file-scope __asm__ statement and then declare the
+ * symbols with INCBIN_EXTERN. The assembler text consists of, in order:
+ *   1. INCBIN_SECTION, then the <prefix><NAME><data suffix> label followed by
+ *      INCBIN_MACRO applied to the quoted FILENAME;
+ *   2. the <prefix><NAME><end suffix> label followed by INCBIN_BYTE "1";
+ *   3. the <prefix><NAME><size suffix> label followed by INCBIN_INT of
+ *      (end label - data label);
+ *   4. ".text".
+ * Every label is preceded by INCBIN_GLOBAL_LABELS for that symbol and by an
+ * alignment macro.
+ * NOTE(review): INCBIN_SECTION, INCBIN_MACRO, INCBIN_BYTE, INCBIN_INT,
+ * INCBIN_MANGLE and the INCBIN_ALIGN_* macros are defined outside this
+ * excerpt; confirm the exact directives they expand to there. */
+#define INCBIN(NAME, FILENAME) \
+    __asm__(INCBIN_SECTION \
+            INCBIN_GLOBAL_LABELS(NAME, DATA) \
+            INCBIN_ALIGN_HOST \
+            INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(DATA) ":\n" \
+            INCBIN_MACRO " \"" FILENAME "\"\n" \
+            INCBIN_GLOBAL_LABELS(NAME, END) \
+            INCBIN_ALIGN_BYTE \
+            INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(END) ":\n" \
+                INCBIN_BYTE "1\n" \
+            INCBIN_GLOBAL_LABELS(NAME, SIZE) \
+            INCBIN_ALIGN_HOST \
+            INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(SIZE) ":\n" \
+                INCBIN_INT INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(END) " - " \
+                           INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(DATA) "\n" \
+            INCBIN_ALIGN_HOST \
+            ".text\n" \
+    ); \
+    INCBIN_EXTERN(NAME)
+
+#endif
+#endif
diff --git a/src/learn/autograd.h b/src/learn/autograd.h
new file mode 100644
index 00000000..7b2853df
--- /dev/null
+++ b/src/learn/autograd.h
@@ -0,0 +1,667 @@
+#ifndef LEARNER_AUTOGRAD_H
+#define LEARNER_AUTOGRAD_H
+
+#include <cmath>
+#include <utility>
+#include <type_traits>
+#include <memory>
+#include <tuple>
+#include <optional>
+#include <algorithm>
+#include <cstdint>
+
+namespace Learner
+{
+    // A value together with the gradient that belongs to it.
+    //
+    // The compound operators apply the same operation to both members, so a
+    // ValueWithGrad can be accumulated and scaled like a plain number.
+    template <typename T>
+    struct ValueWithGrad
+    {
+        T value;
+        T grad;
+
+        // Member-wise addition of another value/gradient pair.
+        ValueWithGrad& operator+=(const ValueWithGrad<T>& rhs)
+        {
+            value += rhs.value;
+            grad += rhs.grad;
+            return *this;
+        }
+
+        // Member-wise subtraction of another value/gradient pair.
+        ValueWithGrad& operator-=(const ValueWithGrad<T>& rhs)
+        {
+            value -= rhs.value;
+            grad -= rhs.grad;
+            return *this;
+        }
+
+        // Scales both the value and the gradient by `rhs`.
+        ValueWithGrad& operator*=(T rhs)
+        {
+            value *= rhs;
+            grad *= rhs;
+            return *this;
+        }
+
+        // Divides both the value and the gradient by `rhs`.
+        ValueWithGrad& operator/=(T rhs)
+        {
+            value /= rhs;
+            grad /= rhs;
+            return *this;
+        }
+
+        // Returns a copy with the absolute value of each member.
+        [[nodiscard]] ValueWithGrad abs() const
+        {
+            return { std::abs(value), std::abs(grad) };
+        }
+
+        // Returns a copy whose gradient is limited to [-|max|, |max|]; the
+        // value is left untouched.
+        //
+        // `max` is used as a magnitude: std::clamp requires lo <= hi, so
+        // passing a negative bound straight through would be undefined
+        // behaviour.
+        [[nodiscard]] ValueWithGrad clamp_grad(T max) const
+        {
+            const T bound = std::abs(max);
+            return { value, std::clamp(grad, -bound, bound) };
+        }
+    };
+}
+
+namespace Learner::Autograd::UnivariateStatic
+{
+
+    // Wraps T so that it can be named as Identity<T>::type.
+    template <typename T>
+    struct Identity
+    {
+        using type = T;
+    };
+
+    // Alias for T spelled through Identity. A function parameter written as
+    // Id<T> is a non-deduced context, so T has to come from somewhere else
+    // (here: from a defaulted template argument) and the call argument is
+    // converted to it.
+    template <typename T>
+    using Id = typename Identity<T>::type;
+
+    // Storage type for an operand of an expression node: an rvalue-reference
+    // T is stored by value, anything else is stored as a const lvalue
+    // reference to the referred-to type.
+    template <typename T>
+    using StoreValueOrRef = std::conditional_t<
+            std::is_rvalue_reference_v<T>,
+            std::remove_reference_t<T>,
+            const std::remove_reference_t<T>&
+        >;
+
+    // Implementation helpers for the expression nodes.
+    namespace Detail
+    {
+        // Integer type underlying a call id.
+        using CallIdType = std::uint32_t;
+
+        // Distinct type carrying a call id, so that the id can be located in
+        // an argument tuple by type (std::get<Detail::CallId>).
+        struct CallId
+        {
+            CallIdType call_id{};
+
+            // A default-constructed id holds 0.
+            constexpr CallId() :
+                call_id(0)
+            {
+            }
+
+            // Wraps the given raw id (implicit conversion is allowed).
+            constexpr CallId(CallIdType id) :
+                call_id(id)
+            {
+            }
+
+            [[nodiscard]] bool operator==(CallId rhs) const noexcept
+            {
+                return call_id == rhs.call_id;
+            }
+
+            [[nodiscard]] bool operator!=(CallId rhs) const noexcept
+            {
+                return call_id != rhs.call_id;
+            }
+        };
+
+        // Returns the current value of a per-thread counter and increments
+        // it, so successive calls on one thread yield 0, 1, 2, ... The
+        // counter is unsigned and therefore wraps around on overflow.
+        [[nodiscard]] inline CallId next_call_id()
+        {
+            static thread_local CallIdType s_call_id = 0;
+            return CallId{ s_call_id++ };
+        }
+
+        // Primary template; only the std::tuple specialization is defined.
+        template <typename T, typename Tuple>
+        struct TupleContains;
+
+        // True when T is exactly the same type as one of the tuple's element
+        // types.
+        template <typename T, typename... Us>
+        struct TupleContains<T, std::tuple<Us...>> : std::disjunction<std::is_same<T, Us>...> {};
+
+        // Convenience value form of TupleContains.
+        template <typename T, typename Tuple>
+        constexpr bool TupleContainsV = TupleContains<T, Tuple>::value;
+
+        // True when every Ts (with references removed) reports
+        // is_constant == true.
+        template <typename... Ts>
+        constexpr bool AreAllConstantV = (std::remove_reference_t<Ts>::is_constant && ...);
+    }
+
+    // CRTP base of every expression node.
+    //
+    // T is the value type. ChildT is the derived node and must provide
+    // calculate_value(args), calculate_grad(args) and a static is_constant
+    // flag. The base adds per-node caching of the value and the gradient,
+    // keyed by the call id found in the argument tuple.
+    template <typename T, typename ChildT>
+    struct Evaluable
+    {
+        constexpr Evaluable() = default;
+
+        // We append a unique call id so that we can invalidate the cache when
+        // the next computation starts. A single evaluation should see
+        // the same call_id at every node.
+        //
+        // Returns the value and the gradient computed under one fresh call id.
+        template <typename... ArgsTs>
+        [[nodiscard]] auto eval(const std::tuple<ArgsTs...>& args) const
+        {
+            const auto call_id = Detail::next_call_id();
+            const auto new_args = std::tuple_cat(args, std::tuple(call_id));
+            return ValueWithGrad<T>{ value(new_args), grad(new_args) };
+        }
+
+        // Value for an argument tuple that already carries a CallId. The
+        // child's calculate_value is invoked only when nothing is cached yet
+        // or the cached entry belongs to a different call id.
+        template <typename... ArgsTs,
+            typename SFINAE = std::enable_if_t<Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
+        [[nodiscard]] auto value(const std::tuple<ArgsTs...>& args) const
+        {
+            const ChildT* this_ = static_cast<const ChildT*>(this);
+
+            const auto call_id = std::get<Detail::CallId>(args);
+            if (!value_cache.has_value() || value_cache_call_id != call_id)
+            {
+                value_cache_call_id = call_id;
+                value_cache = this_->calculate_value(args);
+            }
+
+            return *value_cache;
+        }
+
+        // Value for an argument tuple without a CallId: appends a fresh id
+        // and forwards to the overload above. The otherwise unused ellipsis
+        // parameter gives this overload a signature distinct from that one.
+        template <typename... ArgsTs,
+            typename SFINAE = std::enable_if_t<!Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
+        [[nodiscard]] auto value(const std::tuple<ArgsTs...>& args, ...) const
+        {
+            const auto call_id = Detail::next_call_id();
+            const auto new_args = std::tuple_cat(args, std::tuple(call_id));
+            return value(new_args);
+        }
+
+        // Gradient for an argument tuple that already carries a CallId.
+        // Constant nodes return T(0.0) without calling the child or touching
+        // the cache; other nodes are cached the same way as value().
+        template <typename... ArgsTs,
+            typename SFINAE = std::enable_if_t<Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
+        [[nodiscard]] auto grad(const std::tuple<ArgsTs...>& args) const
+        {
+            if constexpr (ChildT::is_constant)
+            {
+                return T(0.0);
+            }
+            else
+            {
+                const ChildT* this_ = static_cast<const ChildT*>(this);
+
+                const auto call_id = std::get<Detail::CallId>(args);
+                if (!grad_cache.has_value() || grad_cache_call_id != call_id)
+                {
+                    grad_cache_call_id = call_id;
+                    grad_cache = this_->calculate_grad(args);
+                }
+
+                return *grad_cache;
+            }
+        }
+
+        // Gradient for an argument tuple without a CallId: appends a fresh id
+        // and forwards to the overload above (see value() for the ellipsis).
+        template <typename... ArgsTs,
+            typename SFINAE = std::enable_if_t<!Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
+        [[nodiscard]] auto grad(const std::tuple<ArgsTs...>& args, ...) const
+        {
+            const auto call_id = Detail::next_call_id();
+            const auto new_args = std::tuple_cat(args, std::tuple(call_id));
+            return grad(new_args);
+        }
+
+    private:
+        // Caches are mutable because evaluation happens through const
+        // member functions. Each cache remembers the call id it was filled
+        // under.
+        mutable std::optional<T> value_cache;
+        mutable std::optional<T> grad_cache;
+        mutable Detail::CallId value_cache_call_id{};
+        mutable Detail::CallId grad_cache_call_id{};
+    };
+
+    // Leaf node for the variable being differentiated: its value is the I-th
+    // element of the argument tuple and its derivative is one.
+    template <typename T, int I>
+    struct VariableParameter : Evaluable<T, VariableParameter<T, I>>
+    {
+        static constexpr bool is_constant = false;
+
+        using ValueType = T;
+
+        constexpr VariableParameter()
+        {
+        }
+
+        // Picks the I-th input.
+        template <typename... InputTs>
+        [[nodiscard]] T calculate_value(const std::tuple<InputTs...>& inputs) const
+        {
+            T picked = std::get<I>(inputs);
+            return picked;
+        }
+
+        // The inputs do not influence the derivative, which is always one.
+        template <typename... InputTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<InputTs...>& /* inputs */) const
+        {
+            T one(1.0);
+            return one;
+        }
+    };
+
+    // Leaf node for an input that is treated as a constant: its value is the
+    // I-th element of the argument tuple and its derivative is zero.
+    template <typename T, int I>
+    struct ConstantParameter : Evaluable<T, ConstantParameter<T, I>>
+    {
+        static constexpr bool is_constant = true;
+
+        using ValueType = T;
+
+        constexpr ConstantParameter()
+        {
+        }
+
+        // Picks the I-th input.
+        template <typename... InputTs>
+        [[nodiscard]] T calculate_value(const std::tuple<InputTs...>& inputs) const
+        {
+            T picked = std::get<I>(inputs);
+            return picked;
+        }
+
+        // The inputs do not influence the derivative, which is always zero.
+        template <typename... InputTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<InputTs...>& /* inputs */) const
+        {
+            T zero(0.0);
+            return zero;
+        }
+    };
+
+    // Leaf node holding its own copy of a fixed value; its derivative is
+    // zero.
+    template <typename T>
+    struct Constant : Evaluable<T, Constant<T>>
+    {
+        static constexpr bool is_constant = true;
+
+        using ValueType = T;
+
+        constexpr Constant(T x) :
+            m_value(std::move(x))
+        {
+        }
+
+        // The stored value, independent of the inputs.
+        template <typename... InputTs>
+        [[nodiscard]] T calculate_value(const std::tuple<InputTs...>& /* inputs */) const
+        {
+            return m_value;
+        }
+
+        // Always zero.
+        template <typename... InputTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<InputTs...>& /* inputs */) const
+        {
+            T zero(0.0);
+            return zero;
+        }
+
+    private:
+        T m_value;
+    };
+
+    // Leaf node that refers to a value owned elsewhere; its derivative is
+    // zero. The referenced value may change between executions, but is
+    // assumed to stay the same during a single evaluation. Only a reference
+    // is stored, so the referenced object has to outlive this node.
+    template <typename T>
+    struct ConstantRef : Evaluable<T, ConstantRef<T>>
+    {
+        static constexpr bool is_constant = true;
+
+        using ValueType = T;
+
+        constexpr ConstantRef(const T& x) :
+            m_target(x)
+        {
+        }
+
+        // Reads the referenced value at the time of the call.
+        template <typename... InputTs>
+        [[nodiscard]] T calculate_value(const std::tuple<InputTs...>& /* inputs */) const
+        {
+            return m_target;
+        }
+
+        // Always zero.
+        template <typename... InputTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<InputTs...>& /* inputs */) const
+        {
+            T zero(0.0);
+            return zero;
+        }
+
+    private:
+        const T& m_target;
+    };
+
+    // Node computing lhs + rhs. The derivative is the sum of the operands'
+    // derivatives. The node is constant when both operands are.
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    struct Sum : Evaluable<T, Sum<LhsT, RhsT, T>>
+    {
+        static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
+
+        using ValueType = T;
+
+        constexpr Sum(LhsT&& lhs, RhsT&& rhs) :
+            m_left(std::forward<LhsT>(lhs)),
+            m_right(std::forward<RhsT>(rhs))
+        {
+        }
+
+        template <typename... InputTs>
+        [[nodiscard]] T calculate_value(const std::tuple<InputTs...>& inputs) const
+        {
+            const auto left = m_left.value(inputs);
+            const auto right = m_right.value(inputs);
+            return left + right;
+        }
+
+        template <typename... InputTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<InputTs...>& inputs) const
+        {
+            const auto left_grad = m_left.grad(inputs);
+            const auto right_grad = m_right.grad(inputs);
+            return left_grad + right_grad;
+        }
+
+    private:
+        // Held by value or by const reference, see StoreValueOrRef.
+        StoreValueOrRef<LhsT> m_left;
+        StoreValueOrRef<RhsT> m_right;
+    };
+
+    // expression + expression.
+    // An operand passed as an rvalue is stored in the node by value; one
+    // passed as an lvalue is stored by const reference (see StoreValueOrRef)
+    // and therefore has to outlive the returned node. The defaulted T removes
+    // this overload from consideration when LhsT has no nested ValueType.
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator+(LhsT&& lhs, RhsT&& rhs)
+    {
+        return Sum<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
+    }
+
+    // expression + plain value.
+    // The plain value is converted to the expression's ValueType (Id<T> keeps
+    // T from being deduced from it) and wrapped in a Constant node.
+    template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator+(LhsT&& lhs, Id<T> rhs)
+    {
+        return Sum<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
+    }
+
+    // plain value + expression; mirror image of the overload above.
+    template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator+(Id<T> lhs, RhsT&& rhs)
+    {
+        return Sum<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
+    }
+
+    // Node computing lhs - rhs. The derivative is the difference of the
+    // operands' derivatives. The node is constant when both operands are.
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    struct Difference : Evaluable<T, Difference<LhsT, RhsT, T>>
+    {
+        static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
+
+        using ValueType = T;
+
+        constexpr Difference(LhsT&& lhs, RhsT&& rhs) :
+            m_left(std::forward<LhsT>(lhs)),
+            m_right(std::forward<RhsT>(rhs))
+        {
+        }
+
+        template <typename... InputTs>
+        [[nodiscard]] T calculate_value(const std::tuple<InputTs...>& inputs) const
+        {
+            const auto left = m_left.value(inputs);
+            const auto right = m_right.value(inputs);
+            return left - right;
+        }
+
+        template <typename... InputTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<InputTs...>& inputs) const
+        {
+            const auto left_grad = m_left.grad(inputs);
+            const auto right_grad = m_right.grad(inputs);
+            return left_grad - right_grad;
+        }
+
+    private:
+        // Held by value or by const reference, see StoreValueOrRef.
+        StoreValueOrRef<LhsT> m_left;
+        StoreValueOrRef<RhsT> m_right;
+    };
+
+    // expression - expression.
+    // An operand passed as an rvalue is stored in the node by value; one
+    // passed as an lvalue is stored by const reference (see StoreValueOrRef)
+    // and therefore has to outlive the returned node. The defaulted T removes
+    // this overload from consideration when LhsT has no nested ValueType.
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator-(LhsT&& lhs, RhsT&& rhs)
+    {
+        return Difference<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
+    }
+
+    // expression - plain value.
+    // The plain value is converted to the expression's ValueType (Id<T> keeps
+    // T from being deduced from it) and wrapped in a Constant node.
+    template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator-(LhsT&& lhs, Id<T> rhs)
+    {
+        return Difference<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
+    }
+
+    // plain value - expression; mirror image of the overload above.
+    template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator-(Id<T> lhs, RhsT&& rhs)
+    {
+        return Difference<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
+    }
+
+    // Node computing lhs * rhs. The derivative follows the product rule:
+    // lhs' * rhs + lhs * rhs'. The node is constant when both operands are.
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    struct Product : Evaluable<T, Product<LhsT, RhsT, T>>
+    {
+        static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
+
+        using ValueType = T;
+
+        constexpr Product(LhsT&& lhs, RhsT&& rhs) :
+            m_left(std::forward<LhsT>(lhs)),
+            m_right(std::forward<RhsT>(rhs))
+        {
+        }
+
+        template <typename... InputTs>
+        [[nodiscard]] T calculate_value(const std::tuple<InputTs...>& inputs) const
+        {
+            const auto left = m_left.value(inputs);
+            const auto right = m_right.value(inputs);
+            return left * right;
+        }
+
+        template <typename... InputTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<InputTs...>& inputs) const
+        {
+            const auto left_term = m_left.grad(inputs) * m_right.value(inputs);
+            const auto right_term = m_left.value(inputs) * m_right.grad(inputs);
+            return left_term + right_term;
+        }
+
+    private:
+        // Held by value or by const reference, see StoreValueOrRef.
+        StoreValueOrRef<LhsT> m_left;
+        StoreValueOrRef<RhsT> m_right;
+    };
+
+    // expression * expression.
+    // An operand passed as an rvalue is stored in the node by value; one
+    // passed as an lvalue is stored by const reference (see StoreValueOrRef)
+    // and therefore has to outlive the returned node. The defaulted T removes
+    // this overload from consideration when LhsT has no nested ValueType.
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator*(LhsT&& lhs, RhsT&& rhs)
+    {
+        return Product<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
+    }
+
+    // expression * plain value.
+    // The plain value is converted to the expression's ValueType (Id<T> keeps
+    // T from being deduced from it) and wrapped in a Constant node.
+    template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator*(LhsT&& lhs, Id<T> rhs)
+    {
+        return Product<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
+    }
+
+    // plain value * expression; mirror image of the overload above.
+    template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator*(Id<T> lhs, RhsT&& rhs)
+    {
+        return Product<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
+    }
+
+    // Node computing lhs / rhs. The derivative follows the quotient rule:
+    // (lhs' * rhs - lhs * rhs') / rhs^2. The node is constant when both
+    // operands are.
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    struct Quotient : Evaluable<T, Quotient<LhsT, RhsT, T>>
+    {
+        static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
+
+        using ValueType = T;
+
+        constexpr Quotient(LhsT&& lhs, RhsT&& rhs) :
+            m_numerator(std::forward<LhsT>(lhs)),
+            m_denominator(std::forward<RhsT>(rhs))
+        {
+        }
+
+        template <typename... InputTs>
+        [[nodiscard]] T calculate_value(const std::tuple<InputTs...>& inputs) const
+        {
+            const auto top = m_numerator.value(inputs);
+            const auto bottom = m_denominator.value(inputs);
+            return top / bottom;
+        }
+
+        template <typename... InputTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<InputTs...>& inputs) const
+        {
+            auto bottom = m_denominator.value(inputs);
+            const auto scaled_difference =
+                m_numerator.grad(inputs) * bottom - m_numerator.value(inputs) * m_denominator.grad(inputs);
+            return scaled_difference / (bottom * bottom);
+        }
+
+    private:
+        // Held by value or by const reference, see StoreValueOrRef.
+        StoreValueOrRef<LhsT> m_numerator;
+        StoreValueOrRef<RhsT> m_denominator;
+    };
+
+    // expression / expression.
+    // An operand passed as an rvalue is stored in the node by value; one
+    // passed as an lvalue is stored by const reference (see StoreValueOrRef)
+    // and therefore has to outlive the returned node. The defaulted T removes
+    // this overload from consideration when LhsT has no nested ValueType.
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator/(LhsT&& lhs, RhsT&& rhs)
+    {
+        return Quotient<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
+    }
+
+    // expression / plain value.
+    // The plain value is converted to the expression's ValueType (Id<T> keeps
+    // T from being deduced from it) and wrapped in a Constant node.
+    template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator/(LhsT&& lhs, Id<T> rhs)
+    {
+        return Quotient<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
+    }
+
+    // plain value / expression; mirror image of the overload above.
+    template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator/(Id<T> lhs, RhsT&& rhs)
+    {
+        return Quotient<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
+    }
+
+    // Node computing -x. The derivative is the negated derivative of the
+    // operand. The node is constant when the operand is.
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    struct Negation : Evaluable<T, Negation<ArgT, T>>
+    {
+        static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
+
+        using ValueType = T;
+
+        constexpr explicit Negation(ArgT&& x) :
+            m_operand(std::forward<ArgT>(x))
+        {
+        }
+
+        template <typename... InputTs>
+        [[nodiscard]] T calculate_value(const std::tuple<InputTs...>& inputs) const
+        {
+            const auto inner = m_operand.value(inputs);
+            return -inner;
+        }
+
+        template <typename... InputTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<InputTs...>& inputs) const
+        {
+            const auto inner_grad = m_operand.grad(inputs);
+            return -inner_grad;
+        }
+
+    private:
+        // Held by value or by const reference, see StoreValueOrRef.
+        StoreValueOrRef<ArgT> m_operand;
+    };
+
+    // Unary minus: wraps the expression in a Negation node. An rvalue operand
+    // is stored by value, an lvalue operand by const reference (see
+    // StoreValueOrRef). The defaulted T removes this overload from
+    // consideration when ArgT has no nested ValueType.
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    [[nodiscard]] constexpr auto operator-(ArgT&& x)
+    {
+        return Negation<ArgT&&>(std::forward<ArgT>(x));
+    }
+
+    // Node computing the logistic function 1 / (1 + exp(-x)) of its operand.
+    // The derivative is x' * s * (1 - s), where s is the function value at
+    // the operand's value. The node is constant when the operand is.
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    struct Sigmoid : Evaluable<T, Sigmoid<ArgT, T>>
+    {
+        using ValueType = T;
+
+        static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
+
+        constexpr explicit Sigmoid(ArgT&& x) :
+            m_x(std::forward<ArgT>(x))
+        {
+        }
+
+        // Logistic function applied to the operand's value.
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
+        {
+            return value_(m_x.value(args));
+        }
+
+        // Chain rule: derivative of the operand times the derivative of the
+        // logistic function at the operand's value.
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_x.grad(args) * grad_(m_x.value(args));
+        }
+
+    private:
+        // Held by value or by const reference, see StoreValueOrRef.
+        StoreValueOrRef<ArgT> m_x;
+
+        // 1 / (1 + exp(-x)).
+        [[nodiscard]] T value_(T x) const
+        {
+            return 1.0 / (1.0 + std::exp(-x));
+        }
+
+        // s * (1 - s) with s = value_(x). The function value is computed
+        // once and reused, so std::exp is evaluated a single time per call.
+        [[nodiscard]] T grad_(T x) const
+        {
+            const T s = value_(x);
+            return s * (1.0 - s);
+        }
+    };
+
+    // Wraps the expression in a Sigmoid node. An rvalue operand is stored by
+    // value, an lvalue operand by const reference (see StoreValueOrRef). The
+    // defaulted T removes this overload from consideration when ArgT has no
+    // nested ValueType.
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    [[nodiscard]] constexpr auto sigmoid(ArgT&& x)
+    {
+        return Sigmoid<ArgT&&>(std::forward<ArgT>(x));
+    }
+
+    // Node raising its operand to a fixed exponent. The derivative follows
+    // the power rule: exponent * x^(exponent - 1) * x'. The node is constant
+    // when the operand is.
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    struct Pow : Evaluable<T, Pow<ArgT, T>>
+    {
+        static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
+
+        using ValueType = T;
+
+        constexpr explicit Pow(ArgT&& x, Id<T> exponent) :
+            m_base(std::forward<ArgT>(x)),
+            m_exponent(std::move(exponent))
+        {
+        }
+
+        template <typename... InputTs>
+        [[nodiscard]] T calculate_value(const std::tuple<InputTs...>& inputs) const
+        {
+            const auto base = m_base.value(inputs);
+            return std::pow(base, m_exponent);
+        }
+
+        template <typename... InputTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<InputTs...>& inputs) const
+        {
+            const auto base = m_base.value(inputs);
+            const auto lowered = std::pow(base, m_exponent - T(1.0));
+            return m_exponent * lowered * m_base.grad(inputs);
+        }
+
+    private:
+        // The operand is held by value or by const reference, see
+        // StoreValueOrRef; the exponent is always stored by value.
+        StoreValueOrRef<ArgT> m_base;
+        T m_exponent;
+    };
+
+    // Wraps the expression in a Pow node with the given exponent. The
+    // exponent is converted to the expression's ValueType (Id<T> keeps T from
+    // being deduced from it). An rvalue operand is stored by value, an lvalue
+    // operand by const reference (see StoreValueOrRef).
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    [[nodiscard]] constexpr auto pow(ArgT&& x, Id<T> exp)
+    {
+        return Pow<ArgT&&>(std::forward<ArgT>(x), std::move(exp));
+    }
+
+    // Node computing the natural logarithm of its operand. The derivative is
+    // x' * (1 / x). The node is constant when the operand is.
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    struct Log : Evaluable<T, Log<ArgT, T>>
+    {
+        static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
+
+        using ValueType = T;
+
+        constexpr explicit Log(ArgT&& x) :
+            m_operand(std::forward<ArgT>(x))
+        {
+        }
+
+        template <typename... InputTs>
+        [[nodiscard]] T calculate_value(const std::tuple<InputTs...>& inputs) const
+        {
+            const T x = m_operand.value(inputs);
+            return std::log(x);
+        }
+
+        template <typename... InputTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<InputTs...>& inputs) const
+        {
+            const T x = m_operand.value(inputs);
+            // The reciprocal is narrowed to T before it is multiplied.
+            const T reciprocal = 1.0 / x;
+            return m_operand.grad(inputs) * reciprocal;
+        }
+
+    private:
+        // Held by value or by const reference, see StoreValueOrRef.
+        StoreValueOrRef<ArgT> m_operand;
+    };
+
+    // Wraps the expression in a Log node. An rvalue operand is stored by
+    // value, an lvalue operand by const reference (see StoreValueOrRef). The
+    // defaulted T removes this overload from consideration when ArgT has no
+    // nested ValueType.
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    [[nodiscard]] constexpr auto log(ArgT&& x)
+    {
+        return Log<ArgT&&>(std::forward<ArgT>(x));
+    }
+
+}
+
+#endif
\ No newline at end of file
diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
new file mode 100644
index 00000000..5fe7ea1d
--- /dev/null
+++ b/src/learn/convert.cpp
@@ -0,0 +1,815 @@
+#include "convert.h"
+
+#include "uci.h"
+#include "misc.h"
+#include "thread.h"
+#include "position.h"
+#include "tt.h"
+
+#include "extra/nnue_data_binpack_format.h"
+
+#include "nnue/evaluate_nnue.h"
+
+#include "syzygy/tbprobe.h"
+
+#include <algorithm>    // std::find(), std::find_if(), std::count()
+#include <cctype>       // std::isspace()
+#include <chrono>
+#include <cmath>    // std::exp(),std::pow(),std::log()
+#include <cstring>  // memcpy()
+#include <filesystem>
+#include <fstream>
+#include <iomanip>
+#include <limits>
+#include <list>
+#include <memory>
+#include <optional>
+#include <random>
+#include <regex>
+#include <sstream>
+#include <stdexcept>    // std::logic_error
+#include <system_error> // std::error_code
+#include <unordered_set>
+
+using namespace std;
+
+namespace Learner
+{
+    // Checks whether the position that was set up from `input_fen` still
+    // reproduces it. Only the first whitespace-delimited field (the piece
+    // placement) of `input_fen` and of `pos.fen()` is compared.
+    bool fen_is_ok(Position& pos, std::string input_fen) {
+        // Returns the first whitespace-delimited token of `text`.
+        const auto first_field = [](const std::string& text) {
+            std::istringstream stream(text);
+            std::string field;
+            stream >> field;
+            return field;
+        };
+
+        // example : "2r4r/4kpp1/nb1np3/p2p3p/B2P1BP1/PP6/4NPKP/2R1R3 w - h6 0 24"
+        //       --> "2r4r/4kpp1/nb1np3/p2p3p/B2P1BP1/PP6/4NPKP/2R1R3"
+        const std::string str_pos = first_field(pos.fen());
+        const std::string str_input = first_field(input_fen);
+
+        return str_input == str_pos;
+    }
+
+    // Converts plain-text training records into binary PackedSfenValue records
+    // and appends them to `output_file_name`.
+    //
+    // Every input line starts with a keyword:
+    //   fen <FEN>     position of the record
+    //   move <uci>    move of the record
+    //   score <num>   evaluation; rescaled linearly from
+    //                 [src_score_min_value, src_score_max_value] to
+    //                 [dest_score_min_value, dest_score_max_value]
+    //   ply <n>       game ply; records outside [ply_minimum, ply_maximum] are dropped
+    //   result <n>    game result
+    //   e             end of record; the record is written unless it was filtered
+    //
+    // interpolate_eval   : when non-zero the score is replaced by
+    //                      min(3000, interpolate_eval * ply) and later multiplied
+    //                      by the game result.
+    // check_invalid_fen  : drop records whose FEN does not round-trip (see fen_is_ok).
+    // check_illegal_move : drop records whose move parses to MOVE_NONE.
+    void convert_bin(
+        const vector<string>& filenames,
+        const string& output_file_name,
+        const int ply_minimum,
+        const int ply_maximum,
+        const int interpolate_eval,
+        const int src_score_min_value,
+        const int src_score_max_value,
+        const int dest_score_min_value,
+        const int dest_score_max_value,
+        const bool check_invalid_fen,
+        const bool check_illegal_move)
+    {
+        std::cout << "check_invalid_fen=" << check_invalid_fen << std::endl;
+        std::cout << "check_illegal_move=" << check_illegal_move << std::endl;
+
+        // The "score" rescaling divides by the width of the source range.
+        if (src_score_max_value == src_score_min_value) {
+            std::cerr << "src_score_min_value and src_score_max_value must differ." << std::endl;
+            return;
+        }
+
+        auto th = Threads.main();
+        auto& tpos = th->rootPos;
+
+        // Convert plain text records to packed sfen values.
+        // The output is opened in append mode.
+        std::fstream fs;
+        fs.open(output_file_name, ios::app | ios::binary);
+        if (!fs) {
+            std::cerr << "Failed to open output file " << output_file_name << std::endl;
+            return;
+        }
+
+        const double score_limit = static_cast<double>(VALUE_MATE);
+
+        StateListPtr states;
+        for (const auto& filename : filenames) {
+            std::cout << "convert " << filename << " ... ";
+            std::string line;
+            ifstream ifs;
+            ifs.open(filename);
+            if (!ifs) {
+                std::cout << "failed to open, skipped" << std::endl;
+                continue;
+            }
+
+            // Zero the record so that fields missing from the input (and any
+            // padding) are never written out uninitialized.
+            PackedSfenValue p;
+            memset((char*)&p, 0, sizeof(PackedSfenValue));
+            p.gamePly = 1; // Not included in apery format. Should be initialized
+
+            // Per-file statistics.
+            uint64_t data_size = 0;
+            uint64_t filtered_size = 0;
+            uint64_t filtered_size_fen = 0;
+            uint64_t filtered_size_move = 0;
+            uint64_t filtered_size_ply = 0;
+
+            // Reasons for dropping the record currently being read.
+            bool ignore_flag_fen = false;
+            bool ignore_flag_move = false;
+            bool ignore_flag_ply = false;
+
+            while (std::getline(ifs, line)) {
+                std::stringstream ss(line);
+                std::string token;
+                std::string value;
+                ss >> token;
+                if (token == "fen") {
+                    // A bare "fen" line has nothing after the keyword;
+                    // substr(4) would throw std::out_of_range on it.
+                    const std::string input_fen = line.size() > 4 ? line.substr(4) : std::string();
+                    if (input_fen.empty()) {
+                        ignore_flag_fen = true;
+                        filtered_size_fen++;
+                        continue;
+                    }
+
+                    states = StateListPtr(new std::deque<StateInfo>(1)); // Drop old and create a new one
+                    tpos.set(input_fen, false, &states->back(), th);
+                    if (check_invalid_fen && !fen_is_ok(tpos, input_fen)) {
+                        ignore_flag_fen = true;
+                        filtered_size_fen++;
+                    }
+                    else {
+                        tpos.sfen_pack(p.sfen);
+                    }
+                }
+                else if (token == "move") {
+                    ss >> value;
+                    Move move = UCI::to_move(tpos, value);
+                    if (check_illegal_move && move == MOVE_NONE) {
+                        ignore_flag_move = true;
+                        filtered_size_move++;
+                    }
+                    else {
+                        p.move = move;
+                    }
+                }
+                else if (token == "score") {
+                    double score;
+                    ss >> score;
+                    // Training Formula - Issue #71 - nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
+                    // Normalize to [0.0, 1.0].
+                    score = (score - src_score_min_value) / (src_score_max_value - src_score_min_value);
+                    // Scale to [dest_score_min_value, dest_score_max_value].
+                    score = score * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
+                    // Clamp while still in floating point: converting a double
+                    // that does not fit into int32_t is undefined behavior.
+                    const double rounded = std::round(score);
+                    const double lower_limit = -score_limit;
+                    p.score = (int32_t)Math::clamp(rounded, lower_limit, score_limit);
+                }
+                else if (token == "ply") {
+                    int temp;
+                    ss >> temp;
+                    if (temp < ply_minimum || temp > ply_maximum) {
+                        ignore_flag_ply = true;
+                        filtered_size_ply++;
+                    }
+                    p.gamePly = uint16_t(temp); // No cast here?
+                    if (interpolate_eval != 0) {
+                        p.score = min(3000, interpolate_eval * temp);
+                    }
+                }
+                else if (token == "result") {
+                    int temp;
+                    ss >> temp;
+                    p.game_result = int8_t(temp); // Do you need a cast here?
+                    if (interpolate_eval) {
+                        p.score = p.score * p.game_result;
+                    }
+                }
+                else if (token == "e") {
+                    if (!(ignore_flag_fen || ignore_flag_move || ignore_flag_ply)) {
+                        fs.write((char*)&p, sizeof(PackedSfenValue));
+                        data_size += 1;
+                        // debug
+                        // std::cout<<tpos<<std::endl;
+                        // std::cout<<p.score<<","<<int(p.gamePly)<<","<<int(p.game_result)<<std::endl;
+                    }
+                    else {
+                        filtered_size++;
+                    }
+                    // The next record starts with a clean slate.
+                    ignore_flag_fen = false;
+                    ignore_flag_move = false;
+                    ignore_flag_ply = false;
+                }
+            }
+            std::cout << "done " << data_size << " parsed " << filtered_size << " is filtered"
+                << " (invalid fen:" << filtered_size_fen << ", illegal move:" << filtered_size_move << ", invalid ply:" << filtered_size_ply << ")" << std::endl;
+            ifs.close();
+        }
+        std::cout << "all done" << std::endl;
+        fs.close();
+    }
+
+    // Removes leading whitespace from `s` in place.
+    static inline void ltrim(std::string& s) {
+        // std::isspace has undefined behavior for negative arguments other than
+        // EOF, which is what a non-ASCII byte becomes when plain char is signed.
+        // Taking the character as unsigned char keeps the argument in range.
+        s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) {
+            return !std::isspace(ch);
+            }));
+    }
+
+    // Removes trailing whitespace from `s` in place.
+    static inline void rtrim(std::string& s) {
+        // std::isspace has undefined behavior for negative arguments other than
+        // EOF, which is what a non-ASCII byte becomes when plain char is signed.
+        // Taking the character as unsigned char keeps the argument in range.
+        s.erase(std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) {
+            return !std::isspace(ch);
+            }).base(), s.end());
+    }
+
+    // Strips whitespace from both ends of `s` in place.
+    static inline void trim(std::string& s) {
+        // The two ends are trimmed independently, so the order is irrelevant.
+        rtrim(s);
+        ltrim(s);
+    }
+
+    // Maps the value of a PGN Result tag (including its surrounding double
+    // quotes) to a number: "1-0" -> 1 (White win), "0-1" -> -1 (Black win),
+    // anything else -> 0 (treated as a draw).
+    int parse_game_result_from_pgn_extract(std::string result) {
+        const bool white_won = (result == "\"1-0\"");
+        const bool black_won = (result == "\"0-1\"");
+
+        if (white_won)
+            return 1;
+
+        return black_won ? -1 : 0;
+    }
+
+    // Parses an evaluation string taken from a PGN comment.
+    //
+    // 0.25 -->  0.25 * PawnValueEg
+    // #-4  --> -mate_in(4)
+    // #3   -->  mate_in(3)
+    // -M4  --> -mate_in(4)
+    // +M3  -->  mate_in(3)
+    // M3   -->  mate_in(3)
+    //
+    // `success` is set to true when `eval` could be parsed. It is set to false,
+    // and VALUE_ZERO is returned, when `eval` is empty, when a mate distance
+    // has no parsable number, or when a decimal score is not a number or is
+    // followed by extra characters.
+    Value parse_score_from_pgn_extract(std::string eval, bool& success) {
+        success = true;
+
+        // Parses the mate distance that starts at `offset`.
+        // std::stoi throws std::invalid_argument / std::out_of_range (both are
+        // std::logic_error) on malformed input; report that as a parse failure
+        // instead of letting the exception escape.
+        const auto parse_mate = [&eval, &success](std::size_t offset, bool negative) -> Value {
+            try {
+                const Value mate = mate_in(stoi(eval.substr(offset)));
+                return negative ? -mate : mate;
+            }
+            catch (const std::logic_error&) {
+                success = false;
+                return VALUE_ZERO;
+            }
+        };
+
+        if (eval.substr(0, 1) == "#") {
+            const bool negative = eval.substr(1, 1) == "-";
+            return parse_mate(negative ? 2 : 1, negative);
+        }
+        else if (eval.substr(0, 2) == "-M") {
+            return parse_mate(2, true);
+        }
+        else if (eval.substr(0, 2) == "+M") {
+            return parse_mate(2, false);
+        }
+        else if (eval.substr(0, 1) == "M") {
+            return parse_mate(1, false);
+        }
+        else {
+            char* endptr;
+            double value = strtod(eval.c_str(), &endptr);
+
+            // endptr == start means nothing was converted (e.g. an empty
+            // string), which strtod reports as 0.0 rather than as an error.
+            if (endptr == eval.c_str() || *endptr != '\0' || std::isnan(value)) {
+                success = false;
+                return VALUE_ZERO;
+            }
+
+            // Clamp before converting: turning a double that does not fit
+            // (e.g. "inf" or "1e300") into Value is undefined behavior.
+            const double limit = static_cast<double>(VALUE_MATE);
+            double scaled = value * static_cast<double>(PawnValueEg);
+            if (scaled > limit)
+                scaled = limit;
+            else if (scaled < -limit)
+                scaled = -limit;
+
+            return Value(scaled);
+        }
+    }
+
+    // for Debug
+    //#define DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT
+
+    // Cheap heuristic deciding whether `fen` looks like a FEN string:
+    // it must contain exactly 5 spaces and exactly 7 slashes.
+    bool is_like_fen(std::string fen) {
+        int count_space = 0;
+        int count_slash = 0;
+
+        // Count both separators in a single pass over the string.
+        for (const char c : fen) {
+            if (c == ' ')
+                ++count_space;
+            else if (c == '/')
+                ++count_slash;
+        }
+
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+        //std::cout << "count_space=" << count_space << std::endl;
+        //std::cout << "count_slash=" << count_slash << std::endl;
+#endif
+
+        return count_slash == 7 && count_space == 5;
+    }
+
+    // Converts pgn-extract output into binary PackedSfenValue records written
+    // to `output_file_name` (an existing file is overwritten).
+    //
+    // Lines starting with '[' are tag lines; only the Result tag is used.
+    // Any other non-empty line is treated as the move text of one game, in
+    // which every move is preceded by a "{ FEN }" comment and may be followed
+    // by a "{ eval ... }" comment.
+    //
+    // pgn_eval_side_to_move              : when false, the score is negated
+    //                                      for positions with Black to move.
+    // convert_no_eval_fens_as_score_zero : when true, positions without a
+    //                                      parsable eval are written with score 0
+    //                                      instead of being skipped.
+    void convert_bin_from_pgn_extract(
+        const vector<string>& filenames,
+        const string& output_file_name,
+        const bool pgn_eval_side_to_move,
+        const bool convert_no_eval_fens_as_score_zero)
+    {
+        std::cout << "pgn_eval_side_to_move=" << pgn_eval_side_to_move << std::endl;
+        std::cout << "convert_no_eval_fens_as_score_zero=" << convert_no_eval_fens_as_score_zero << std::endl;
+
+        // Compiling a std::regex is expensive, so every pattern is built once
+        // here instead of once per searched comment.
+        const std::regex pattern_result(R"(\[Result (.+?)\])");
+        const std::regex pattern_bracket(R"(\{(.+?)\})");
+        const std::regex pattern_move(R"(\}(.+?)\{)");
+        const std::regex pattern_eval1(R"(\[\%eval (.+?)\])");
+        const std::regex pattern_eval2(R"((.+?)\/)");
+
+        auto th = Threads.main();
+        auto& pos = th->rootPos;
+
+        std::fstream ofs;
+        ofs.open(output_file_name, ios::out | ios::binary);
+        if (!ofs) {
+            std::cerr << "Failed to open output file " << output_file_name << std::endl;
+            return;
+        }
+
+        int game_count = 0;
+        int fen_count = 0;
+
+        for (const auto& filename : filenames) {
+            std::cout << now_string() << " convert " << filename << std::endl;
+            ifstream ifs;
+            ifs.open(filename);
+            if (!ifs) {
+                std::cerr << "Failed to open input file " << filename << std::endl;
+                continue;
+            }
+
+            int game_result = 0;
+
+            std::string line;
+            while (std::getline(ifs, line)) {
+
+                if (line.empty()) {
+                    continue;
+                }
+
+                else if (line.substr(0, 1) == "[") {
+                    std::smatch match;
+
+                    // example: [Result "1-0"]
+                    if (std::regex_search(line, match, pattern_result)) {
+                        game_result = parse_game_result_from_pgn_extract(match.str(1));
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+                        std::cout << "game_result=" << game_result << std::endl;
+#endif
+                        game_count++;
+                        if (game_count % 10000 == 0) {
+                            std::cout << now_string() << " game_count=" << game_count << ", fen_count=" << fen_count << std::endl;
+                        }
+                    }
+
+                    continue;
+                }
+
+                else {
+                    int gamePly = 1;
+                    auto itr = line.cbegin();
+
+                    while (true) {
+                        gamePly++;
+
+                        PackedSfenValue psv;
+                        memset((char*)&psv, 0, sizeof(PackedSfenValue));
+
+                        // The position is handed a pointer to this state in
+                        // pos.set() and is used again further down (to_move,
+                        // side_to_move), so the state must live for the whole
+                        // iteration rather than only inside the FEN block.
+                        StateInfo si;
+
+                        // fen
+                        {
+                            bool fen_found = false;
+
+                            while (!fen_found) {
+                                std::smatch match;
+                                if (!std::regex_search(itr, line.cend(), match, pattern_bracket)) {
+                                    break;
+                                }
+
+                                // Continue from the closing brace of this comment.
+                                itr += match.position(0) + match.length(0) - 1;
+                                std::string str_fen = match.str(1);
+                                trim(str_fen);
+
+                                if (is_like_fen(str_fen)) {
+                                    fen_found = true;
+
+                                    pos.set(str_fen, false, &si, th);
+                                    pos.sfen_pack(psv.sfen);
+                                }
+
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+                                std::cout << "str_fen=" << str_fen << std::endl;
+                                std::cout << "fen_found=" << fen_found << std::endl;
+#endif
+                            }
+
+                            if (!fen_found) {
+                                break;
+                            }
+                        }
+
+                        // move
+                        {
+                            std::smatch match;
+                            if (!std::regex_search(itr, line.cend(), match, pattern_move)) {
+                                break;
+                            }
+
+                            // Continue from the opening brace of the next comment.
+                            itr += match.position(0) + match.length(0) - 1;
+                            std::string str_move = match.str(1);
+                            trim(str_move);
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+                            std::cout << "str_move=" << str_move << std::endl;
+#endif
+                            psv.move = UCI::to_move(pos, str_move);
+                        }
+
+                        // eval
+                        bool eval_found = false;
+                        {
+                            std::smatch match;
+                            if (!std::regex_search(itr, line.cend(), match, pattern_bracket)) {
+                                break;
+                            }
+
+                            std::string str_eval_clk = match.str(1);
+                            trim(str_eval_clk);
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+                            std::cout << "str_eval_clk=" << str_eval_clk << std::endl;
+#endif
+
+                            // example: { [%eval 0.25] [%clk 0:10:00] }
+                            // example: { [%eval #-4] [%clk 0:10:00] }
+                            // example: { [%eval #3] [%clk 0:10:00] }
+                            // example: { +0.71/22 1.2s }
+                            // example: { -M4/7 0.003s }
+                            // example: { M3/245 0.017s }
+                            // example: { +M1/245 0.010s, White mates }
+                            // example: { 0.60 }
+                            // example: { book }
+                            // example: { rnbqkb1r/pp3ppp/2p1pn2/3p4/2PP4/2N2N2/PP2PPPP/R1BQKB1R w KQkq - 0 5 }
+
+                            // Considering the absence of eval: when the comment is
+                            // the FEN of the next position it is left unconsumed so
+                            // that the next iteration picks it up.
+                            if (!is_like_fen(str_eval_clk)) {
+                                itr += match.position(0) + match.length(0) - 1;
+
+                                if (str_eval_clk != "book") {
+                                    std::string str_eval;
+                                    if (std::regex_search(str_eval_clk, match, pattern_eval1) ||
+                                        std::regex_search(str_eval_clk, match, pattern_eval2)) {
+                                        str_eval = match.str(1);
+                                        trim(str_eval);
+                                    }
+                                    else {
+                                        str_eval = str_eval_clk;
+                                    }
+
+                                    bool success = false;
+                                    Value value = parse_score_from_pgn_extract(str_eval, success);
+                                    if (success) {
+                                        eval_found = true;
+                                        psv.score = Math::clamp(value, -VALUE_MATE, VALUE_MATE);
+                                    }
+
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+                                    std::cout << "str_eval=" << str_eval << std::endl;
+                                    std::cout << "success=" << success << ", psv.score=" << psv.score << std::endl;
+#endif
+                                }
+                            }
+                        }
+
+                        // write
+                        if (eval_found || convert_no_eval_fens_as_score_zero) {
+                            if (!eval_found && convert_no_eval_fens_as_score_zero) {
+                                psv.score = 0;
+                            }
+
+                            psv.gamePly = gamePly;
+                            psv.game_result = game_result;
+
+                            if (pos.side_to_move() == BLACK) {
+                                if (!pgn_eval_side_to_move) {
+                                    psv.score *= -1;
+                                }
+                                psv.game_result *= -1;
+                            }
+
+                            ofs.write((char*)&psv, sizeof(PackedSfenValue));
+
+                            fen_count++;
+                        }
+                    }
+
+                    // The result belongs to the game whose moves were just read.
+                    game_result = 0;
+                }
+            }
+        }
+
+        std::cout << now_string() << " game_count=" << game_count << ", fen_count=" << fen_count << std::endl;
+        std::cout << now_string() << " all done" << std::endl;
+        ofs.close();
+    }
+
+    // Converts binary PackedSfenValue files into the plain-text record format
+    // ("fen", "move", "score", "ply", "result", "e" lines) and appends the
+    // text to `output_file_name`.
+    void convert_plain(
+        const vector<string>& filenames,
+        const string& output_file_name)
+    {
+        Position tpos;
+        std::ofstream ofs(output_file_name, ios::app);
+        auto th = Threads.main();
+
+        for (const auto& filename : filenames) {
+            std::cout << "convert " << filename << " ... ";
+
+            // Just convert packedsfenvalue to text
+            std::fstream fs(filename, ios::in | ios::binary);
+            PackedSfenValue p;
+
+            // Stop at the first record that cannot be read completely.
+            while (fs.read((char*)&p, sizeof(PackedSfenValue))) {
+                StateInfo si;
+                tpos.set_from_packed_sfen(p.sfen, &si, th);
+
+                // write as plain text
+                ofs << "fen " << tpos.fen() << std::endl;
+                ofs << "move " << UCI::move(Move(p.move), false) << std::endl;
+                ofs << "score " << p.score << std::endl;
+                ofs << "ply " << int(p.gamePly) << std::endl;
+                ofs << "result " << int(p.game_result) << std::endl;
+                ofs << "e" << std::endl;
+            }
+
+            fs.close();
+            std::cout << "done" << std::endl;
+        }
+
+        ofs.close();
+        std::cout << "all done" << std::endl;
+    }
+
+    // File name suffixes used to recognize the format of a file in
+    // get_convert_function().
+    static inline const std::string plain_extension = ".plain";
+    static inline const std::string bin_extension = ".bin";
+    static inline const std::string binpack_extension = ".binpack";
+
+    // Reports whether the file `name` can be opened for reading.
+    static bool file_exists(const std::string& name)
+    {
+        std::ifstream probe;
+        probe.open(name);
+
+        return probe.good();
+    }
+
+    // Returns true when `lhs` ends with the suffix `end`.
+    // An empty suffix matches every string.
+    static bool ends_with(const std::string& lhs, const std::string& end)
+    {
+        const std::size_t suffix_length = end.size();
+
+        return lhs.size() >= suffix_length
+            && lhs.compare(lhs.size() - suffix_length, suffix_length, end) == 0;
+    }
+
+    // Returns true when `input_path` ends with `expected_input_extension`
+    // and `output_path` ends with `expected_output_extension`.
+    static bool is_convert_of_type(
+        const std::string& input_path,
+        const std::string& output_path,
+        const std::string& expected_input_extension,
+        const std::string& expected_output_extension)
+    {
+        if (!ends_with(input_path, expected_input_extension))
+            return false;
+
+        return ends_with(output_path, expected_output_extension);
+    }
+
+    // Signature of the conversion routines returned by get_convert_function():
+    // (input path, output path, open mode of the output, validate flag).
+    using ConvertFunctionType = void(std::string inputPath, std::string outputPath, std::ios_base::openmode om, bool validate);
+
+    // Picks the conversion routine matching the extensions of the two paths.
+    // Returns nullptr when the pair of extensions is not supported.
+    static ConvertFunctionType* get_convert_function(const std::string& input_path, const std::string& output_path)
+    {
+        // One supported (input extension, output extension) pair.
+        struct Route
+        {
+            const std::string& from;
+            const std::string& to;
+            ConvertFunctionType* func;
+        };
+
+        // Checked in order; the first matching route wins.
+        const Route routes[] = {
+            { plain_extension,   bin_extension,     binpack::convertPlainToBin },
+            { plain_extension,   binpack_extension, binpack::convertPlainToBinpack },
+            { bin_extension,     plain_extension,   binpack::convertBinToPlain },
+            { bin_extension,     binpack_extension, binpack::convertBinToBinpack },
+            { binpack_extension, plain_extension,   binpack::convertBinpackToPlain },
+            { binpack_extension, bin_extension,     binpack::convertBinpackToBin },
+        };
+
+        for (const Route& route : routes)
+        {
+            if (is_convert_of_type(input_path, output_path, route.from, route.to))
+                return route.func;
+        }
+
+        return nullptr;
+    }
+
+    // Converts `input_path` into `output_path`, choosing the conversion from
+    // the two file extensions. Problems are reported on std::cerr.
+    static void convert(const std::string& input_path, const std::string& output_path, std::ios_base::openmode om, bool validate)
+    {
+        if (!file_exists(input_path))
+        {
+            std::cerr << "Input file does not exist.\n";
+            return;
+        }
+
+        ConvertFunctionType* const func = get_convert_function(input_path, output_path);
+        if (func == nullptr)
+        {
+            std::cerr << "Conversion between files of these types is not supported.\n";
+            return;
+        }
+
+        func(input_path, output_path, om, validate);
+    }
+
+    // Runs a conversion described by `args`:
+    //   args[0] input path, args[1] output path, followed by the optional
+    //   flags "append" and "validate" in any order.
+    static void convert(const std::vector<std::string>& args)
+    {
+        const bool arg_count_ok = args.size() >= 2 && args.size() <= 4;
+        if (!arg_count_ok)
+        {
+            std::cerr << "Invalid arguments.\n";
+            std::cerr << "Usage: convert from_path to_path [append] [validate]\n";
+            return;
+        }
+
+        // Looks for a flag among the arguments that follow the two paths.
+        const auto has_flag = [&args](const char* flag) {
+            return std::find(args.begin() + 2, args.end(), flag) != args.end();
+        };
+
+        const bool append = has_flag("append");
+        const bool validate = has_flag("validate");
+
+        // Without "append" an existing output file is truncated.
+        std::ios_base::openmode openmode = std::ios_base::trunc;
+        if (append)
+            openmode = std::ios_base::app;
+
+        convert(args[0], args[1], openmode, validate);
+    }
+
+    // Splits the remaining input of `is` into whitespace-separated tokens and
+    // runs the conversion they describe.
+    void convert(istringstream& is)
+    {
+        std::vector<std::string> args;
+
+        // Extraction fails once the input is exhausted, which ends the loop.
+        for (std::string token; is >> token; )
+            args.push_back(token);
+
+        convert(args);
+    }
+
+    // Appends every regular file found directly inside base_dir/target_dir to
+    // `filenames`. The appended names are relative to `base_dir`, i.e. they
+    // have the form target_dir/<file name>.
+    //
+    // A directory that cannot be enumerated is reported on std::cerr; files
+    // collected up to that point are kept.
+    static void append_files_from_dir(
+        std::vector<std::string>& filenames,
+        const std::string& base_dir,
+        const std::string& target_dir)
+    {
+        namespace sys = std::filesystem;
+
+        const string kif_base_dir = Path::combine(base_dir, target_dir);
+        const sys::path p(kif_base_dir); // Origin of enumeration
+
+        // The std::error_code overloads are used throughout: the throwing
+        // ones would terminate the program on a missing or unreadable directory.
+        std::error_code ec;
+        sys::directory_iterator it(p, ec);
+        const sys::directory_iterator end;
+
+        while (!ec && it != end)
+        {
+            const sys::path& path = it->path();
+
+            // An entry whose status cannot be queried is simply not a regular file.
+            std::error_code status_ec;
+            if (sys::is_regular_file(path, status_ec))
+                filenames.push_back(Path::combine(target_dir, path.filename().generic_string()));
+
+            it.increment(ec);
+        }
+
+        if (ec)
+        {
+            std::cerr << "Cannot enumerate directory " << kif_base_dir
+                << ": " << ec.message() << std::endl;
+        }
+    }
+
+    // Replaces every entry of `filenames` by Path::combine(base_dir, entry).
+    static void rebase_files(
+        std::vector<std::string>& filenames,
+        const std::string& base_dir)
+    {
+        for (std::size_t i = 0; i < filenames.size(); ++i)
+            filenames[i] = Path::combine(base_dir, filenames[i]);
+    }
+
+    // Command entry point for converting pgn-extract output into binary
+    // training data. Reads "name value" option pairs from `is`:
+    //   targetdir, targetfile (repeatable), basedir,
+    //   pgn_eval_side_to_move, convert_no_eval_fens_as_score_zero,
+    //   output_file_name
+    // and then runs the conversion on the collected files.
+    void convert_bin_from_pgn_extract(std::istringstream& is)
+    {
+        // Defaults used for options that are not supplied.
+        string output_file_name = "shuffled_sfen.bin";
+        bool pgn_eval_side_to_move = false;
+        bool convert_no_eval_fens_as_score_zero = false;
+
+        string base_dir;
+        string target_dir;
+        std::vector<std::string> filenames;
+
+        // Parsing stops as soon as no further option name can be read.
+        for (string option; is >> option; )
+        {
+            if (option == "targetfile")
+            {
+                std::string filename;
+                is >> filename;
+                filenames.push_back(filename);
+            }
+            else if (option == "targetdir")
+            {
+                is >> target_dir;
+            }
+            else if (option == "basedir")
+            {
+                is >> base_dir;
+            }
+            else if (option == "output_file_name")
+            {
+                is >> output_file_name;
+            }
+            else if (option == "pgn_eval_side_to_move")
+            {
+                is >> pgn_eval_side_to_move;
+            }
+            else if (option == "convert_no_eval_fens_as_score_zero")
+            {
+                is >> convert_no_eval_fens_as_score_zero;
+            }
+            else
+            {
+                cout << "Unknown option: " << option << ". Ignoring.\n";
+            }
+        }
+
+        // Files from the target directory come after the explicitly listed
+        // ones; all of them are then made relative to the base directory.
+        const bool scan_directory = !target_dir.empty();
+        if (scan_directory)
+        {
+            append_files_from_dir(filenames, base_dir, target_dir);
+        }
+        rebase_files(filenames, base_dir);
+
+        Eval::NNUE::init();
+
+        cout << "convert_bin_from_pgn-extract.." << endl;
+        convert_bin_from_pgn_extract(
+            filenames,
+            output_file_name,
+            pgn_eval_side_to_move,
+            convert_no_eval_fens_as_score_zero);
+    }
+
+    // Command entry point for converting plain-text training data into binary
+    // training data. Reads "name value" option pairs from `is`:
+    //   targetdir, targetfile (repeatable), basedir,
+    //   ply_minimum, ply_maximum, interpolate_eval,
+    //   check_invalid_fen, check_illegal_move,
+    //   src_score_min_value, src_score_max_value,
+    //   dest_score_min_value, dest_score_max_value,
+    //   output_file_name
+    // and then runs the conversion on the collected files.
+    // pgn_eval_side_to_move and convert_no_eval_fens_as_score_zero are accepted
+    // but not used by this command.
+    void convert_bin(std::istringstream& is)
+    {
+        std::vector<std::string> filenames;
+
+        string base_dir;
+        string target_dir;
+
+        int ply_minimum = 0;
+        int ply_maximum = 114514;
+        // This is an integer factor (the score becomes interpolate_eval * ply),
+        // not a flag. Reading it into a bool would set failbit for any value
+        // other than 0 or 1 and silently end option parsing.
+        int interpolate_eval = 0;
+        bool check_invalid_fen = false;
+        bool check_illegal_move = false;
+
+        bool pgn_eval_side_to_move = false;
+        bool convert_no_eval_fens_as_score_zero = false;
+
+        // NOTE: the conversion routine takes the four score bounds as int, so
+        // any fractional part given here is truncated when they are passed on.
+        double src_score_min_value = 0.0;
+        double src_score_max_value = 1.0;
+        double dest_score_min_value = 0.0;
+        double dest_score_max_value = 1.0;
+
+        string output_file_name = "shuffled_sfen.bin";
+
+        while (true)
+        {
+            string option;
+            is >> option;
+
+            // An empty token means the input is exhausted (or unreadable).
+            if (option == "")
+                break;
+
+            if (option == "targetdir") is >> target_dir;
+            else if (option == "targetfile")
+            {
+                std::string filename;
+                is >> filename;
+                filenames.push_back(filename);
+            }
+
+            else if (option == "basedir")   is >> base_dir;
+
+            else if (option == "ply_minimum") is >> ply_minimum;
+            else if (option == "ply_maximum") is >> ply_maximum;
+            else if (option == "interpolate_eval") is >> interpolate_eval;
+            else if (option == "check_invalid_fen") is >> check_invalid_fen;
+            else if (option == "check_illegal_move") is >> check_illegal_move;
+            else if (option == "pgn_eval_side_to_move") is >> pgn_eval_side_to_move;
+            else if (option == "convert_no_eval_fens_as_score_zero") is >> convert_no_eval_fens_as_score_zero;
+            else if (option == "src_score_min_value") is >> src_score_min_value;
+            else if (option == "src_score_max_value") is >> src_score_max_value;
+            else if (option == "dest_score_min_value") is >> dest_score_min_value;
+            else if (option == "dest_score_max_value") is >> dest_score_max_value;
+            else if (option == "output_file_name") is >> output_file_name;
+            else
+            {
+                cout << "Unknown option: " << option << ". Ignoring.\n";
+            }
+        }
+
+        // Files from the target directory come after the explicitly listed
+        // ones; all of them are then made relative to the base directory.
+        if (!target_dir.empty())
+        {
+            append_files_from_dir(filenames, base_dir, target_dir);
+        }
+        rebase_files(filenames, base_dir);
+
+        Eval::NNUE::init();
+
+        cout << "convert_bin.." << endl;
+        convert_bin(
+            filenames,
+            output_file_name,
+            ply_minimum,
+            ply_maximum,
+            interpolate_eval,
+            src_score_min_value,
+            src_score_max_value,
+            dest_score_min_value,
+            dest_score_max_value,
+            check_invalid_fen,
+            check_illegal_move
+        );
+    }
+
+    // Command entry point for converting binary training data into plain
+    // text. Reads "name value" option pairs from `is`:
+    //   targetdir, targetfile (repeatable), basedir, output_file_name
+    // and then runs the conversion on the collected files.
+    void convert_plain(std::istringstream& is)
+    {
+        // Default used when output_file_name is not supplied.
+        string output_file_name = "shuffled_sfen.bin";
+
+        string base_dir;
+        string target_dir;
+        std::vector<std::string> filenames;
+
+        // Parsing stops as soon as no further option name can be read.
+        for (string option; is >> option; )
+        {
+            if (option == "targetfile")
+            {
+                std::string filename;
+                is >> filename;
+                filenames.push_back(filename);
+            }
+            else if (option == "targetdir")
+            {
+                is >> target_dir;
+            }
+            else if (option == "basedir")
+            {
+                is >> base_dir;
+            }
+            else if (option == "output_file_name")
+            {
+                is >> output_file_name;
+            }
+            else
+            {
+                cout << "Unknown option: " << option << ". Ignoring.\n";
+            }
+        }
+
+        // Files from the target directory come after the explicitly listed
+        // ones; all of them are then made relative to the base directory.
+        const bool scan_directory = !target_dir.empty();
+        if (scan_directory)
+        {
+            append_files_from_dir(filenames, base_dir, target_dir);
+        }
+        rebase_files(filenames, base_dir);
+
+        Eval::NNUE::init();
+
+        cout << "convert_plain.." << endl;
+        convert_plain(filenames, output_file_name);
+    }
+}
diff --git a/src/learn/convert.h b/src/learn/convert.h
new file mode 100644
index 00000000..227f0799
--- /dev/null
+++ b/src/learn/convert.h
@@ -0,0 +1,18 @@
+#ifndef _CONVERT_H_
+#define _CONVERT_H_
+
+#include <vector>
+#include <string>
+#include <sstream>
+
+// Command entry points for converting training data between formats.
+// Each function reads its arguments from the remaining input of `is`.
+namespace Learner {
+    // Usage: from_path to_path [append] [validate]
+    // Converts between .plain, .bin and .binpack files; the conversion is
+    // chosen from the extensions of the two paths.
+    void convert(std::istringstream& is);
+
+    // Converts pgn-extract output into binary training data.
+    void convert_bin_from_pgn_extract(std::istringstream& is);
+
+    // Converts plain-text training data into binary training data.
+    void convert_bin(std::istringstream& is);
+
+    // Converts binary training data into plain text.
+    void convert_plain(std::istringstream& is);
+}
+
+#endif
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
new file mode 100644
index 00000000..5f8bbba1
--- /dev/null
+++ b/src/learn/gensfen.cpp
@@ -0,0 +1,962 @@
+﻿#include "gensfen.h"
+
+#include "sfen_writer.h"
+#include "packed_sfen.h"
+#include "opening_book.h"
+
+#include "misc.h"
+#include "position.h"
+#include "thread.h"
+#include "tt.h"
+#include "uci.h"
+
+#include "extra/nnue_data_binpack_format.h"
+
+#include "nnue/evaluate_nnue.h"
+#include "nnue/evaluate_nnue_learner.h"
+
+#include "syzygy/tbprobe.h"
+
+#include <atomic>
+#include <chrono>
+#include <climits>
+#include <cmath>
+#include <cstring>
+#include <filesystem>
+#include <fstream>
+#include <iomanip>
+#include <limits>
+#include <list>
+#include <memory>
+#include <optional>
+#include <random>
+#include <shared_mutex>
+#include <sstream>
+#include <unordered_set>
+
+using namespace std;
+
+namespace Learner
+{
+    // Generates training sfens by self-play on the worker threads
+    // and writes them out through a SfenWriter.
+    struct Gensfen
+    {
+        struct Params
+        {
+            // Min and max depths for search during gensfen
+            int search_depth_min = 3;
+            int search_depth_max = -1;
+
+            // Number of the nodes to be searched.
+            // 0 represents no limits.
+            uint64_t nodes = 0;
+
+            // Upper limit of evaluation value of generated situation
+            int eval_limit = 3000;
+
+            // First and last ply (1-origin) at which a random move may be played
+            // and the number of random moves per game. With random_move_minply == -1
+            // the random moves are played back to back from the start of the game.
+            int random_move_minply = 1;
+            int random_move_maxply = 24;
+            int random_move_count = 5;
+
+            // Apery-like random moves: with a probability of 1/N a random move
+            // becomes a king move (if there is one), and after such a king move
+            // there is a 1/2 chance that the next ply is played randomly too.
+            // Apery has N=2. Specifying 0 here disables this function.
+            int random_move_like_apery = 0;
+
+            // For when using multi pv instead of random move.
+            // random_multi_pv is the number of candidates for MultiPV,
+            // 0 means that plain random legal moves are played instead.
+            // A candidate is only eligible if its score is within
+            // random_multi_pv_diff of the score of the best move.
+            // random_multi_pv_depth is the search depth for MultiPV.
+            int random_multi_pv = 0;
+            int random_multi_pv_diff = 32000;
+            int random_multi_pv_depth = -1;
+
+            // The minimum and maximum ply (number of steps from
+            // the initial phase) of the sfens to write out.
+            int write_minply = 16;
+            int write_maxply = 400;
+
+            uint64_t save_every = std::numeric_limits<uint64_t>::max();
+
+            std::string output_file_name = "generated_kifu";
+
+            SfenOutputType sfen_format = SfenOutputType::Binpack;
+
+            std::string seed;
+
+            bool write_out_draw_game_in_training_data_generation = true;
+            bool detect_draw_by_consecutive_low_score = true;
+            bool detect_draw_by_insufficient_mating_material = true;
+
+            // If set, a position with a non-empty qsearch PV is replaced
+            // by the position at the end of that PV before it is stored.
+            bool ensure_quiet = false;
+
+            // Overwritten with the "Threads" option by enforce_constraints().
+            // Initialized so that a Params object can be copied safely even
+            // if that function has not been called yet.
+            uint64_t num_threads = 1;
+
+            // Opening book to take start positions from. Empty: use startpos.
+            std::string book;
+
+            // Brings dependent options into a consistent state
+            // and reads the thread count from the UCI options.
+            void enforce_constraints()
+            {
+                search_depth_max = std::max(search_depth_min, search_depth_max);
+                random_multi_pv_depth = std::max(search_depth_min, random_multi_pv_depth);
+
+                // Cap the limit at mate_in(2). (Otherwise you might not end the loop)
+                eval_limit = std::min(eval_limit, (int)mate_in(2));
+
+                save_every = std::max(save_every, REPORT_STATS_EVERY);
+
+                num_threads = Options["Threads"];
+            }
+        };
+
+        // Hash to limit the export of identical sfens
+        static constexpr uint64_t GENSFEN_HASH_SIZE = 64 * 1024 * 1024;
+        // It must be 2**N because it will be used as the mask to calculate hash_index.
+        static_assert((GENSFEN_HASH_SIZE & (GENSFEN_HASH_SIZE - 1)) == 0);
+
+        // Progress output: a dot and a statistics line every that many sfens.
+        static constexpr uint64_t REPORT_DOT_EVERY = 5000;
+        static constexpr uint64_t REPORT_STATS_EVERY = 200000;
+        static_assert(REPORT_STATS_EVERY % REPORT_DOT_EVERY == 0);
+
+        Gensfen(const Params& prm) :
+            params(prm),
+            prng(prm.seed),
+            sfen_writer(prm.output_file_name, prm.num_threads, prm.save_every, prm.sfen_format)
+        {
+            hash.resize(GENSFEN_HASH_SIZE);
+
+            if (!prm.book.empty())
+            {
+                opening_book = open_opening_book(prm.book, prng);
+                if (opening_book == nullptr)
+                    std::cout << "WARNING: Failed to open opening book " << prm.book << ". Falling back to startpos.\n";
+            }
+
+            // Output seed to verify by the user if it's not identical by chance.
+            std::cout << prng << std::endl;
+        }
+
+        // Runs the generation on the worker threads, `limit` is the number
+        // of sfens to generate. Returns when all workers are finished.
+        void generate(uint64_t limit);
+
+    private:
+        Params params;
+
+        PRNG prng;
+
+        // Protects the progress output in maybe_report().
+        std::mutex stats_mutex;
+        TimePoint last_stats_report_time;
+
+        // sfen exporter
+        SfenWriter sfen_writer;
+
+        SynchronizedRegionLogger::Region out;
+
+        vector<Key> hash; // 64MB*sizeof(HASH_KEY) = 512MB
+
+        std::unique_ptr<OpeningBook> opening_book;
+
+        static void set_gensfen_search_limits();
+
+        void generate_worker(Thread& th, std::atomic<uint64_t>& counter, uint64_t limit);
+
+        bool was_seen_before(const Position& pos);
+
+        optional<int8_t> get_current_game_result(
+            Position& pos,
+            const vector<int>& move_hist_scores) const;
+
+        vector<uint8_t> generate_random_move_flags();
+
+        optional<Move> choose_random_move(
+            Position& pos, std::vector<uint8_t>& random_move_flag,
+            int ply, int& random_move_c);
+
+        // `result` is the outcome (1 win, -1 loss, 0 draw) as seen by `result_color`.
+        bool commit_psv(
+            Thread& th, PSVector& sfens, int8_t result,
+            std::atomic<uint64_t>& counter, uint64_t limit, Color result_color);
+
+        void report(uint64_t done, uint64_t new_done);
+
+        void maybe_report(uint64_t done);
+    };
+
+    void Gensfen::set_gensfen_search_limits()
+    {
+        // Search::Limits is a global object, so whatever is set here
+        // is seen by every thread. Handle with care.
+
+        // No time management: behave like the "go infinite" command.
+        Search::Limits.infinite = true;
+
+        // Keep the search quiet, PV output would only get in the way.
+        Search::Limits.silent = true;
+
+        // Left at zero on purpose: this limit would be compared against
+        // the nodes accumulated over all threads.
+        Search::Limits.nodes = 0;
+
+        // The depth is passed as an argument of each search call instead.
+        Search::Limits.depth = 0;
+    }
+
+    // Runs generate_worker() on the worker threads with a shared counter and
+    // `limit`, flushes the writer and prints the report for the last batch.
+    void Gensfen::generate(uint64_t limit)
+    {
+        // Zero marks "nothing reported yet", maybe_report() checks for it.
+        last_stats_report_time = 0;
+        set_gensfen_search_limits();
+
+        // Shared between all workers, passed on to commit_psv().
+        std::atomic<uint64_t> num_done{0};
+        Threads.execute_with_workers([this, limit, &num_done](Thread& worker) {
+            generate_worker(worker, num_done, limit);
+        });
+        Threads.wait_for_workers_finished();
+        sfen_writer.flush();
+
+        // A multiple of REPORT_STATS_EVERY leaves no remainder to report.
+        if (const uint64_t rest = limit % REPORT_STATS_EVERY; rest != 0)
+            report(limit, rest);
+        std::cout << std::endl;
+    }
+
+    void Gensfen::generate_worker(
+        Thread& th,
+        std::atomic<uint64_t>& counter,
+        uint64_t limit)
+    {
+        // Plays games on thread `th` and commits their positions until
+        // commit_psv() reports that `limit` has been reached.
+        // StateInfo storage for the moves played (game plies plus qsearch PV walk).
+        std::vector<StateInfo, AlignedAllocator<StateInfo>> states(
+            params.write_maxply + MAX_PLY /* == search_depth_min + α */);
+
+        StateInfo si;
+
+        // End flag, set from the return value of commit_psv() in flush_psv.
+        bool quit = false;
+
+        // One iteration per game, repeated until the end flag is set.
+        while (!quit)
+        {
+            // Every worker plays on the root position of its own thread.
+            // The position is tied to that thread by passing &th to set(),
+            // get_current_game_result() later reads the root moves through it.
+            auto& pos = th.rootPos;
+            if (opening_book != nullptr)
+            {
+                auto& fen = opening_book->next_fen();
+                pos.set(fen, false, &si, &th);
+            }
+            else
+            {
+                pos.set(StartFEN, false, &si, &th);
+            }
+
+            int resign_counter = 0;
+            bool should_resign = prng.rand(10) > 1;
+            // Vector for holding the sfens in the current simulated game.
+            PSVector packed_sfens;
+            packed_sfens.reserve(params.write_maxply + MAX_PLY);
+
+            // Precomputed flags. Used internally by choose_random_move.
+            vector<uint8_t> random_move_flag = generate_random_move_flags();
+
+            // A counter that keeps track of the number of random moves
+            // When random_move_minply == -1, random moves are
+            // performed continuously, so use it at this time.
+            // Used internally by choose_random_move.
+            int actual_random_move_count = 0;
+
+            // Save history of move scores for adjudication
+            vector<int> move_hist_scores;
+
+            auto flush_psv = [&](int8_t result) {
+                quit = commit_psv(th, packed_sfens, result, counter, limit, pos.side_to_move());
+            };
+
+            for (int ply = 0; ; ++ply)
+            {
+                // Current search depth
+                const int depth = params.search_depth_min + (int)prng.rand(params.search_depth_max - params.search_depth_min + 1);
+
+                // Starting search calls init_for_search
+                auto [search_value, search_pv] = Search::search(pos, depth, 1, params.nodes);
+
+                // This has to be performed after search because it needs to know
+                // rootMoves which are filled in init_for_search.
+                const auto result = get_current_game_result(pos, move_hist_scores);
+                if (result.has_value())
+                {
+                    flush_psv(result.value());
+                    break;
+                }
+
+                // Always adjudicate by eval limit.
+                // Also because of this we don't have to check for TB/MATE scores
+                if (abs(search_value) >= params.eval_limit)
+                {
+                    resign_counter++;
+                    if ((should_resign && resign_counter >= 4) || abs(search_value) >= VALUE_KNOWN_WIN) {
+                        flush_psv((search_value >= params.eval_limit) ? 1 : -1);
+                        break;
+                    }
+                }
+                else
+                {
+                    resign_counter = 0;
+                }
+
+                // In case there is no PV and the game was not ended here
+                // there is nothing we can do, we can't continue the game,
+                // we don't know the result, so discard this game.
+                if (search_pv.empty())
+                {
+                    break;
+                }
+
+                // Save the move score for adjudication.
+                move_hist_scores.push_back(search_value);
+
+                // Discard stuff before write_minply is reached
+                // because it can harm training due to overfitting.
+                // Initial positions would be too common.
+                if (ply >= params.write_minply)
+                {
+                    packed_sfens.emplace_back(PackedSfenValue());
+
+                    auto& psv = packed_sfens.back();
+
+                    if (params.ensure_quiet)
+                    {
+                        auto [qsearch_value, qsearch_pv] = Search::qsearch(pos);
+                        if (qsearch_pv.empty())
+                        {
+                            // Here we only write the position data.
+                            // Result is added after the whole game is done.
+                            pos.sfen_pack(psv.sfen);
+
+                            // Already a quiet position
+                            psv.score = search_value;
+                            psv.move = search_pv[0];
+                            psv.gamePly = ply;
+                        }
+                        else
+                        {
+                            // Navigate to a quiet position by playing the qsearch PV.
+                            int old_ply = ply;
+                            for (auto m : qsearch_pv)
+                            {
+                                pos.do_move(m, states[ply++]);
+                            }
+
+                            if (was_seen_before(pos))
+                            {
+                                // Already seen: drop the entry added above.
+                                packed_sfens.pop_back();
+                            }
+                            else
+                            {
+                                // Reevaluate
+                                auto [quiet_search_value, quiet_search_pv] = Search::search(pos, depth, 1, params.nodes);
+                                if (quiet_search_pv.empty())
+                                {
+                                    // No PV for the quiet position: drop the entry added above.
+                                    packed_sfens.pop_back();
+                                }
+                                else
+                                {
+                                    // Here we only write the position data.
+                                    // Result is added after the whole game is done.
+                                    pos.sfen_pack(psv.sfen);
+
+                                    psv.score = quiet_search_value;
+                                    psv.move = quiet_search_pv[0];
+                                    psv.gamePly = ply;
+                                }
+                            }
+
+                            // Get back to the game
+                            for (auto it = qsearch_pv.rbegin(); it != qsearch_pv.rend(); ++it)
+                            {
+                                pos.undo_move(*it);
+                            }
+                            ply = old_ply;
+                        }
+                    }
+                    else
+                    {
+                        if (was_seen_before(pos))
+                        {
+                            packed_sfens.pop_back();
+                        }
+                        else
+                        {
+                            // Here we only write the position data.
+                            // Result is added after the whole game is done.
+                            pos.sfen_pack(psv.sfen);
+
+                            psv.score = search_value;
+                            psv.move = search_pv[0];
+                            psv.gamePly = ply;
+                        }
+                    }
+                }
+
+                // Update the next move according to best search result or random move.
+                auto random_move = choose_random_move(pos, random_move_flag, ply, actual_random_move_count);
+                const Move next_move = random_move.has_value() ? *random_move : search_pv[0];
+
+                // We don't have the whole game yet, but it ended,
+                // so the writing process ends and the next game starts.
+                // This shouldn't really happen.
+                if (!is_ok(next_move))
+                {
+                    break;
+                }
+
+                // Do move.
+                pos.do_move(next_move, states[ply]);
+            }
+        }
+    }
+
+    // Looks the position up in the table of seen keys and records it.
+    // Returns true only if the slot of the position already held
+    // exactly the same key.
+    bool Gensfen::was_seen_before(const Position& pos)
+    {
+        // The table has one key per slot and no collision handling:
+        // a different key mapping to the same slot is simply overwritten.
+        // This is a good heuristic to exclude already seen
+        // positions without many false positives.
+        const auto key = pos.key();
+
+        // GENSFEN_HASH_SIZE is a power of two, so the mask picks the slot.
+        auto& slot = hash[(size_t)(key & (GENSFEN_HASH_SIZE - 1))];
+        if (slot == key)
+            return true;
+
+        // Replace with the current key.
+        slot = key;
+        return false;
+    }
+
+    // Decides whether the game is over in the current position.
+    // Returns 0 for a draw and -1 if the side to move is mated,
+    // or nullopt if the game goes on.
+    // Has to be called after a search on `pos`, because it reads
+    // the root moves of the thread that owns the position.
+    // `move_hist_scores` holds the search scores of the plies played so far.
+    optional<int8_t> Gensfen::get_current_game_result(
+        Position& pos,
+        const vector<int>& move_hist_scores) const
+    {
+        // Settings of the draw adjudication.
+        // Todo: Make this as an option.
+
+        // Adjudication does not start before this ply.
+        constexpr int adj_draw_ply = 80;
+
+        // Number of trailing scores to check, 4 for each side.
+        constexpr int adj_draw_cnt = 8;
+
+        // Largest absolute score (in CP) that counts as a draw score.
+        constexpr int adj_draw_score = 0;
+
+        // The indexing below needs at least adj_draw_cnt scores,
+        // which the ply threshold guarantees as long as this holds.
+        static_assert(adj_draw_cnt <= adj_draw_ply);
+
+        // The number of scores recorded so far is used as the ply.
+        const int ply = move_hist_scores.size();
+
+        // For the time being, it will be treated as a
+        // draw at the maximum number of steps to write.
+        if (ply >= params.write_maxply)
+            return 0;
+
+        // Draw reported by the position itself.
+        if (pos.is_draw(ply))
+            return 0;
+
+        // If there is no legal move
+        if (pos.this_thread()->rootMoves.empty())
+        {
+            if (pos.checkers())
+                return -1; // mate
+
+            return 0; // stalemate
+        }
+
+        // Adjudicate game to a draw if the last 4 scores of each engine is 0.
+        if (params.detect_draw_by_consecutive_low_score && ply >= adj_draw_ply)
+        {
+            bool all_within_draw_score = true;
+
+            // Newest score first. Draw scores must happen on
+            // consecutive plies, so stop at the first other score.
+            for (int i = 1; all_within_draw_score && i <= adj_draw_cnt; ++i)
+            {
+                const int score = move_hist_scores[ply - i];
+                all_within_draw_score = abs(score) <= adj_draw_score;
+            }
+
+            if (all_within_draw_score)
+                return 0;
+        }
+
+        // Draw by insufficient mating material
+        if (!params.detect_draw_by_insufficient_mating_material)
+            return nullopt;
+
+        switch (pos.count<ALL_PIECES>())
+        {
+        // (1) KvK
+        case 2:
+            return 0;
+
+        // (2) KvK + 1 minor piece
+        case 3:
+        {
+            const int minor_pc =
+                pos.count<BISHOP>(WHITE) + pos.count<KNIGHT>(WHITE) +
+                pos.count<BISHOP>(BLACK) + pos.count<KNIGHT>(BLACK);
+
+            if (minor_pc == 1)
+                return 0;
+
+            break;
+        }
+
+        // (3) KBvKB, bishops of the same color
+        case 4:
+        {
+            if (pos.count<BISHOP>(WHITE) != 1 || pos.count<BISHOP>(BLACK) != 1)
+                break;
+
+            const auto white_bishops = pos.pieces(WHITE, BISHOP);
+            const auto black_bishops = pos.pieces(BLACK, BISHOP);
+
+            // Both bishops are on dark squares.
+            const bool both_dark =
+                (white_bishops & DarkSquares) && (black_bishops & DarkSquares);
+
+            // Both bishops are on light squares.
+            const bool both_light =
+                (white_bishops & ~DarkSquares) && (black_bishops & ~DarkSquares);
+
+            if (both_dark || both_light)
+                return 0;
+
+            break;
+        }
+
+        // Other piece counts are not adjudicated here.
+        default:
+            break;
+        }
+
+        return nullopt;
+    }
+
+    // Builds the table of per-ply flags that choose_random_move() consults:
+    // a non-zero entry at index `ply` (0-origin) requests a random move there.
+    // random_move_count plies are drawn, without repetition, from the range
+    // given by random_move_minply and random_move_maxply
+    // (fewer if that range does not hold as many plies).
+    vector<uint8_t> Gensfen::generate_random_move_flags()
+    {
+        // The options come straight from the user. A negative value would turn
+        // into a huge size_t in reserve() (which then throws std::length_error)
+        // and into a wrong size for the flags, so it is treated as zero.
+        const int maxply = std::max(params.random_move_maxply, 0);
+        const int count = std::max(params.random_move_count, 0);
+
+        // The candidate plies. random_move_minply and random_move_maxply are
+        // specified by 1 origin, note that we are handling 0 origin here.
+        vector<int> a;
+        a.reserve((size_t)maxply);
+        for (int i = std::max(params.random_move_minply - 1, 0); i < maxply; ++i)
+            a.push_back(i);
+
+        // In case of Apery random move, insert() may be called up to
+        // random_move_count times, so make room for that many extra flags.
+        vector<uint8_t> random_move_flag((size_t)maxply + (size_t)count);
+
+        // Fisher-Yates shuffle of which only the first N steps are done,
+        // because only N plies are wanted. N is limited by the number of
+        // candidates, more random moves than that cannot be placed.
+        const int n = std::min(count, (int)a.size());
+        for (int i = 0; i < n; ++i)
+        {
+            swap(a[i], a[prng.rand((uint64_t)a.size() - i) + i]);
+            random_move_flag[a[i]] = true;
+        }
+
+        return random_move_flag;
+    }
+
+    // Returns the move to play instead of the best move of the search if
+    // the current ply is due for a random move, an empty optional otherwise.
+    // `random_move_flag` may get a flag inserted for the following ply and
+    // `random_move_c` is incremented whenever a random move is chosen.
+    optional<Move> Gensfen::choose_random_move(
+        Position& pos,
+        std::vector<uint8_t>& random_move_flag,
+        int ply,
+        int& random_move_c)
+    {
+        // 1. Random move of random_move_count times from random_move_minply to random_move_maxply
+        const bool flagged_ply =
+            params.random_move_minply != -1
+            && ply < (int)random_move_flag.size()
+            && random_move_flag[ply];
+
+        // 2. A mode to perform random move of random_move_count times after leaving the startpos
+        const bool still_in_random_opening =
+            params.random_move_minply == -1
+            && random_move_c < params.random_move_count;
+
+        if (!flagged_ply && !still_in_random_opening)
+            return nullopt;
+
+        ++random_move_c;
+
+        // It's not a mate, so there should be one legal move...
+        if (params.random_multi_pv != 0)
+        {
+            Search::search(pos, params.random_multi_pv_depth, params.random_multi_pv);
+
+            // Select one from the top N hands of root Moves
+            auto& root_moves = pos.this_thread()->rootMoves;
+            uint64_t num_candidates = min((uint64_t)root_moves.size(), (uint64_t)params.random_multi_pv);
+
+            // The difference from the evaluation value of the best move must
+            // be within the range of random_multi_pv_diff.
+            // It can be assumed that the scores are arranged in descending order,
+            // so the candidates end in front of the first move that is too far off.
+            for (uint64_t i = 1; i < num_candidates; ++i)
+            {
+                if (root_moves[0].score > root_moves[i].score + params.random_multi_pv_diff)
+                {
+                    num_candidates = i;
+                    break;
+                }
+            }
+
+            return root_moves[prng.rand(num_candidates)].pv[0];
+        }
+
+        // Normal random move
+        MoveList<LEGAL> list(pos);
+
+        // Normally one move from legal move
+        const auto any_legal_move = [&]() -> Move {
+            return list.at((size_t)prng.rand((uint64_t)list.size()));
+        };
+
+        // I don't really know the goodness and badness of making this the Apery method.
+        const bool plain_random =
+            params.random_move_like_apery == 0
+            || prng.rand(params.random_move_like_apery) != 0;
+
+        if (plain_random)
+            return any_legal_move();
+
+        // if you can move the king, move the king
+        Move king_moves[8]; // Near 8
+        size_t num_king_moves = 0;
+        for (auto& m : list)
+        {
+            if (type_of(pos.moved_piece(m)) == KING)
+                king_moves[num_king_moves++] = m;
+        }
+
+        // No king move at hand: one move from legal move after all.
+        if (num_king_moves == 0)
+            return any_legal_move();
+
+        // move to move the king
+        const Move king_move = king_moves[prng.rand(num_king_moves)];
+
+        // In Apery method, at this time there is a 1/2 chance that the
+        // opponent will also move randomly: add a "1" next to random_move_flag[ply].
+        if (prng.rand(2) == 0)
+            random_move_flag.insert(random_move_flag.begin() + ply + 1, 1, true);
+
+        return king_move;
+    }
+
+    // Stamps every sfen of a finished game with the game result
+    // and hands the sfens over to the writer.
+    // result: outcome of the game as seen by result_color,
+    // 1 when winning. -1 when losing. Pass 0 for a draw.
+    // Return value: true if the specified number of
+    // sfens has already been reached and the process ends.
+    bool Gensfen::commit_psv(
+        Thread& th,
+        PSVector& sfens,
+        int8_t result,
+        std::atomic<uint64_t>& counter,
+        uint64_t limit,
+        Color result_color)
+    {
+        // Drawn games may be dropped. We didn't write anything then, so why quit.
+        const bool keep_draws = params.write_out_draw_game_in_training_data_generation;
+        if (result == 0 && !keep_draws)
+            return false;
+
+        // Give each sfen the outcome of the game from the point of
+        // view of its own side to move. The side to move is packed
+        // as the lowest bit of the first byte of the sfen.
+        for (auto& psv : sfens)
+        {
+            const auto side_to_move = (Color)(psv.sfen.data[0] & 1);
+            if (side_to_move == result_color)
+                psv.game_result = result;
+            else
+                psv.game_result = -result;
+        }
+
+        // Write sfens in move order to make potential compression easier
+        for (auto& psv : sfens)
+        {
+            // Every sfen has to claim a slot first. When all slots
+            // are taken there is already enough data generated.
+            const uint64_t slot = counter.fetch_add(1);
+            if (slot >= limit)
+                return true;
+
+            // `slot` sfens were done before this one, which makes one more.
+            maybe_report(slot + 1);
+
+            // Write out one sfen.
+            sfen_writer.write(th.thread_idx(), psv);
+        }
+
+        return false;
+    }
+
+    // Prints one statistics line to the current output region and opens
+    // a fresh region. `new_done` is the number of sfens since the last report.
+    void Gensfen::report(uint64_t done, uint64_t new_done)
+    {
+        const auto current_time = now();
+        // The +1 keeps the division below away from zero.
+        const TimePoint elapsed = current_time - last_stats_report_time + 1;
+        const auto sfens_per_second = new_done * 1000 / elapsed;
+
+        out << endl << done << " sfens, " << sfens_per_second << " sfens/second, "
+            << "at " << now_string() << sync_endl;
+
+        last_stats_report_time = current_time;
+        out = sync_region_cout.new_region();
+    }
+
+    // Progress output for `done` finished sfens: a dot every REPORT_DOT_EVERY
+    // sfens and the statistics every REPORT_STATS_EVERY sfens.
+    void Gensfen::maybe_report(uint64_t done)
+    {
+        if (done % REPORT_DOT_EVERY != 0)
+            return;
+
+        std::lock_guard lock(stats_mutex);
+
+        // Nothing reported yet: start the clock and open an output region.
+        if (last_stats_report_time == 0)
+        {
+            last_stats_report_time = now();
+            out = sync_region_cout.new_region();
+        }
+
+        if (done == 0)
+            return;
+
+        out << '.';
+        if (done % REPORT_STATS_EVERY == 0)
+            report(done, REPORT_STATS_EVERY);
+    }
+
+    // Command to generate a game record
+    void gensfen(istringstream& is)
+    {
+        // Number of generated game records default = 8 billion phases (Ponanza specification)
+        uint64_t loop_max = 8000000000UL;
+
+        Gensfen::Params params;
+
+        // Add a random number to the end of the file name.
+        bool random_file_name = false;
+        std::string sfen_format = "binpack";
+
+        string token;
+        while (true)
+        {
+            token = "";
+            is >> token;
+            if (token == "")
+                break;
+
+            if (token == "depth")
+                is >> params.search_depth_min;
+            else if (token == "depth2")
+                is >> params.search_depth_max;
+            else if (token == "nodes")
+                is >> params.nodes;
+            else if (token == "loop")
+                is >> loop_max;
+            else if (token == "output_file_name")
+                is >> params.output_file_name;
+            else if (token == "eval_limit")
+                is >> params.eval_limit;
+            else if (token == "random_move_minply")
+                is >> params.random_move_minply;
+            else if (token == "random_move_maxply")
+                is >> params.random_move_maxply;
+            else if (token == "random_move_count")
+                is >> params.random_move_count;
+            else if (token == "random_move_like_apery")
+                is >> params.random_move_like_apery;
+            else if (token == "random_multi_pv")
+                is >> params.random_multi_pv;
+            else if (token == "random_multi_pv_diff")
+                is >> params.random_multi_pv_diff;
+            else if (token == "random_multi_pv_depth")
+                is >> params.random_multi_pv_depth;
+            else if (token == "write_minply")
+                is >> params.write_minply;
+            else if (token == "write_maxply")
+                is >> params.write_maxply;
+            else if (token == "save_every")
+                is >> params.save_every;
+            else if (token == "book")
+                is >> params.book;
+            else if (token == "random_file_name")
+                is >> random_file_name;
+            // Accept also the old option name.
+            else if (token == "use_draw_in_training_data_generation" || token == "write_out_draw_game_in_training_data_generation")
+                is >> params.write_out_draw_game_in_training_data_generation;
+            // Accept also the old option name.
+            else if (token == "use_game_draw_adjudication" || token == "detect_draw_by_consecutive_low_score")
+                is >> params.detect_draw_by_consecutive_low_score;
+            else if (token == "detect_draw_by_insufficient_mating_material")
+                is >> params.detect_draw_by_insufficient_mating_material;
+            else if (token == "sfen_format")
+                is >> sfen_format;
+            else if (token == "seed")
+                is >> params.seed;
+            else if (token == "set_recommended_uci_options")
+            {
+                UCI::setoption("Contempt", "0");
+                UCI::setoption("Skill Level", "20");
+                UCI::setoption("UCI_Chess960", "false");
+                UCI::setoption("UCI_AnalyseMode", "false");
+                UCI::setoption("UCI_LimitStrength", "false");
+                UCI::setoption("PruneAtShallowDepth", "false");
+                UCI::setoption("EnableTranspositionTable", "true");
+            }
+            else if (token == "ensure_quiet")
+            {
+                params.ensure_quiet = true;
+            }
+            else
+                cout << "ERROR: Ignoring unknown option " << token << endl;
+        }
+
+        if (!sfen_format.empty())
+        {
+            if (sfen_format == "bin")
+                params.sfen_format = SfenOutputType::Bin;
+            else if (sfen_format == "binpack")
+                params.sfen_format = SfenOutputType::Binpack;
+            else
+                cout << "WARNING: Unknown sfen format `" << sfen_format << "`. Using bin\n";
+        }
+
+        if (params.ensure_quiet)
+        {
+            // Otherwise we can't ensure quiet positions...
+            UCI::setoption("EnableTranspositionTable", "false");
+        }
+
+        if (random_file_name)
+        {
+            // Give a random number to output_file_name at this point.
+            // Do not use std::random_device() here, because it always returns the same integers on MinGW.
+            PRNG r(params.seed);
+
+            // Just in case, discard the first few outputs of the generator.
+            for (int i = 0; i < 10; ++i)
+                r.rand(1);
+
+            auto to_hex = [](uint64_t u) {
+                std::stringstream ss;
+                ss << std::hex << u;
+                return ss.str();
+            };
+
+            // A single 64-bit number could collide by accident, so append two 64-bit numbers just in case.
+            params.output_file_name += "_" + to_hex(r.rand<uint64_t>()) + to_hex(r.rand<uint64_t>());
+        }
+
+        params.enforce_constraints();
+
+        std::cout << "INFO: Executing gensfen command\n";
+
+        std::cout << "INFO: Parameters:\n";
+        std::cout
+            << "  - search_depth_min       = " << params.search_depth_min << endl
+            << "  - search_depth_max       = " << params.search_depth_max << endl
+            << "  - nodes                  = " << params.nodes << endl
+            << "  - num sfens to generate  = " << loop_max << endl
+            << "  - eval_limit             = " << params.eval_limit << endl
+            << "  - num threads (UCI)      = " << params.num_threads << endl
+            << "  - random_move_minply     = " << params.random_move_minply << endl
+            << "  - random_move_maxply     = " << params.random_move_maxply << endl
+            << "  - random_move_count      = " << params.random_move_count << endl
+            << "  - random_move_like_apery = " << params.random_move_like_apery << endl
+            << "  - random_multi_pv        = " << params.random_multi_pv << endl
+            << "  - random_multi_pv_diff   = " << params.random_multi_pv_diff << endl
+            << "  - random_multi_pv_depth  = " << params.random_multi_pv_depth << endl
+            << "  - write_minply           = " << params.write_minply << endl
+            << "  - write_maxply           = " << params.write_maxply << endl
+            << "  - book                   = " << params.book << endl
+            << "  - output_file_name       = " << params.output_file_name << endl
+            << "  - save_every             = " << params.save_every << endl
+            << "  - random_file_name       = " << random_file_name << endl
+            << "  - write_drawn_games      = " << params.write_out_draw_game_in_training_data_generation << endl
+            << "  - draw by low score      = " << params.detect_draw_by_consecutive_low_score << endl
+            << "  - draw by insuff. mat.   = " << params.detect_draw_by_insufficient_mating_material << endl;
+
+        // Show if the training data generator uses NNUE.
+        Eval::NNUE::verify_eval_file_loaded();
+
+        Threads.main()->ponder = false;
+
+        Gensfen gensfen(params);
+        gensfen.generate(loop_max);
+
+        std::cout << "INFO: Gensfen finished." << endl;
+    }
+}
diff --git a/src/learn/gensfen.h b/src/learn/gensfen.h
new file mode 100644
index 00000000..c0a7c978
--- /dev/null
+++ b/src/learn/gensfen.h
@@ -0,0 +1,14 @@
+#ifndef _GENSFEN_H_
+#define _GENSFEN_H_
+
+#include "position.h"
+
+#include <sstream>
+
+namespace Learner {
+
+    // Automatic generation of teacher position
+    void gensfen(std::istringstream& is);
+}
+
+#endif
\ No newline at end of file
diff --git a/src/learn/gensfen2019.cpp b/src/learn/gensfen2019.cpp
deleted file mode 100644
index 01293b9c..00000000
--- a/src/learn/gensfen2019.cpp
+++ /dev/null
@@ -1 +0,0 @@
-// just a place holder
diff --git a/src/learn/half_float.h b/src/learn/half_float.h
index 30b3e482..5808a786 100644
--- a/src/learn/half_float.h
+++ b/src/learn/half_float.h
@@ -7,126 +7,126 @@
 // Floating point operation by 16bit type
 // Assume that the float type code generated by the compiler is in IEEE 754 format and use it.
 
-#include "../types.h"
+#include "types.h"
 
 namespace HalfFloat
 {
-	// IEEE 754 float 32 format is :
-	//   sign(1bit) + exponent(8bits) + fraction(23bits) = 32bits
-	//
-	// Our float16 format is :
-	//   sign(1bit) + exponent(5bits) + fraction(10bits) = 16bits
-	union float32_converter
-	{
-		int32_t n;
-		float f;
-	};
+    // IEEE 754 float 32 format is :
+    //   sign(1bit) + exponent(8bits) + fraction(23bits) = 32bits
+    //
+    // Our float16 format is :
+    //   sign(1bit) + exponent(5bits) + fraction(10bits) = 16bits
+    union float32_converter
+    {
+        int32_t n;
+        float f;
+    };
 
 
-	// 16-bit float
-	struct float16
-	{
-		// --- constructors
+    // 16-bit float
+    struct float16
+    {
+        // --- constructors
 
-		float16() {}
-		float16(int16_t n) { from_float((float)n);  }
-		float16(int32_t n) { from_float((float)n); }
-		float16(float n) { from_float(n); }
-		float16(double n) { from_float((float)n); }
+        float16() {}
+        float16(int16_t n) { from_float((float)n);  }
+        float16(int32_t n) { from_float((float)n); }
+        float16(float n) { from_float(n); }
+        float16(double n) { from_float((float)n); }
 
-		// build from a float
-		void from_float(float f) { *this = to_float16(f); }
+        // build from a float
+        void from_float(float f) { *this = to_float16(f); }
 
-		// --- implicit converters
+        // --- implicit converters
 
-		operator int32_t() const { return (int32_t)to_float(*this); }
-		operator float() const { return to_float(*this); }
-		operator double() const { return double(to_float(*this)); }
+        operator int32_t() const { return (int32_t)to_float(*this); }
+        operator float() const { return to_float(*this); }
+        operator double() const { return double(to_float(*this)); }
 
-		// --- operators
+        // --- operators
 
-		float16 operator += (float16 rhs) { from_float(to_float(*this) + to_float(rhs)); return *this; }
-		float16 operator -= (float16 rhs) { from_float(to_float(*this) - to_float(rhs)); return *this; }
-		float16 operator *= (float16 rhs) { from_float(to_float(*this) * to_float(rhs)); return *this; }
-		float16 operator /= (float16 rhs) { from_float(to_float(*this) / to_float(rhs)); return *this; }
-		float16 operator + (float16 rhs) const { return float16(*this) += rhs; }
-		float16 operator - (float16 rhs) const { return float16(*this) -= rhs; }
-		float16 operator * (float16 rhs) const { return float16(*this) *= rhs; }
-		float16 operator / (float16 rhs) const { return float16(*this) /= rhs; }
-		float16 operator - () const { return float16(-to_float(*this)); }
-		bool operator == (float16 rhs) const { return this->v_ == rhs.v_; }
-		bool operator != (float16 rhs) const { return !(*this == rhs); }
+        float16 operator += (float16 rhs) { from_float(to_float(*this) + to_float(rhs)); return *this; }
+        float16 operator -= (float16 rhs) { from_float(to_float(*this) - to_float(rhs)); return *this; }
+        float16 operator *= (float16 rhs) { from_float(to_float(*this) * to_float(rhs)); return *this; }
+        float16 operator /= (float16 rhs) { from_float(to_float(*this) / to_float(rhs)); return *this; }
+        float16 operator + (float16 rhs) const { return float16(*this) += rhs; }
+        float16 operator - (float16 rhs) const { return float16(*this) -= rhs; }
+        float16 operator * (float16 rhs) const { return float16(*this) *= rhs; }
+        float16 operator / (float16 rhs) const { return float16(*this) /= rhs; }
+        float16 operator - () const { return float16(-to_float(*this)); }
+        bool operator == (float16 rhs) const { return this->v_ == rhs.v_; }
+        bool operator != (float16 rhs) const { return !(*this == rhs); }
 
-		static void UnitTest() { unit_test(); }
+        static void UnitTest() { unit_test(); }
 
-	private:
+    private:
 
-		// --- entity
+        // --- entity
 
-		uint16_t v_;
+        uint16_t v_;
 
-		// --- conversion between float and float16
+        // --- conversion between float and float16
 
-		static float16 to_float16(float f)
-		{
-			float32_converter c;
-			c.f = f;
-			u32 n = c.n;
+        static float16 to_float16(float f)
+        {
+            float32_converter c;
+            c.f = f;
+            u32 n = c.n;
 
-			// The sign bit is MSB in common.
-			uint16_t sign_bit = (n >> 16) & 0x8000;
+            // The sign bit is MSB in common.
+            uint16_t sign_bit = (n >> 16) & 0x8000;
 
-			// The exponent of IEEE 754's float 32 is biased +127 , so we change this bias into +15 and limited to 5-bit.
-			uint16_t exponent = (((n >> 23) - 127 + 15) & 0x1f) << 10;
+            // The exponent of IEEE 754's float 32 is biased +127 , so we change this bias into +15 and limited to 5-bit.
+            uint16_t exponent = (((n >> 23) - 127 + 15) & 0x1f) << 10;
 
-			// The fraction is limited to 10-bit.
-			uint16_t fraction = (n >> (23-10)) & 0x3ff;
+            // The fraction is limited to 10-bit.
+            uint16_t fraction = (n >> (23-10)) & 0x3ff;
 
-			float16 f_;
-			f_.v_ = sign_bit | exponent | fraction;
+            float16 f_;
+            f_.v_ = sign_bit | exponent | fraction;
 
-			return f_;
-		}
+            return f_;
+        }
 
-		static float to_float(float16 v)
-		{
-			u32 sign_bit = (v.v_ & 0x8000) << 16;
-			u32 exponent = ((((v.v_ >> 10) & 0x1f) - 15 + 127) & 0xff) << 23;
-			u32 fraction = (v.v_ & 0x3ff) << (23 - 10);
+        static float to_float(float16 v)
+        {
+            u32 sign_bit = (v.v_ & 0x8000) << 16;
+            u32 exponent = ((((v.v_ >> 10) & 0x1f) - 15 + 127) & 0xff) << 23;
+            u32 fraction = (v.v_ & 0x3ff) << (23 - 10);
 
-			float32_converter c;
-			c.n = sign_bit | exponent | fraction;
-			return c.f;
-		}
+            float32_converter c;
+            c.n = sign_bit | exponent | fraction;
+            return c.f;
+        }
 
-		// It is not a unit test, but I confirmed that it can be calculated. I'll fix the code later (maybe).
-		static void unit_test()
-		{
-			float16 a, b, c, d;
-			a = 1;
-			std::cout << (float)a << std::endl;
-			b = -118.625;
-			std::cout << (float)b << std::endl;
-			c = 2.5;
-			std::cout << (float)c << std::endl;
-			d = a + c;
-			std::cout << (float)d << std::endl;
+        // It is not a unit test, but I confirmed that it can be calculated. I'll fix the code later (maybe).
+        static void unit_test()
+        {
+            float16 a, b, c, d;
+            a = 1;
+            std::cout << (float)a << std::endl;
+            b = -118.625;
+            std::cout << (float)b << std::endl;
+            c = 2.5;
+            std::cout << (float)c << std::endl;
+            d = a + c;
+            std::cout << (float)d << std::endl;
 
-			c *= 1.5;
-			std::cout << (float)c << std::endl;
+            c *= 1.5;
+            std::cout << (float)c << std::endl;
 
-			b /= 3;
-			std::cout << (float)b << std::endl;
+            b /= 3;
+            std::cout << (float)b << std::endl;
 
-			float f1 = 1.5;
-			a += f1;
-			std::cout << (float)a << std::endl;
+            float f1 = 1.5;
+            a += f1;
+            std::cout << (float)a << std::endl;
 
-			a += f1 * (float)a;
-			std::cout << (float)a << std::endl;
-		}
+            a += f1 * (float)a;
+            std::cout << (float)a << std::endl;
+        }
 
-	};
+    };
 
 }
 
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
new file mode 100644
index 00000000..449542a7
--- /dev/null
+++ b/src/learn/learn.cpp
@@ -0,0 +1,1335 @@
+﻿// Learning routines:
+//
+// 1) Automatic generation of game records in .bin format
+// → "gensfen" command
+//
+// 2) Learning evaluation function parameters from the generated .bin files
+// → "learn" command
+//
+// → Shuffling of the teacher positions is also an extension of this command.
+// Example) "learn shuffle"
+//
+// 3) Automatic generation of an opening book
+// → "makebook think" command
+// → implemented in extra/book/book.cpp
+//
+// 4) Post-game automatic analysis mode
+// → Not implemented in the engine, because this is something the GUI should provide.
+// etc..
+
+#include "learn.h"
+
+#include "autograd.h"
+#include "sfen_reader.h"
+
+#include "misc.h"
+#include "position.h"
+#include "thread.h"
+#include "tt.h"
+#include "uci.h"
+#include "search.h"
+#include "timeman.h"
+
+#include "nnue/evaluate_nnue.h"
+#include "nnue/evaluate_nnue_learner.h"
+
+#include "syzygy/tbprobe.h"
+
+#include <chrono>
+#include <climits>
+#include <cmath>    // std::exp(),std::pow(),std::log()
+#include <cstring>  // memcpy()
+#include <filesystem>
+#include <fstream>
+#include <iomanip>
+#include <limits>
+#include <list>
+#include <memory>
+#include <optional>
+#include <random>
+#include <regex>
+#include <shared_mutex>
+#include <sstream>
+#include <unordered_set>
+#include <iostream>
+
+#if defined (_OPENMP)
+#include <omp.h>
+#endif
+
+using namespace std;
+
+template <typename T>
+T operator +=(std::atomic<T>& x, const T rhs)
+{
+    T old = x.load(std::memory_order_consume);
+
+    // Another thread may modify the value at this point; that is fine,
+    // because the compare-exchange loop below retries with the refreshed value.
+    T desired = old + rhs;
+    while (!x.compare_exchange_weak(old, desired, std::memory_order_release, std::memory_order_consume))
+        desired = old + rhs;
+    return desired;
+}
+template <typename T>
+T operator -= (std::atomic<T>& x, const T rhs) { return x += -rhs; }
+
+namespace Learner
+{
+    static double winning_probability_coefficient = 1.0 / PawnValueEg / 4.0 * std::log(10.0);
+
+    // Score scale factors. ex) If we set src_score_min_value = 0.0,
+    // src_score_max_value = 1.0, dest_score_min_value = 0.0,
+    // dest_score_max_value = 10000.0, [0.0, 1.0] will be scaled to [0, 10000].
+    static double src_score_min_value = 0.0;
+    static double src_score_max_value = 1.0;
+    static double dest_score_min_value = 0.0;
+    static double dest_score_max_value = 1.0;
+
+    // A constant used in elmo (WCSC27). Adjustment required.
+    // Since elmo does not internally divide the expression, the value is different.
+    // You can set this value with the learn command.
+    // 0.33 is equivalent to the constant (0.5) used in elmo (WCSC27)
+    static double elmo_lambda_low = 1.0;
+    static double elmo_lambda_high = 1.0;
+    static double elmo_lambda_limit = 32000;
+
+    // Using stockfish's WDL with win rate model instead of sigmoid
+    static bool use_wdl = false;
+
+    static void append_files_from_dir(
+        std::vector<std::string>& filenames,
+        const std::string& base_dir,
+        const std::string& target_dir)
+    {
+        string kif_base_dir = Path::combine(base_dir, target_dir);
+
+        namespace sys = std::filesystem;
+        sys::path p(kif_base_dir); // Origin of enumeration
+        std::for_each(sys::directory_iterator(p), sys::directory_iterator(),
+            [&](const sys::path& path) {
+                if (sys::is_regular_file(path))
+                    filenames.push_back(Path::combine(target_dir, path.filename().generic_string()));
+            });
+    }
+
+    static void rebase_files(
+        std::vector<std::string>& filenames,
+        const std::string& base_dir)
+    {
+        for (auto& file : filenames)
+        {
+            file = Path::combine(base_dir, file);
+        }
+    }
+
+    static double calculate_lambda(double teacher_signal)
+    {
+        // If the evaluation value in deep search exceeds elmo_lambda_limit
+        // then apply elmo_lambda_high instead of elmo_lambda_low.
+        const double lambda =
+            (std::abs(teacher_signal) >= elmo_lambda_limit)
+            ? elmo_lambda_high
+            : elmo_lambda_low;
+
+        return lambda;
+    }
+
+    // We use our own simple static autograd for automatic
+    // differentiation of the loss function. While it works, it has its caveats.
+    // To work fast enough it requires memoization and reference semantics.
+    // Memoization is mostly opaque to the user and is only per eval basis.
+    // As for reference semantics, we cannot copy every node,
+    // because we need a way to reuse computation.
+    // But we can't really use shared_ptr because of the overhead. That means
+    // that we have to ensure all parts of a loss expression are not destroyed
+    // before use. When lvalue references are used to construct a node, it will
+    // store just a reference; it only performs a copy for rvalue reference arguments.
+    // This means that we need some storage for the whole computation tree
+    // that keeps the values after function returns and never moves them to
+    // a different memory location. This means that we cannot use local
+    // variables and just return by value - because there may be dangling references left.
+    // We also cannot create a struct with this tree on demand because one cannot
+    // use `auto` as a struct member. This is a big issue, and the only way
+    // to solve it as of now is to use static thread_local variables and rely on the
+    // following assumptions:
+    // 1. the expression node must not change for the duration of the program
+    //    within a single instance of a function. This is usually not a problem
+    //    because almost all information is carried by the type. There is an
+    //    exception though, we have ConstantRef and Constant nodes that
+    //    do not encode the constants in the type, so it's possible
+    //    that these nodes are different on the first call to the function
+    //    than later. We MUST ensure that one function is only ever used
+    //    for one specific expression.
+    // 2. thread_local variables are not expensive. Usually after creation
+    //    it only requires a single unsynchronized boolean check and that's
+    //    how most compilers implement it.
+    //
+    // So the general way to do things right now is to use static thread_local
+    // variables for all named autograd nodes. Results being nodes should be
+    // returned by reference, so that there's no need to copy the returned objects.
+    // Parameters being nodes should be taken by lvalue reference if they are
+    // used more than once (to enable reference semantics to reuse computation),
+    // but they can be rvalues and forward on first use if there's only one use
+    // of the node in the scope.
+    // We must keep in mind that the node tree created by such a function
+    // is never going to change as thread_local variables are initialized
+    // on first call. This means that one cannot use one function as a factory
+    // for different autograd expression trees.
+
+    template <typename ShallowT, typename TeacherT, typename ResultT, typename LambdaT>
+    static auto& cross_entropy_(
+        ShallowT& q_,
+        TeacherT& p_,
+        ResultT& t_,
+        LambdaT& lambda_
+    )
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        constexpr double epsilon = 1e-12;
+
+        static thread_local auto teacher_entropy_ = -(p_ * log(p_ + epsilon) + (1.0 - p_) * log(1.0 - p_ + epsilon));
+        static thread_local auto outcome_entropy_ = -(t_ * log(t_ + epsilon) + (1.0 - t_) * log(1.0 - t_ + epsilon));
+        static thread_local auto teacher_loss_ = -(p_ * log(q_) + (1.0 - p_) * log(1.0 - q_));
+        static thread_local auto outcome_loss_ = -(t_ * log(q_) + (1.0 - t_) * log(1.0 - q_));
+        static thread_local auto result_ = lambda_ * teacher_loss_ + (1.0 - lambda_) * outcome_loss_;
+        static thread_local auto entropy_ = lambda_ * teacher_entropy_ + (1.0 - lambda_) * outcome_entropy_;
+        static thread_local auto cross_entropy_ = result_ - entropy_;
+
+        return cross_entropy_;
+    }
+
+    template <typename ValueT>
+    static auto& scale_score_(ValueT&& v_)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        // Normalize to [0.0, 1.0].
+        static thread_local auto normalized_ =
+            (std::forward<ValueT>(v_) - ConstantRef<double>(src_score_min_value))
+            / (ConstantRef<double>(src_score_max_value) - ConstantRef<double>(src_score_min_value));
+
+        // Scale to [dest_score_min_value, dest_score_max_value].
+        static thread_local auto scaled_ =
+            normalized_
+            * (ConstantRef<double>(dest_score_max_value) - ConstantRef<double>(dest_score_min_value))
+            + ConstantRef<double>(dest_score_min_value);
+
+        return scaled_;
+    }
+
+    static Value scale_score(Value v)
+    {
+        // Normalize to [0.0, 1.0].
+        auto normalized =
+            ((double)v - src_score_min_value)
+            / (src_score_max_value - src_score_min_value);
+
+        // Scale to [dest_score_min_value, dest_score_max_value].
+        auto scaled =
+            normalized
+            * (dest_score_max_value - dest_score_min_value)
+            + dest_score_min_value;
+
+        return Value(scaled);
+    }
+
+    template <typename ValueT>
+    static auto& expected_perf_(ValueT&& v_)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        static thread_local auto perf_ = sigmoid(std::forward<ValueT>(v_) * ConstantRef<double>(winning_probability_coefficient));
+
+        return perf_;
+    }
+
+    template <typename ValueT, typename PlyT, typename T = typename ValueT::ValueType>
+    static auto& expected_perf_use_wdl_(
+        ValueT& v_,
+        PlyT&& ply_
+    )
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        // Coefficients of a 3rd order polynomial fit based on fishtest data
+        // for two parameters needed to transform eval to the argument of a
+        // logistic function.
+        static constexpr T as[] = { -8.24404295, 64.23892342, -95.73056462, 153.86478679 };
+        static constexpr T bs[] = { -3.37154371, 28.44489198, -56.67657741,  72.05858751 };
+
+        // The model captures only up to 240 plies, so limit input (and rescale)
+        static thread_local auto m_ = std::forward<PlyT>(ply_) / 64.0;
+
+        static thread_local auto a_ = (((as[0] * m_ + as[1]) * m_ + as[2]) * m_) + as[3];
+        static thread_local auto b_ = (((bs[0] * m_ + bs[1]) * m_ + bs[2]) * m_) + bs[3];
+
+        // Return win rate in per mille
+        static thread_local auto sv_ = (v_ - a_) / b_;
+        static thread_local auto svn_ = (-v_ - a_) / b_;
+
+        static thread_local auto win_pct_ = sigmoid(sv_);
+        static thread_local auto loss_pct_ = sigmoid(svn_);
+
+        static thread_local auto draw_pct_ = 1.0 - win_pct_ - loss_pct_;
+
+        static thread_local auto perf_ = win_pct_ + draw_pct_ * 0.5;
+
+        return perf_;
+    }
+
+    // Expected game score (win_pct + 0.5 * draw_pct) for evaluation `v` at
+    // game ply `ply`, using the same WDL polynomial model as the autograd
+    // variant expected_perf_use_wdl_(). `ply` is clamped to at most 240.
+    static double expected_perf_use_wdl(Value v, int ply)
+    {
+        // Coefficients of a 3rd order polynomial fit based on fishtest data
+        // for two parameters needed to transform eval to the argument of a
+        // logistic function.
+        static constexpr double as[] = { -8.24404295, 64.23892342, -95.73056462, 153.86478679 };
+        static constexpr double bs[] = { -3.37154371, 28.44489198, -56.67657741,  72.05858751 };
+
+        // The model captures only up to 240 plies, so clamp the input (and rescale).
+        auto m = std::min(240, ply) / 64.0;
+
+        auto a = (((as[0] * m + as[1]) * m + as[2]) * m) + as[3];
+        auto b = (((bs[0] * m + bs[1]) * m + bs[2]) * m) + bs[3];
+
+        // Logistic arguments for the win case (v) and the loss case (-v).
+        auto sv = ((double)v - a) / b;
+        auto svn = ((double)-v - a) / b;
+
+        auto win_pct = Math::sigmoid(sv);
+        auto loss_pct = Math::sigmoid(svn);
+
+        auto draw_pct = 1.0 - win_pct - loss_pct;
+
+        auto perf = win_pct + draw_pct * 0.5;
+
+        return perf;
+    }
+
+    [[maybe_unused]] static ValueWithGrad<double> get_loss_noob(
+        Value shallow, Value teacher_signal, int result, int /* ply */)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        static thread_local auto q_ = VariableParameter<double, 0>{};
+        static thread_local auto p_ = ConstantParameter<double, 1>{};
+        static thread_local auto loss_ = pow(q_ - p_, 2.0) * (1.0 / (2400.0 * 2.0 * 600.0));
+
+        auto args = std::tuple(
+            (double)shallow,
+            (double)teacher_signal,
+            (double)result,
+            calculate_lambda(teacher_signal)
+        );
+
+        return loss_.eval(args);
+    }
+
+    static auto& get_loss_cross_entropy_()
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        static thread_local auto& q_ = expected_perf_(VariableParameter<double, 0>{});
+        static thread_local auto& p_ = expected_perf_(scale_score_(ConstantParameter<double, 1>{}));
+        static thread_local auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
+        static thread_local auto lambda_ = ConstantParameter<double, 3>{};
+        static thread_local auto& loss_ = cross_entropy_(q_, p_, t_, lambda_);
+
+        return loss_;
+    }
+
+    static auto get_loss_cross_entropy_args(
+        Value shallow, Value teacher_signal, int result)
+    {
+        return std::tuple(
+            (double)shallow,
+            (double)teacher_signal,
+            (double)result,
+            calculate_lambda(teacher_signal)
+        );
+    }
+
+    static ValueWithGrad<double> get_loss_cross_entropy(
+        Value shallow, Value teacher_signal, int result, int /* ply */)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        static thread_local auto& loss_ = get_loss_cross_entropy_();
+
+        auto args = get_loss_cross_entropy_args(shallow, teacher_signal, result);
+
+        return loss_.eval(args);
+    }
+
+    static ValueWithGrad<double> get_loss_cross_entropy_no_grad(
+        Value shallow, Value teacher_signal, int result, int /* ply */)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        static thread_local auto& loss_ = get_loss_cross_entropy_();
+
+        auto args = get_loss_cross_entropy_args(shallow, teacher_signal, result);
+
+        return { loss_.value(args), 0.0 };
+    }
+
+    static auto& get_loss_cross_entropy_use_wdl_()
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        static thread_local auto ply_ = ConstantParameter<double, 4>{};
+        static thread_local auto shallow_ = VariableParameter<double, 0>{};
+        static thread_local auto& q_ = expected_perf_use_wdl_(shallow_, ply_);
+        // We could do just this but MSVC crashes with an internal compiler error :(
+        // static thread_local auto& scaled_teacher_ = scale_score_(ConstantParameter<double, 1>{});
+        // static thread_local auto& p_ = expected_perf_use_wdl_(scaled_teacher_, ply_);
+        static thread_local auto p_ = ConstantParameter<double, 1>{};
+        static thread_local auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
+        static thread_local auto lambda_ = ConstantParameter<double, 3>{};
+        static thread_local auto& loss_ = cross_entropy_(q_, p_, t_, lambda_);
+
+        return loss_;
+    }
+
+    static auto get_loss_cross_entropy_use_wdl_args(
+        Value shallow, Value teacher_signal, int result, int ply)
+    {
+        return std::tuple(
+            (double)shallow,
+            // This is required because otherwise MSVC crashes :(
+            expected_perf_use_wdl(scale_score(teacher_signal), ply),
+            (double)result,
+            calculate_lambda(teacher_signal),
+            (double)std::min(240, ply)
+        );
+    }
+
+    static ValueWithGrad<double> get_loss_cross_entropy_use_wdl(
+        Value shallow, Value teacher_signal, int result, int ply)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        static thread_local auto& loss_ = get_loss_cross_entropy_use_wdl_();
+
+        auto args = get_loss_cross_entropy_use_wdl_args(shallow, teacher_signal, result, ply);
+
+        return loss_.eval(args);
+    }
+
+    static ValueWithGrad<double> get_loss_cross_entropy_use_wdl_no_grad(
+        Value shallow, Value teacher_signal, int result, int ply)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        static thread_local auto& loss_ = get_loss_cross_entropy_use_wdl_();
+
+        auto args = get_loss_cross_entropy_use_wdl_args(shallow, teacher_signal, result, ply);
+
+        return { loss_.value(args), 0.0 };
+    }
+
+    static auto get_loss(Value shallow, Value teacher_signal, int result, int ply)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        if (use_wdl)
+        {
+            return get_loss_cross_entropy_use_wdl(shallow, teacher_signal, result, ply);
+        }
+        else
+        {
+            return get_loss_cross_entropy(shallow, teacher_signal, result, ply);
+        }
+    }
+
+    static auto get_loss_no_grad(Value shallow, Value teacher_signal, int result, int ply)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        if (use_wdl)
+        {
+            return get_loss_cross_entropy_use_wdl_no_grad(shallow, teacher_signal, result, ply);
+        }
+        else
+        {
+            return get_loss_cross_entropy_no_grad(shallow, teacher_signal, result, ply);
+        }
+    }
+
+    [[maybe_unused]] static auto get_loss(
+        Value teacher_signal,
+        Value shallow,
+        const PackedSfenValue& psv)
+    {
+        return get_loss(shallow, teacher_signal, psv.game_result, psv.gamePly);
+    }
+
+    static auto get_loss_no_grad(
+        Value teacher_signal,
+        Value shallow,
+        const PackedSfenValue& psv)
+    {
+        return get_loss_no_grad(shallow, teacher_signal, psv.game_result, psv.gamePly);
+    }
+
+    // Class to generate sfen with multiple threads
+    struct LearnerThink
+    {
+        struct Params
+        {
+            // Mini-batch size. Be sure to set it on the side that uses this class.
+            uint64_t mini_batch_size = LEARN_MINI_BATCH_SIZE;
+
+            // Option to exclude early stage from learning
+            int reduction_gameply = 1;
+
+            // If the absolute value of the deep-search evaluation of a
+            // teacher position exceeds this value, discard that teacher position.
+            int eval_limit = 32000;
+
+            // Flag controlling whether a new folder is created each time the evaluation function is saved.
+            // If true, do not create the folder.
+            bool save_only_once = false;
+
+            bool shuffle = true;
+
+            bool verbose = false;
+
+            double newbob_decay = 0.5;
+            int newbob_num_trials = 4;
+            uint64_t auto_lr_drop = 0;
+
+            std::string best_nn_directory;
+
+            uint64_t eval_save_interval = LEARN_EVAL_SAVE_INTERVAL;
+            uint64_t loss_output_interval = 1'000'000;
+
+            size_t sfen_read_size = SfenReader::DEFAULT_SFEN_READ_SIZE;
+            size_t thread_buffer_size = SfenReader::DEFAULT_THREAD_BUFFER_SIZE;
+
+            bool use_draw_games_in_training = true;
+            bool use_draw_games_in_validation = true;
+            bool skip_duplicated_positions_in_training = true;
+
+            bool assume_quiet = false;
+            bool smart_fen_skipping = false;
+
+            double learning_rate = 1.0;
+            double max_grad = 1.0;
+
+            string validation_set_file_name;
+            string seed;
+
+            std::vector<std::string> filenames;
+
+            uint64_t num_threads;
+
+            void enforce_constraints()
+            {
+                num_threads = Options["Threads"];
+
+                if (loss_output_interval == 0)
+                {
+                    loss_output_interval = LEARN_RMSE_OUTPUT_INTERVAL * mini_batch_size;
+                }
+
+                // If reduction_gameply is set to 0, rand(0) will be divided by 0, so correct it to 1.
+                reduction_gameply = max(reduction_gameply, 1);
+
+                if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
+                    // Save the current net to [EvalSaveDir]\original.
+                    Eval::NNUE::save_eval("original");
+
+                    // Set the folder above to best_nn_directory so that the trainer can
+                    // resotre the network parameters from the original net file.
+                    best_nn_directory =
+                        Path::combine(Options["EvalSaveDir"], "original");
+                }
+            }
+        };
+
+        // Number of phases used for calculation such as mse
+        // mini-batch size = 1M is standard, so 0.2% of that should be negligible in terms of time.
+        // Since search() is performed with depth = 1 in calculation of
+        // move match rate, simple comparison is not possible...
+        static constexpr uint64_t sfen_for_mse_size = 2000;
+
+        LearnerThink(const Params& prm) :
+            params(prm),
+            prng(prm.seed),
+            sr(
+                prm.filenames,
+                prm.shuffle,
+                SfenReaderMode::Cyclic,
+                prm.num_threads,
+                std::to_string(prng.next_random_seed()),
+                prm.sfen_read_size,
+                prm.thread_buffer_size),
+            learn_loss_sum{}
+        {
+            save_count = 0;
+            loss_output_count = 0;
+            last_lr_drop = 0;
+            best_loss = std::numeric_limits<double>::infinity();
+            latest_loss_sum = 0.0;
+            latest_loss_count = 0;
+            total_done = 0;
+            trials = params.newbob_num_trials;
+            dir_number = 0;
+        }
+
+        void learn(uint64_t epochs);
+
+    private:
+        static void set_learning_search_limits();
+
+        void learn_worker(Thread& th, std::atomic<uint64_t>& counter, uint64_t limit);
+
+        void update_weights(const PSVector& psv, uint64_t epoch);
+
+        void calc_loss(const PSVector& psv, uint64_t epoch);
+
+        void calc_loss_worker(
+            Thread& th,
+            std::atomic<uint64_t>& counter,
+            const PSVector& psv,
+            Loss& test_loss_sum,
+            atomic<double>& sum_norm,
+            atomic<int>& move_accord_count
+        );
+
+        Value get_shallow_value(Position& pos);
+
+        bool check_progress();
+
+        // save merit function parameters to a file
+        bool save(bool is_final = false);
+
+        Params params;
+
+        PRNG prng;
+
+        // sfen reader
+        SfenReader sr;
+
+        uint64_t save_count;
+        uint64_t loss_output_count;
+
+        std::atomic<bool> stop_flag;
+
+        uint64_t total_done;
+
+        uint64_t last_lr_drop;
+        double best_loss;
+        double latest_loss_sum;
+        uint64_t latest_loss_count;
+
+        int trials;
+        int dir_number;
+
+        // For calculation of learning data loss
+        Loss learn_loss_sum;
+    };
+
+    // Prepares the global search settings for the searches run by the learner.
+    void LearnerThink::set_learning_search_limits()
+    {
+        // Search::Limits is a global shared by every thread, so these
+        // settings affect all searches started from here on.
+        Search::Limits.startTime = now();
+
+        // Behave like "go infinite" so that no time management gets in the way.
+        Search::Limits.infinite = true;
+
+        // The PV output would only be noise here, so suppress it.
+        Search::Limits.silent = true;
+
+        // A node limit would be compared against the nodes accumulated by
+        // each thread, so leave it disabled.
+        Search::Limits.nodes = 0;
+
+        // The depth is passed as an argument of Learner::search() instead.
+        Search::Limits.depth = 0;
+
+        Threads.main()->ponder = false;
+    }
+
+    // Runs up to `epochs` training epochs, then finalizes the net and writes the final save.
+    void LearnerThink::learn(uint64_t epochs)
+    {
+#if defined(_OPENMP)
+        omp_set_num_threads((int)Options["Threads"]);
+#endif
+
+        set_learning_search_limits();
+
+        Eval::NNUE::verify_any_net_loaded();
+
+        // Without a validation file, a fixed number of positions is taken from the reader.
+        const bool no_validation_file = params.validation_set_file_name.empty();
+        const PSVector validation_set =
+            no_validation_file
+            ? sr.read_for_mse(sfen_for_mse_size)
+            : sr.read_validation_set(
+                params.validation_set_file_name,
+                params.eval_limit,
+                params.use_draw_games_in_validation);
+
+        if (no_validation_file && validation_set.size() != sfen_for_mse_size)
+        {
+            auto out = sync_region_cout.new_region();
+            out
+                << "INFO (learn): Error reading sfen_for_mse. Read " << validation_set.size()
+                << " out of " << sfen_for_mse_size << '\n';
+
+            return;
+        }
+
+        // With learning rate scheduling, the loss before any training is the first baseline.
+        if (params.newbob_decay != 1.0)
+        {
+            calc_loss(validation_set, 0);
+
+            best_loss = latest_loss_sum / latest_loss_count;
+            latest_loss_sum = 0.0;
+            latest_loss_count = 0;
+
+            auto out = sync_region_cout.new_region();
+            out << "INFO (learn): initial loss = " << best_loss << endl;
+        }
+
+        stop_flag = false;
+
+        for (uint64_t epoch = 1; epoch <= epochs && !stop_flag; ++epoch)
+        {
+            std::atomic<uint64_t> positions_taken{0};
+
+            Threads.execute_with_workers([this, &positions_taken](auto& th){
+                learn_worker(th, positions_taken, params.mini_batch_size);
+            });
+
+            total_done += params.mini_batch_size;
+
+            Threads.wait_for_workers_finished();
+
+            // A worker raises stop_flag when the reader runs out of data;
+            // update_weights() raises it once save() reports convergence.
+            if (!stop_flag)
+                update_weights(validation_set, epoch);
+        }
+
+        Eval::NNUE::finalize_net();
+
+        save(true);
+    }
+
+    // Worker body of one training epoch: repeatedly claims a slot from the shared `counter`
+    // (until `limit` is reached), reads a position and hands it to the trainer via add_example().
+    void LearnerThink::learn_worker(Thread& th, std::atomic<uint64_t>& counter, uint64_t limit)
+    {
+        const auto thread_id = th.thread_idx();
+        auto& pos = th.rootPos;
+
+        std::vector<StateInfo, AlignedAllocator<StateInfo>> state(MAX_PLY);
+
+        while(!stop_flag)
+        {
+            const auto iter = counter.fetch_add(1);
+            if (iter >= limit)
+                break;
+
+            PackedSfenValue ps;
+
+        RETRY_READ:;
+
+            if (!sr.read_to_thread_buffer(thread_id, ps))
+            {
+                // Out of data: nothing is left to do, so stop every worker.
+                stop_flag = true;
+                break;
+            }
+
+            if (params.eval_limit < abs(ps.score))
+                goto RETRY_READ;
+
+            if (!params.use_draw_games_in_training && ps.game_result == 0)
+                goto RETRY_READ;
+
+            // Skip over the opening phase (randomized through reduction_gameply).
+            // NOTE(review): prng is shared by all workers -- confirm PRNG::rand() is thread-safe.
+            if (ps.gamePly < prng.rand(params.reduction_gameply))
+                goto RETRY_READ;
+
+            StateInfo si;
+            if (pos.set_from_packed_sfen(ps.sfen, &si, &th) != 0)
+            {
+                // Malformed sfen
+                auto out = sync_region_cout.new_region();
+                out << "ERROR: illigal packed sfen = " << pos.fen() << endl;
+                goto RETRY_READ;
+            }
+
+            const auto rootColor = pos.side_to_move();
+
+            // Adds the current `pos` together with `ps` to the training set.
+            auto pos_add_grad = [&]() {
+                // Static evaluation of the current position (not a search result)
+                const Value shallow_value = Eval::evaluate(pos);
+
+                Eval::NNUE::add_example(pos, rootColor, shallow_value, ps, 1.0);
+            };
+
+            if (!pos.pseudo_legal((Move)ps.move) || !pos.legal((Move)ps.move))
+            {
+                goto RETRY_READ;
+            }
+
+            // We don't need to qsearch when doing smart skipping
+            if (!params.assume_quiet && !params.smart_fen_skipping)
+            {
+                int ply = 0;
+                pos.do_move((Move)ps.move, state[ply++]);
+
+                // Only the PV of the qsearch is used: play it out to reach its leaf position.
+                const auto [_, pv] = Search::qsearch(pos);
+
+                for (auto m : pv)
+                {
+                    pos.do_move(m, state[ply++]);
+                }
+            }
+
+            if (params.smart_fen_skipping
+                && (pos.capture_or_promotion((Move)ps.move)
+                    || pos.checkers()))
+            {
+                goto RETRY_READ;
+            }
+
+            // We want the position being trained on not to be terminal
+            if (MoveList<LEGAL>(pos).size() == 0)
+                goto RETRY_READ;
+
+            // The position to train on has been reached, so add it as a training example.
+            pos_add_grad();
+        }
+    }
+
+    // Applies one parameter update, then handles the periodic net save and loss report.
+    void LearnerThink::update_weights(const PSVector& psv, uint64_t epoch)
+    {
+        // The fences bracket the parameter update. They may not be strictly correct,
+        // but the read and write phases are isolated, so no real issue is expected.
+        atomic_thread_fence(memory_order_seq_cst);
+        learn_loss_sum += Eval::NNUE::update_parameters(
+            Threads, epoch, params.verbose, params.learning_rate, params.max_grad, get_loss);
+        atomic_thread_fence(memory_order_seq_cst);
+
+        ++save_count;
+        if (save_count * params.mini_batch_size >= params.eval_save_interval)
+        {
+            save_count = 0;
+
+            // save() reports convergence; in that case stop without a loss report.
+            if (save())
+            {
+                stop_flag = true;
+                return;
+            }
+        }
+
+        ++loss_output_count;
+        if (loss_output_count * params.mini_batch_size < params.loss_output_interval)
+            return;
+
+        loss_output_count = 0;
+
+        calc_loss(psv, epoch);
+        Eval::NNUE::check_health();
+    }
+
+    // Evaluates the current net on the validation positions `psv`, prints a progress report
+    // and adds the measured loss to latest_loss_sum / latest_loss_count. Resets learn_loss_sum.
+    void LearnerThink::calc_loss(const PSVector& psv, uint64_t epoch)
+    {
+        TT.new_search();
+        TimePoint elapsed = now() - Search::Limits.startTime + 1; // +1 keeps the divisor below non-zero
+
+        auto out = sync_region_cout.new_region();
+        out << "\n";
+        out << "PROGRESS (calc_loss): " << now_string()
+             << ", " << total_done << " sfens"
+             << ", " << total_done * 1000 / elapsed  << " sfens/second"
+             << ", epoch " << epoch
+             << endl;
+
+        out << "  - learning rate = " << params.learning_rate << endl;
+
+        // For calculation of verification data loss
+        Loss test_loss_sum{};
+
+        // Sum of the absolute shallow values over the validation positions
+        atomic<double> sum_norm{0.0};
+
+        // Number of positions where the teacher's move equals the first PV move of search(1).
+        atomic<int> move_accord_count{0};
+        // Report the evaluation of the start position, computed via the main thread's worker.
+        auto mainThread = Threads.main();
+        mainThread->execute_with_worker([&out](auto& th){
+            auto& pos = th.rootPos;
+            StateInfo si;
+            pos.set(StartFEN, false, &si, &th);
+            out << "  - startpos eval = " << Eval::evaluate(pos) << endl;
+        });
+        mainThread->wait_for_worker_finished();
+
+        // Shared index of the next validation position to be processed.
+        atomic<uint64_t> counter{0};
+        Threads.execute_with_workers([&](auto& th){
+            calc_loss_worker(
+                th,
+                counter,
+                psv,
+                test_loss_sum,
+                sum_norm,
+                move_accord_count
+            );
+        });
+        Threads.wait_for_workers_finished();
+        // Accumulate for check_progress(), which averages and resets these two values.
+        latest_loss_sum += test_loss_sum.value();
+        latest_loss_count += psv.size();
+
+        if (psv.size() && test_loss_sum.count() > 0)
+        {
+            test_loss_sum.print_only_loss("val", out);
+
+            if (learn_loss_sum.count() > 0)
+            {
+                learn_loss_sum.print_with_grad("train", out);
+            }
+
+            out << "  - norm = " << sum_norm << endl;
+            out << "  - move accuracy = " << (move_accord_count * 100.0 / psv.size()) << "%" << endl;
+        }
+        else
+        {
+            out << "ERROR: psv.size() = " << psv.size() << " ,  done = " << test_loss_sum.count() << endl;
+        }
+        // Restart the accumulation of the training loss for the next report.
+        learn_loss_sum.reset();
+    }
+
+    // Worker for calc_loss(): processes positions of `psv` until the shared `counter` runs
+    // past the end, then adds the locally accumulated loss to `test_loss_sum`.
+    void LearnerThink::calc_loss_worker(
+        Thread& th,
+        std::atomic<uint64_t>& counter,
+        const PSVector& psv,
+        Loss& test_loss_sum,
+        atomic<double>& sum_norm,
+        atomic<int>& move_accord_count
+    )
+    {
+        auto& pos = th.rootPos;
+        Loss thread_loss{};
+
+        // Every fetch_add hands out the index of one position, so that
+        // each position is processed by exactly one worker.
+        for (auto idx = counter.fetch_add(1); idx < psv.size(); idx = counter.fetch_add(1))
+        {
+            const auto& ps = psv[idx];
+
+            StateInfo si;
+            if (pos.set_from_packed_sfen(ps.sfen, &si, &th) != 0)
+            {
+                cout << "Error! : illegal packed sfen " << pos.fen() << endl;
+                continue;
+            }
+
+            // Evaluation at the leaf of the qsearch PV
+            const Value shallow_value = get_shallow_value(pos);
+
+            // Evaluation value of the deep search, as stored with the position
+            const auto deep_value = (Value)ps.score;
+
+            // This overload takes the teacher signal first and the shallow value second.
+            const auto loss = get_loss_no_grad(deep_value, shallow_value, ps);
+
+            thread_loss += loss;
+            sum_norm += (double)abs(shallow_value);
+
+            // Count the position when the first PV move of a depth 1 search
+            // is the same move as the one stored by the teacher.
+            const auto [value, pv] = Search::search(pos, 1);
+            if (pv.size() > 0 && (uint16_t)pv[0] == ps.move)
+                move_accord_count.fetch_add(1, std::memory_order_relaxed);
+        }
+
+        // Publish this worker's share only once, after all of its positions are done,
+        // instead of touching the shared accumulator for every position.
+        test_loss_sum += thread_loss;
+    }
+
+    // Returns the static evaluation at the leaf of the qsearch PV of `pos`, from the point
+    // of view of the side to move in `pos`. All PV moves are undone before returning.
+    Value LearnerThink::get_shallow_value(Position& pos)
+    {
+        // Only the PV is used; the score returned by qsearch() itself is ignored.
+        const auto [_, pv] = Search::qsearch(pos);
+
+        const auto root_side = pos.side_to_move();
+
+        // Walk down the PV. Every move gets its own StateInfo, kept alive until it is undone.
+        std::vector<StateInfo, AlignedAllocator<StateInfo>> states(pv.size());
+        auto next_state = states.begin();
+        for (const auto move : pv)
+            pos.do_move(move, *next_state++);
+
+        // Flip the sign when the side to move at the leaf is not the side to move at the root.
+        Value leaf_value = Eval::evaluate(pos);
+        if (root_side != pos.side_to_move())
+            leaf_value = -leaf_value;
+
+        // Take the moves back in reverse order.
+        for (auto it = pv.rbegin(); it != pv.rend(); ++it)
+            pos.undo_move(*it);
+
+        return leaf_value;
+    }
+
+    // Compares the validation loss gathered since the last call with the best loss so far,
+    // adjusts the learning rate accordingly and returns true once training has converged.
+    bool LearnerThink::check_progress()
+    {
+        auto out = sync_region_cout.new_region();
+
+        // Average the loss accumulated by calc_loss() and restart the accumulation.
+        const double latest_loss = latest_loss_sum / latest_loss_count;
+        latest_loss_sum = 0.0;
+        latest_loss_count = 0;
+
+        // Scales the learning rate by newbob_decay and remembers when that happened.
+        const auto drop_lr = [&]() {
+            last_lr_drop = total_done;
+
+            const double reduced_lr = params.learning_rate * params.newbob_decay;
+            out
+                << "  - reducing learning rate from " << params.learning_rate
+                << " to " << reduced_lr
+                << " (" << trials << " more trials)" << endl;
+
+            params.learning_rate = reduced_lr;
+        };
+
+        // Makes the latest loss the new reference and restores the full number of trials.
+        const auto accept = [&]() {
+            out << "  - loss = " << latest_loss << " < best (" << best_loss << "), accepted" << endl;
+
+            best_loss = latest_loss;
+            trials = params.newbob_num_trials;
+        };
+
+        out << "INFO (learning_rate):" << endl;
+
+        bool converged = false;
+
+        if (params.auto_lr_drop)
+        {
+            // Fixed schedule: every result is accepted and the learning rate is
+            // dropped each time another auto_lr_drop sfens have been processed.
+            accept();
+
+            if (total_done >= last_lr_drop + params.auto_lr_drop)
+                drop_lr();
+        }
+        else if (latest_loss < best_loss)
+        {
+            accept();
+        }
+        else
+        {
+            // No improvement: use up one trial, then either continue with a lower
+            // learning rate or, when no trials are left, declare convergence.
+            out << "  - loss = " << latest_loss << " >= best (" << best_loss << "), rejected" << endl;
+
+            --trials;
+            if (trials > 0)
+                drop_lr();
+            else
+                converged = true;
+        }
+
+        if (converged)
+        {
+            out << "  - converged" << endl;
+        }
+
+        return converged;
+    }
+
+    // Writes the evaluation function file. Returns true when training should stop:
+    // for the final save (unless save_only_once is set), or when check_progress()
+    // reports convergence after an intermediate save.
+    bool LearnerThink::save(bool is_final)
+    {
+        if (params.save_only_once)
+        {
+            // Only one copy is wanted, so do not create a subfolder for it.
+            Eval::NNUE::save_eval("");
+            return false;
+        }
+
+        if (is_final)
+        {
+            Eval::NNUE::save_eval("final");
+            return true;
+        }
+
+        // Intermediate saves go into consecutively numbered folders ("0", "1", "2", ...)
+        // so that the nets can be compared against each other later.
+        // TODO: consider naming the output directory by epoch.
+        const std::string dir_name = std::to_string(dir_number++);
+        Eval::NNUE::save_eval(dir_name);
+
+        // Progress is only checked with learning rate scheduling enabled
+        // and at least one loss measurement since the last check.
+        if (params.newbob_decay == 1.0 || latest_loss_count == 0)
+            return false;
+
+        const bool converged = check_progress();
+        params.best_nn_directory = Path::combine((std::string)Options["EvalSaveDir"], dir_name);
+
+        return converged;
+    }
+
+    // Entry point of the "learn" command: parses the options from `is`, prints the
+    // resulting configuration, initializes the trainer and runs the learning loop.
+    void learn(istringstream& is)
+    {
+        LearnerThink::Params params;
+
+        // Number of epochs; effectively unlimited unless the "epochs" option is given.
+        uint64_t epochs = std::numeric_limits<uint64_t>::max();
+
+        // Game file storage folder (get game file with relative path from here)
+        string base_dir;
+        string target_dir;
+
+        uint64_t nn_batch_size = 1000;
+        string nn_options;
+
+        auto out = sync_region_cout.new_region();
+
+        // Read the option tokens one at a time until the stream is exhausted.
+        while (true)
+        {
+            string option;
+            is >> option;
+
+            if (option == "")
+                break;
+
+            // Mini-batch size, given in units of ten thousand positions
+            if (option == "bat")
+            {
+                is >> params.mini_batch_size;
+                params.mini_batch_size *= 10000; // Unit is ten thousand
+            }
+
+            // Specify the folder in which the game records are stored; all of its files are used.
+            else if (option == "targetdir") is >> target_dir;
+            else if (option == "targetfile")
+            {
+                std::string filename;
+                is >> filename;
+                params.filenames.push_back(filename);
+            }
+
+            // Specify the number of loops
+            else if (option == "epochs") is >> epochs;
+
+            // Game file storage folder (get game file with relative path from here)
+            else if (option == "basedir") is >> base_dir;
+
+            // Mini batch size
+            else if (option == "batchsize") is >> params.mini_batch_size;
+
+            // learning rate
+            else if (option == "lr") is >> params.learning_rate;
+            else if (option == "max_grad") is >> params.max_grad;
+
+            // Accept also the old option name.
+            else if (option == "use_draw_in_training"
+                  || option == "use_draw_games_in_training")
+                is >> params.use_draw_games_in_training;
+
+            // Accept also the old option name.
+            else if (option == "use_draw_in_validation"
+                  || option == "use_draw_games_in_validation")
+                is >> params.use_draw_games_in_validation;
+
+            // Accept also the old option name.
+            else if (option == "use_hash_in_training"
+                  || option == "skip_duplicated_positions_in_training")
+                is >> params.skip_duplicated_positions_in_training;
+
+            else if (option == "winning_probability_coefficient")
+                is >> winning_probability_coefficient;
+
+            // Using WDL with win rate model instead of sigmoid
+            else if (option == "use_wdl") is >> use_wdl;
+
+            // LAMBDA
+            else if (option == "lambda") is >> elmo_lambda_low;
+            else if (option == "lambda2") is >> elmo_lambda_high;
+            else if (option == "lambda_limit") is >> elmo_lambda_limit;
+
+            else if (option == "reduction_gameply") is >> params.reduction_gameply;
+
+            else if (option == "eval_limit") is >> params.eval_limit;
+            else if (option == "save_only_once") params.save_only_once = true;
+            else if (option == "no_shuffle") params.shuffle = false;
+
+            else if (option == "nn_batch_size") is >> nn_batch_size;
+            else if (option == "newbob_decay") is >> params.newbob_decay;
+            else if (option == "newbob_num_trials") is >> params.newbob_num_trials;
+            else if (option == "nn_options") is >> nn_options;
+            else if (option == "auto_lr_drop") is >> params.auto_lr_drop;
+
+            else if (option == "eval_save_interval") is >> params.eval_save_interval;
+            else if (option == "loss_output_interval") is >> params.loss_output_interval;
+            else if (option == "validation_set_file_name") is >> params.validation_set_file_name;
+
+            else if (option == "src_score_min_value") is >> src_score_min_value;
+            else if (option == "src_score_max_value") is >> src_score_max_value;
+            else if (option == "dest_score_min_value") is >> dest_score_min_value;
+            else if (option == "dest_score_max_value") is >> dest_score_max_value;
+
+            else if (option == "sfen_read_size") is >> params.sfen_read_size;
+            else if (option == "thread_buffer_size") is >> params.thread_buffer_size;
+
+            else if (option == "seed") is >> params.seed;
+            else if (option == "set_recommended_uci_options")
+            {
+                UCI::setoption("Use NNUE", "pure");
+                UCI::setoption("MultiPV", "1");
+                UCI::setoption("Contempt", "0");
+                UCI::setoption("Skill Level", "20");
+                UCI::setoption("UCI_Chess960", "false");
+                UCI::setoption("UCI_AnalyseMode", "false");
+                UCI::setoption("UCI_LimitStrength", "false");
+                UCI::setoption("PruneAtShallowDepth", "false");
+                UCI::setoption("EnableTranspositionTable", "false");
+            }
+            else if (option == "verbose") params.verbose = true;
+            else if (option == "assume_quiet") params.assume_quiet = true;
+            else if (option == "smart_fen_skipping") params.smart_fen_skipping = true;
+            else
+            {
+                out << "INFO: Unknown option: " << option << ". Ignoring.\n";
+            }
+        }
+
+        out << "INFO: Executing learn command\n";
+
+        // Issue a warning if OpenMP is disabled.
+#if !defined(_OPENMP)
+        out << "WARNING: OpenMP disabled." << endl;
+#endif
+
+        params.enforce_constraints();
+
+        // Right now we only have the individual files.
+        // We need to apply base_dir here
+        if (!target_dir.empty())
+        {
+            append_files_from_dir(params.filenames, base_dir, target_dir);
+        }
+        rebase_files(params.filenames, base_dir);
+
+        out << "INFO: Input files:\n";
+        for (const auto& s : params.filenames)
+            out << "  - " << s << '\n';
+
+        out << "INFO: Parameters:\n";
+        if (!params.validation_set_file_name.empty())
+        {
+            out << "  - validation set           : " << params.validation_set_file_name << endl;
+        }
+
+        // epochs defaults to the uint64_t maximum, so saturate the product instead of
+        // printing a value that has wrapped around.
+        const uint64_t max_sfens = std::numeric_limits<uint64_t>::max();
+        const uint64_t total_sfens =
+            (params.mini_batch_size != 0 && epochs > max_sfens / params.mini_batch_size)
+                ? max_sfens : epochs * params.mini_batch_size;
+
+        out << "  - epochs                   : " << epochs << endl;
+        out << "  - epochs * minibatch size  : " << total_sfens << endl;
+        out << "  - eval_limit               : " << params.eval_limit << endl;
+        out << "  - save_only_once           : " << (params.save_only_once ? "true" : "false") << endl;
+        out << "  - shuffle on read          : " << (params.shuffle ? "true" : "false") << endl;
+        out << "  - Loss Function            : " << LOSS_FUNCTION << endl;
+        out << "  - minibatch size           : " << params.mini_batch_size << endl;
+        out << "  - nn_batch_size            : " << nn_batch_size << endl;
+        out << "  - nn_options               : " << nn_options << endl;
+        out << "  - learning rate            : " << params.learning_rate << endl;
+        out << "  - max_grad                 : " << params.max_grad << endl;
+        out << "  - use draws in training    : " << params.use_draw_games_in_training << endl;
+        out << "  - use draws in validation  : " << params.use_draw_games_in_validation << endl;
+        out << "  - skip repeated positions  : " << params.skip_duplicated_positions_in_training << endl;
+        out << "  - winning prob coeff       : " << winning_probability_coefficient << endl;
+        out << "  - use_wdl                  : " << use_wdl << endl;
+        out << "  - src_score_min_value      : " << src_score_min_value << endl;
+        out << "  - src_score_max_value      : " << src_score_max_value << endl;
+        out << "  - dest_score_min_value     : " << dest_score_min_value << endl;
+        out << "  - dest_score_max_value     : " << dest_score_max_value << endl;
+
+        out << "  - reduction_gameply        : " << params.reduction_gameply << endl;
+        out << "  - elmo_lambda_low          : " << elmo_lambda_low << endl;
+        out << "  - elmo_lambda_high         : " << elmo_lambda_high << endl;
+        out << "  - elmo_lambda_limit        : " << elmo_lambda_limit << endl;
+        out << "  - eval_save_interval       : " << params.eval_save_interval << " sfens" << endl;
+        out << "  - loss_output_interval     : " << params.loss_output_interval << " sfens" << endl;
+
+        out << "  - sfen_read_size           : " << params.sfen_read_size << endl;
+        out << "  - thread_buffer_size       : " << params.thread_buffer_size << endl;
+        out << "  - seed                     : " << params.seed << endl;
+        out << "  - verbose                  : " << (params.verbose ? "true" : "false") << endl;
+
+        if (params.auto_lr_drop) {
+            out << "  - learning rate scheduling : every " << params.auto_lr_drop << " sfens" << endl;
+        }
+        else if (params.newbob_decay != 1.0) {
+            out << "  - learning rate scheduling : newbob with decay" << endl;
+            out << "  - newbob_decay             : " << params.newbob_decay << endl;
+            out << "  - newbob_num_trials        : " << params.newbob_num_trials << endl;
+        }
+        else {
+            out << "  - learning rate scheduling : fixed learning rate" << endl;
+        }
+
+        out << endl;
+
+        out << "INFO: Started initialization." << endl;
+
+        Eval::NNUE::initialize_training(params.seed, out);
+        Eval::NNUE::set_batch_size(nn_batch_size);
+        Eval::NNUE::set_options(nn_options);
+
+        LearnerThink learn_think(params);
+
+        out << "Finished initialization." << endl;
+
+        out.unlock();
+
+        // Start learning.
+        learn_think.learn(epochs);
+    }
+
+} // namespace Learner
diff --git a/src/learn/learn.h b/src/learn/learn.h
index eda2bb32..842ffad0 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -1,101 +1,6 @@
 ﻿#ifndef _LEARN_H_
 #define _LEARN_H_
 
-#if defined(EVAL_LEARN)
-
-#include <vector>
-
-// =====================
-// Settings for learning
-// =====================
-
-// If you select one of the following, the details after that will be automatically selected.
-// If you don't select any of them, you need to set the subsequent details one by one.
-
-// Learning setting by elmo method. This is the default setting.
-// To make a standard squeeze diaphragm, specify "lambda 1" with the learn command.
-#define LEARN_ELMO_METHOD
-
-
-// ----------------------
-// update formula
-// ----------------------
-
-// Ada Grad. Recommended because it is stable.
-// #define ADA_GRAD_UPDATE
-
-// SGD looking only at the sign of the gradient. It requires less memory, but the accuracy is...
-// #define SGD_UPDATE
-
-// ----------------------
-// Settings for learning
-// ----------------------
-
-// mini-batch size.
-// Calculate the gradient by combining this number of phases.
-// If you make it smaller, the number of update_weights() will increase and the convergence will be faster. The gradient is incorrect.
-// If you increase it, the number of update_weights() decreases, so the convergence will be slow. The slope will come out accurately.
-// I don't think you need to change this value in most cases.
-
-#define LEARN_MINI_BATCH_SIZE (1000 * 1000 * 1)
-
-// The number of phases to read from the file at one time. After reading this much, shuffle.
-// It is better to have a certain size, but this number x 40 bytes x 3 times as much memory is consumed. 400MB*3 is consumed in the 10M phase.
-// Must be a multiple of THREAD_BUFFER_SIZE(=10000).
-
-#define LEARN_SFEN_READ_SIZE (1000 * 1000 * 10)
-
-// Saving interval of evaluation function at learning. Save each time you learn this number of phases.
-// Needless to say, the longer the saving interval, the shorter the learning time.
-// Folder name is incremented for each save like 0/, 1/, 2/...
-// By default, once every 1 billion phases.
-#define LEARN_EVAL_SAVE_INTERVAL (1000000000ULL)
-
-
-// ----------------------
-// Select the objective function
-// ----------------------
-
-// The objective function is the sum of squares of the difference in winning percentage
-// See learner.cpp for more information.
-
-//#define LOSS_FUNCTION_IS_WINNING_PERCENTAGE
-
-// Objective function is cross entropy
-// See learner.cpp for more information.
-// So-called ordinary "rag cloth squeezer"
-//#define LOSS_FUNCTION_IS_CROSS_ENTOROPY
-
-// A version in which the objective function is cross entropy, but the win rate function is not passed
-// #define LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE
-
-// elmo (WCSC27) method
-// #define LOSS_FUNCTION_IS_ELMO_METHOD
-
-// ※ Other things may be added.
-
-
-// ----------------------
-// debug settings for learning
-// ----------------------
-
-// Reduce the output of rmse during learning to 1 for this number of times.
-// rmse calculation is done in one thread, so it takes some time, so reducing the output is effective.
-#define LEARN_RMSE_OUTPUT_INTERVAL 1
-
-
-// ----------------------
-// learning from zero vector
-// ----------------------
-
-// Start learning the evaluation function parameters from the zero vector.
-// Initialize to zero, generate a game, learn from zero vector,
-// Game generation → If you repeat learning, you will get parameters that do not depend on the professional game. (maybe)
-// (very time consuming)
-
-//#define RESET_TO_ZERO_VECTOR
-
-
 // ----------------------
 // Floating point for learning
 // ----------------------
@@ -105,7 +10,7 @@
 // Even if it is a double type, there is almost no difference in the way of convergence, so fix it to float.
 
 // when using float
-typedef float LearnFloatType;
+using LearnFloatType = float;
 
 // when using double
 //typedef double LearnFloatType;
@@ -114,59 +19,6 @@ typedef float LearnFloatType;
 //#include "half_float.h"
 //typedef HalfFloat::float16 LearnFloatType;
 
-// ----------------------
-// save memory
-// ----------------------
-
-// Use a triangular array for the Weight array (of which is KPP) to save memory.
-// If this is used, the weight array for learning will be about 3 times as large as the evaluation function file.
-
-#define USE_TRIANGLE_WEIGHT_ARRAY
-
-// ----------------------
-// dimension down
-// ----------------------
-
-// Dimension reduction for mirrors (left/right symmetry) and inverse (forward/backward symmetry).
-// All on by default.
-
-// Dimension reduction using mirror and inverse for KK. (Unclear effect)
-// USE_KK_MIRROR_WRITE must be on when USE_KK_INVERSE_WRITE is on.
-#define USE_KK_MIRROR_WRITE
-#define USE_KK_INVERSE_WRITE
-
-// Dimension reduction using Mirror and Inverse for KKP. (Inverse is not so effective)
-// When USE_KKP_INVERSE_WRITE is turned on, USE_KKP_MIRROR_WRITE must also be turned on.
-#define USE_KKP_MIRROR_WRITE
-#define USE_KKP_INVERSE_WRITE
-
-// Perform dimension reduction using a mirror for KPP. (Turning this off requires double the teacher position)
-// KPP has no inverse. (Because there is only K on the front side)
-#define USE_KPP_MIRROR_WRITE
-
-// Perform a dimension reduction using a mirror for KPPP. (Turning this off requires double the teacher position)
-// KPPP has no inverse. (Because there is only K on the front side)
-#define USE_KPPP_MIRROR_WRITE
-
-// Reduce the dimension by KPP for learning the KKPP component.
-// Learning is very slow.
-// Do not use as it is not debugged.
-//#define USE_KKPP_LOWER_DIM
-
-
-// ======================
-// Settings for creating teacher phases
-// ======================
-
-// ----------------------
-// write out the draw
-// ----------------------
-
-// When you reach a draw, write it out as a teacher position
-// It's subtle whether it's better to do this.
-// #define LEARN_GENSFEN_USE_DRAW_RESULT
-
-
 // ======================
 // configure
 // ======================
@@ -175,63 +27,122 @@ typedef float LearnFloatType;
 // Learning with the method of elmo (WCSC27)
 // ----------------------
 
-#if defined( LEARN_ELMO_METHOD )
-#define LOSS_FUNCTION_IS_ELMO_METHOD
-#define ADA_GRAD_UPDATE
-#endif
-
+#define LOSS_FUNCTION "ELMO_METHOD(WCSC27)"
 
 // ----------------------
 // Definition of struct used in Learner
 // ----------------------
-#include "../position.h"
+
+#include "autograd.h"
+#include "packed_sfen.h"
+
+#include "position.h"
+
+#include <sstream>
+#include <vector>
+#include <mutex>
+#include <string>
 
 namespace Learner
 {
-	//Structure in which PackedSfen and evaluation value are integrated
-	// If you write different contents for each option, it will be a problem when reusing the teacher game
-	// For the time being, write all the following members regardless of the options.
-	struct PackedSfenValue
-	{
-		// phase
-		PackedSfen sfen;
+    // ----------------------
+    // Settings for learning
+    // ----------------------
 
-		// Evaluation value returned from Learner::search()
-		int16_t score;
+    // mini-batch size.
+    // Calculate the gradient by combining this number of phases.
+    // If you make it smaller, update_weights() is called more often and convergence is faster, but each gradient estimate is less accurate.
+    // If you make it larger, update_weights() is called less often and convergence is slower, but each gradient estimate is more accurate.
+    // I don't think you need to change this value in most cases.
 
-		// PV first move
-		// Used when finding the match rate with the teacher
-		uint16_t move;
+    constexpr std::size_t LEARN_MINI_BATCH_SIZE = 1000 * 1000 * 1;
 
-		// Trouble of the phase from the initial phase.
-		uint16_t gamePly;
+    // Saving interval of evaluation function at learning. Save each time you learn this number of phases.
+    // Needless to say, the longer the saving interval, the shorter the learning time.
+    // Folder name is incremented for each save like 0/, 1/, 2/...
+    // By default, once every 100 million phases.
+    constexpr std::size_t LEARN_EVAL_SAVE_INTERVAL = 100'000'000ULL;
 
-		// 1 if the player on this side ultimately wins the game. -1 if you are losing.
-		// 0 if a draw is reached.
-		// The draw is in the teacher position generation command gensfen,
-		// Only write if LEARN_GENSFEN_DRAW_RESULT is enabled.
-		int8_t game_result;
+    // Output rmse during learning only once per this number of times.
+    // The rmse calculation runs in a single thread and takes some time, so reducing how often it is output is effective.
+    constexpr std::size_t LEARN_RMSE_OUTPUT_INTERVAL = 1;
 
-		// When exchanging the file that wrote the teacher aspect with other people
-		//Because this structure size is not fixed, pad it so that it is 40 bytes in any environment.
-		uint8_t padding;
+    // Learning from the generated game record
+    void learn(std::istringstream& is);
 
-		// 32 + 2 + 2 + 2 + 1 + 1 = 40bytes
-	};
+    using CalcLossFunc = ValueWithGrad<double>(Value, Value, int, int);
 
-	// Type that returns the reading line and the evaluation value at that time
-	// Used in Learner::search(), Learner::qsearch().
-	typedef std::pair<Value, std::vector<Move> > ValueAndPV;
+    struct Loss  // Accumulates a ValueWithGrad<double> total and a sample count; mutating members lock m_mutex.
+    {
+        double value() const  // Accumulated (not averaged) value component; read without locking m_mutex.
+        {
+            return m_loss.value;
+        }
 
-	// So far, only Yaneura King 2018 Otafuku has this stub
-	// This stub is required if EVAL_LEARN is defined.
-	extern Learner::ValueAndPV  search(Position& pos, int depth , size_t multiPV = 1 , uint64_t NodesLimit = 0);
-	extern Learner::ValueAndPV qsearch(Position& pos);
+        double grad() const  // Accumulated (not averaged) grad component; read without locking m_mutex.
+        {
+            return m_loss.grad;
+        }
 
-	double calc_grad(Value shallow, const PackedSfenValue& psv);
+        uint64_t count() const  // Accumulated sample count; read without locking m_mutex.
+        {
+            return m_count;
+        }
 
+        Loss() = default;
+        // Copies the totals only: m_mutex is default-constructed (std::mutex is not copyable) and `other` is read without locking it.
+        Loss(const Loss& other) :
+            m_loss(other.m_loss),
+            m_count(other.m_count)
+        {
+        }
+        // Adds one sample while holding m_mutex: accumulates rhs.abs() and increments the count by one.
+        Loss& operator += (const ValueWithGrad<double>& rhs)
+        {
+            std::unique_lock lock(m_mutex);
+
+            m_loss += rhs.abs();  // abs() is defined with ValueWithGrad (not shown here); presumably component-wise -- TODO confirm
+            m_count += 1;
+
+            return *this;
+        }
+        // Merges another accumulator while holding this object's m_mutex; rhs.m_mutex is not taken, so rhs is read unlocked.
+        Loss& operator += (const Loss& rhs)
+        {
+            std::unique_lock lock(m_mutex);
+
+            m_loss += rhs.m_loss.abs();
+            m_count += rhs.m_count;
+
+            return *this;
+        }
+        // Resets the totals to { 0.0, 0.0 } and the count to 0 while holding m_mutex.
+        void reset()
+        {
+            std::unique_lock lock(m_mutex);
+
+            m_loss = ValueWithGrad<double>{ 0.0, 0.0 };
+            m_count = 0;
+        }
+        // Writes the per-sample mean of the value and grad totals to `s`; there is no guard against m_count == 0.
+        template <typename StreamT>
+        void print_with_grad(const std::string& prefix, StreamT& s) const
+        {
+            s << "  - " << prefix << "_loss       = " << m_loss.value / (double)m_count << std::endl;
+            s << "  - " << prefix << "_grad_norm  = " << m_loss.grad / (double)m_count << std::endl;
+        }
+        // Writes only the per-sample mean of the value total to `s`; there is no guard against m_count == 0.
+        template <typename StreamT>
+        void print_only_loss(const std::string& prefix, StreamT& s) const
+        {
+            s << "  - " << prefix << "_loss       = " << m_loss.value / (double)m_count << std::endl;
+        }
+
+    private:
+        ValueWithGrad<double> m_loss{ 0.0, 0.0 };  // running totals (value and grad components)
+        uint64_t m_count{0};  // number of samples accumulated into m_loss
+        std::mutex m_mutex;  // locked by both operator+= overloads and reset(); not by the getters or print functions
+    };
 }
 
-#endif
-
 #endif // ifndef _LEARN_H_
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
deleted file mode 100644
index 9f02a594..00000000
--- a/src/learn/learner.cpp
+++ /dev/null
@@ -1,3569 +0,0 @@
-﻿// learning routines
-//
-// 1) Automatic generation of game records
-// → "gensfen" command
-// 2) Learning evaluation function parameters from the generated game record
-// → "learn" command
-// → Shuffle in the teacher phase is also an extension of this command.
-// Example) "learn shuffle"
-// 3) Automatic generation of fixed traces
-// → "makebook think" command
-// → implemented in extra/book/book.cpp
-// 4) Post-station automatic review mode
-// → I will not be involved in the engine because it is a problem that the GUI should assist.
-// etc..
-
-#if defined(EVAL_LEARN)
-
-#include <chrono>
-#include <filesystem>
-#include <random>
-#include <regex>
-
-#include "learn.h"
-#include "multi_think.h"
-#include "../uci.h"
-#include "../syzygy/tbprobe.h"
-
-// evaluate header for learning
-#include "../eval/evaluate_common.h"
-
-// ----------------------
-// constant string based on the settings
-// ----------------------
-
-// Character string according to update formula. (Output for debugging.)
-// Implemented various update expressions, but concluded that AdaGrad is the best in terms of speed and memory.
-#if defined(ADA_GRAD_UPDATE)
-#define LEARN_UPDATE "AdaGrad"
-#elif defined(SGD_UPDATE)
-#define LEARN_UPDATE "SGD"
-#endif
-
-#if defined(LOSS_FUNCTION_IS_WINNING_PERCENTAGE)
-#define LOSS_FUNCTION "WINNING_PERCENTAGE"
-#elif defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY)
-#define LOSS_FUNCTION "CROSS_ENTOROPY"
-#elif defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE)
-#define LOSS_FUNCTION "CROSS_ENTOROPY_FOR_VALUE"
-#elif defined(LOSS_FUNCTION_IS_ELMO_METHOD)
-#define LOSS_FUNCTION "ELMO_METHOD(WCSC27)"
-#endif
-
-// -----------------------------------
-// Below, the implementation section.
-// -----------------------------------
-
-#include <sstream>
-#include <fstream>
-#include <unordered_set>
-#include <iomanip>
-#include <list>
-#include <cmath>	// std::exp(),std::pow(),std::log()
-#include <cstring>	// memcpy()
-
-#if defined (_OPENMP)
-#include <omp.h>
-#endif
-
-#if defined(_MSC_VER)
-// The C++ filesystem cannot be used unless it is C++17 or later or MSVC.
-// I tried to use windows.h, but with g++ of msys2 I can not get the files in the folder well.
-// Use dirent.h because there is no help for it.
-#include <filesystem>
-#elif defined(__GNUC__)
-#include <dirent.h>
-#endif
-
-#include "../misc.h"
-#include "../thread.h"
-#include "../position.h"
-//#include "../extra/book/book.h"
-#include "../tt.h"
-#include "multi_think.h"
-
-#if defined(EVAL_NNUE)
-#include "../nnue/evaluate_nnue_learner.h"
-#include <climits>
-#include <shared_mutex>
-#endif
-
-using namespace std;
-
-//// This is defined in the search section.
-//extern Book::BookMoveSelector book;
-
-// Addition and subtraction definition for atomic<T>
-// Aligned with atomicAdd() in Apery/learner.hpp.
-template <typename T>
-T operator += (std::atomic<T>& x, const T rhs)
-{
-	T old = x.load(std::memory_order_consume);
-	// It is allowed that the value is rewritten from other thread at this timing.
-	// The idea that the value is not destroyed is good.
-	T desired = old + rhs;
-	while (!x.compare_exchange_weak(old, desired, std::memory_order_release, std::memory_order_consume))
-		desired = old + rhs;
-	return desired;
-}
-template <typename T>
-T operator -= (std::atomic<T>& x, const T rhs) { return x += -rhs; }
-
-namespace Learner
-{
-
-// Phase array: PSVector stands for packed sfen vector.
-typedef std::vector<PackedSfenValue> PSVector;
-
-bool write_out_draw_game_in_training_data_generation = false;
-bool use_draw_games_in_training = false;
-bool use_draw_games_in_validation = false;
-bool skip_duplicated_positions_in_training = true;
-bool detect_draw_by_consecutive_low_score = false;
-bool detect_draw_by_insufficient_mating_material = false;
-// 1.0 / PawnValueEg / 4.0 * log(10.0)
-double winning_probability_coefficient = 0.00276753015984861260098316280611;
-// Score scale factors.  ex) If we set src_score_min_value = 0.0,
-// src_score_max_value = 1.0, dest_score_min_value = 0.0,
-// dest_score_max_value = 10000.0, [0.0, 1.0] will be scaled to [0, 10000].
-double src_score_min_value = 0.0;
-double src_score_max_value = 1.0;
-double dest_score_min_value = 0.0;
-double dest_score_max_value = 1.0;
-// Assume teacher signals are the scores of deep searches, and convert them into winning
-// probabilities in the trainer. Sometimes we want to use the winning probabilities in the training
-// data directly. In those cases, we set false to this variable.
-bool convert_teacher_signal_to_winning_probability = true;
-// Use raw NNUE eval value in the Eval::evaluate(). If hybrid eval is enabled, training data
-// generation and training don't work well.
-// https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
-bool use_raw_nnue_eval = true;
-// Using WDL with win rate model instead of sigmoid
-bool use_wdl = false;
-
-// -----------------------------------
-// write phase file
-// -----------------------------------
-
-// Helper class for exporting Sfen
-struct SfenWriter
-{
-		// File name to write and number of threads to create
-	SfenWriter(string filename, int thread_num)
-	{
-		sfen_buffers_pool.reserve((size_t)thread_num * 10);
-		sfen_buffers.resize(thread_num);
-
-		// When performing additional learning, the quality of the teacher generated after learning the evaluation function does not change much and I want to earn more teacher positions.
-		// Since it is preferable that old teachers also use it, it has such a specification.
-		fs.open(filename, ios::out | ios::binary | ios::app);
-		filename_ = filename;
-
-		finished = false;
-	}
-
-	~SfenWriter()
-	{
-		finished = true;
-		file_worker_thread.join();
-		fs.close();
-
-		// all buffers should be empty since file_worker_thread has written all..
-		for (auto p : sfen_buffers) { assert(p == nullptr); }
-		assert(sfen_buffers_pool.empty());
-	}
-
-	// For each thread, flush the file by this number of phases.
-	const size_t SFEN_WRITE_SIZE = 5000;
-
-	// write one by pairing the phase and evaluation value (in packed sfen format)
-	void write(size_t thread_id, const PackedSfenValue& psv)
-	{
-		// We have a buffer for each thread and add it there.
-		// If the buffer overflows, write it to a file.
-
-		// This buffer is prepared for each thread.
-		auto& buf = sfen_buffers[thread_id];
-
-		// Secure since there is no buf at the first time and immediately after writing the thread buffer.
-		if (!buf)
-		{
-			buf = new PSVector();
-			buf->reserve(SFEN_WRITE_SIZE);
-		}
-
-		// It is prepared for each thread, so one thread does not call this write() function at the same time.
-		// There is no need to exclude at this point.
-		buf->push_back(psv);
-
-		if (buf->size() >= SFEN_WRITE_SIZE)
-		{
-			// If you load it in sfen_buffers_pool, the worker will do the rest.
-
-			// Mutex lock is required when changing the contents of sfen_buffers_pool.
-			std::unique_lock<std::mutex> lk(mutex);
-			sfen_buffers_pool.push_back(buf);
-
-			buf = nullptr;
-			// If you set buf == nullptr, the buffer will be allocated the next time this function is called.
-		}
-	}
-
-	// Move what remains in the buffer for your thread to a buffer for writing to a file.
-	void finalize(size_t thread_id)
-	{
-		std::unique_lock<std::mutex> lk(mutex);
-
-		auto& buf = sfen_buffers[thread_id];
-
-		// There is a case that buf==nullptr, so that check is necessary.
-		if (buf && buf->size() != 0)
-			sfen_buffers_pool.push_back(buf);
-
-		buf = nullptr;
-	}
-
-	// Start the write_worker thread.
-	void start_file_write_worker()
-	{
-		file_worker_thread = std::thread([&] { this->file_write_worker(); });
-	}
-
-	// Dedicated thread to write to file
-	void file_write_worker()
-	{
-		auto output_status = [&]()
-		{
-			// also output the current time
-			sync_cout << endl << sfen_write_count << " sfens , at " << now_string() << sync_endl;
-
-			// This is enough for flush().
-			fs.flush();
-		};
-
-		while (!finished || sfen_buffers_pool.size())
-		{
-			vector<PSVector*> buffers;
-			{
-				std::unique_lock<std::mutex> lk(mutex);
-
-				// copy the whole
-				buffers = sfen_buffers_pool;
-				sfen_buffers_pool.clear();
-			}
-
-			// sleep() if you didn't get anything
-			if (!buffers.size())
-				sleep(100);
-			else
-			{
-				for (auto ptr : buffers)
-				{
-					fs.write((const char*)&((*ptr)[0]), sizeof(PackedSfenValue) * ptr->size());
-
-					sfen_write_count += ptr->size();
-
-#if 1
-					// Add the processed number here, and if it exceeds save_every, change the file name and reset this counter.
-					save_every_counter += ptr->size();
-					if (save_every_counter >= save_every)
-					{
-						save_every_counter = 0;
-						// Change the file name.
-
-						fs.close();
-
-						// Sequential number attached to the file
-						int n = (int)(sfen_write_count / save_every);
-						// Rename the file and open it again. Add ios::app in consideration of overwriting. (Depending on the operation, it may not be necessary.)
-						string filename = filename_ + "_" + std::to_string(n);
-						fs.open(filename, ios::out | ios::binary | ios::app);
-						cout << endl << "output sfen file = " << filename << endl;
-					}
-#endif
-
-					// Output'.' every time when writing a game record.
-					std::cout << ".";
-
-					// Output the number of phases processed every 40 times
-					// Finally, the remainder of the teacher phase of each thread is written out, so halfway numbers are displayed, but is it okay?
-					// If you overuse the threads to the maximum number of logical cores, the console will be clogged, so it may be a little more loose.
-					if ((++time_stamp_count % 40) == 0)
-						output_status();
-
-					// Since this memory is unnecessary, release it at this timing.
-					delete ptr;
-				}
-			}
-		}
-
-		// Output the time stamp again before the end.
-		output_status();
-	}
-
-	// Change the file name in this unit.
-	uint64_t save_every = UINT64_MAX;
-
-private:
-
-	fstream fs;
-
-	// File name passed in the constructor
-	std::string filename_;
-
-	// Add the processed number here, and if it exceeds save_every, change the file name and reset this counter.
-	uint64_t save_every_counter = 0;
-
-	// thread to write to the file
-	std::thread file_worker_thread;
-	// Flag that all threads have finished
-	atomic<bool> finished;
-
-	// Counter for time stamp output
-	uint64_t time_stamp_count = 0;
-
-	// buffer before writing to file
-	// sfen_buffers is the buffer for each thread
-	// sfen_buffers_pool is a buffer for writing.
-	// After loading the phase in the former buffer by SFEN_WRITE_SIZE, transfer it to the latter.
-	std::vector<PSVector*> sfen_buffers;
-	std::vector<PSVector*> sfen_buffers_pool;
-
-	// Mutex required to access sfen_buffers_pool
-	std::mutex mutex;
-
-	// number of written phases
-	uint64_t sfen_write_count = 0;
-};
-
-// -----------------------------------
-// worker that creates the game record (for each thread)
-// -----------------------------------
-
-// Class to generate sfen with multiple threads
-struct MultiThinkGenSfen : public MultiThink
-{
-	MultiThinkGenSfen(int search_depth_, int search_depth2_, SfenWriter& sw_)
-		: search_depth(search_depth_), search_depth2(search_depth2_), sw(sw_)
-	{
-		hash.resize(GENSFEN_HASH_SIZE);
-
-		// Output for confirmation if the same random seed is not drawn when parallelizing and gensfening the PC.
-		std::cout << prng << std::endl;
-	}
-
-	virtual void thread_worker(size_t thread_id);
-	void start_file_write_worker() { sw.start_file_write_worker(); }
-
-	// search_depth = search depth for normal search
-	int search_depth;
-	int search_depth2;
-
-	// Number of the nodes to be searched.
-	// 0 represents no limits.
-	uint64_t nodes;
-
-	// Upper limit of evaluation value of generated situation
-	int eval_limit;
-
-	// minimum ply with random move
-	int random_move_minply;
-	// maximum ply with random move
-	int random_move_maxply;
-	// Number of random moves in one station
-	int random_move_count;
-	// Move balls with a probability of 1/N when randomly moving like Apery.
-	// When you move the ball again, there is a 1/N chance that it will randomly move once in the opponent's number.
-	// Apery has N=2. Specifying 0 here disables this function.
-	int random_move_like_apery;
-
-	// For when using multi pv instead of random move.
-	// random_multi_pv is the number of candidates for MultiPV.
-	// When adopting the move of the candidate move, the difference between the evaluation value of the move of the 1st place and the evaluation value of the move of the Nth place is
-	// Must be in the range random_multi_pv_diff.
-	// random_multi_pv_depth is the search depth for MultiPV.
-	int random_multi_pv;
-	int random_multi_pv_diff;
-	int random_multi_pv_depth;
-
-	// The minimum and maximum ply (number of steps from the initial phase) of the phase to write out.
-	int write_minply;
-	int write_maxply;
-
-	// sfen exporter
-	SfenWriter& sw;
-
-	// hash to limit the export of the same phase
-	// It must be 2**N because it will be used as the mask to calculate hash_index.
-	static const uint64_t GENSFEN_HASH_SIZE = 64 * 1024 * 1024;
-
-	vector<Key> hash; // 64MB*sizeof(HASH_KEY) = 512MB
-};
-
-//  thread_id    = 0..Threads.size()-1
-void MultiThinkGenSfen::thread_worker(size_t thread_id)
-{
-	// For the time being, it will be treated as a draw at the maximum number of steps to write.
-	const int MAX_PLY2 = write_maxply;
-
-	//Maximum StateInfo + Search PV to advance to leaf buffer
-	std::vector<StateInfo,AlignedAllocator<StateInfo>> states(MAX_PLY2 + MAX_PLY /* == search_depth + α */);
-	StateInfo si;
-
-	// This move. Use this move to advance the stage.
-	Move m = MOVE_NONE;
-
-	// end flag
-	bool quit = false;
-
-	// Variables for draw adjudication.
-	// Todo: Make this as an option.
-	int adj_draw_ply = 80; // start the adjudication when ply reaches this value
-	int adj_draw_cnt = 8;  // 4 move scores for each side have to be checked
-	int adj_draw_score = 0;  // move score in CP
-
-	// repeat until the specified number of times
-	while (!quit)
-	{
-		// It is necessary to set a dependent thread for Position.
-		// When parallelizing, Threads (since this is a vector<Thread*>,
-		// Do the same for up to Threads[0]...Threads[thread_num-1].
-		auto th = Threads[thread_id];
-
-		auto& pos = th->rootPos;
-    pos.set(StartFEN, false, &si, th);
-
-    // Test cod for Packed SFEN.
-    //{
-    //  PackedSfen packed_sfen;
-    //  pos.sfen_pack(packed_sfen);
-    //  std::cout << pos << std::endl;
-    //  pos.set_from_packed_sfen(packed_sfen, &si, th);
-    //  std::string actual = pos.fen();
-    //  assert(actual == StartFEN);
-    //}
-
-		// Refer to the members of BookMoveSelector defined in the search section.
-		//auto& book = ::book;
-
-		// Save the situation for one station, and write it out including the winning and losing at the end.
-		// The function to write is flush_psv() below this.
-		PSVector a_psv;
-		a_psv.reserve(MAX_PLY2 + MAX_PLY);
-
-		// Write out the phases loaded in a_psv to a file.
-		// lastTurnIsWin: win/loss in the next phase after the final phase in a_psv
-		// 1 when winning. -1 when losing. Pass 0 for a draw.
-		// Return value: true if the specified number of phases has already been reached and the process ends.
-		auto flush_psv = [&](int8_t lastTurnIsWin)
-		{
-			int8_t isWin = lastTurnIsWin;
-
-			// From the final stage (one step before) to the first stage, give information on the outcome of the game for each stage.
-			// The phases stored in a_psv are assumed to be continuous (in order).
-			for (auto it = a_psv.rbegin(); it != a_psv.rend(); ++it)
-			{
-				// If isWin == 0 (draw), multiply by -1 and it will remain 0 (draw)
-				isWin = - isWin;
-				it->game_result = isWin;
-
-				// When I tried to write out the phase, it reached the specified number of times.
-				// Because the counter is added in get_next_loop_count()
-				// If you don't call this when the phase is output, the counter goes crazy.
-				auto loop_count = get_next_loop_count();
-				if (loop_count == UINT64_MAX)
-				{
-					// Set the end flag.
-					quit = true;
-					return;
-				}
-
-				// Write out one aspect.
-				sw.write(thread_id, *it);
-
-#if 0
-				pos.set_from_packed_sfen(it->sfen);
-				cout << pos << "Win : " << it->isWin << " , " << it->score << endl;
-#endif
-			}
-		};
-
-		// ply flag for whether or not to randomly move by eyes
-		vector<bool> random_move_flag;
-		{
-			// If you want to add a random move, random_move_maxply be sure to enter random_move_count times before the first move.
-			// I want you to disperse so much.
-			// I'm not sure how best it is. Experimenting under various conditions.
-
-			// Make an array like a[0] = 0 ,a[1] = 1, ...
-			// Fisher-Yates shuffle and take out the first N items.
-			// Actually, I only want N pieces, so I only need to shuffle the first N pieces with Fisher-Yates.
-
-			vector<int> a;
-			a.reserve((size_t)random_move_maxply);
-
-			// random_move_minply ,random_move_maxply is specified by 1 origin,
-			// Note that we are handling 0 origin here.
-			for (int i = std::max(random_move_minply - 1 , 0) ; i < random_move_maxply; ++i)
-				a.push_back(i);
-
-			// In case of Apery random move, insert() may be called random_move_count times.
-			// Reserve only the size considering it.
-			random_move_flag.resize((size_t)random_move_maxply + random_move_count);
-
-			// A random move that exceeds the size() of a[] cannot be applied, so limit it.
-			for (int i = 0 ; i < std::min(random_move_count, (int)a.size()) ; ++i)
-			{
-				swap(a[i], a[prng.rand((uint64_t)a.size() - i) + i]);
-				random_move_flag[a[i]] = true;
-			}
-		}
-
-		// A counter that keeps track of the number of random moves
-		// When random_move_minply == -1, random moves are performed continuously, so use it at this time.
-		int random_move_c = 0;
-
-		// Save history of move scores for adjudication
-		vector<int> move_hist_scores;
-
-		// ply: steps from the initial stage
-		for (int ply = 0; ; ++ply)
-		{
-			//cout << pos << endl;
-
-			// Current search depth
-			// Goto will fly, so declare it first.
-			int depth = search_depth + (int)prng.rand(search_depth2 - search_depth + 1);
-
-			// has it reached the length
-			if (ply >= MAX_PLY2)
-			{
-				if (write_out_draw_game_in_training_data_generation) {
-				// Write out as win/loss = draw.
-				// This way it is harder to allow the opponent to enter the ball when I enter (may)
-				flush_psv(0);
-				}
-				break;
-			}
-
-      if (pos.is_draw(ply)) {
-		  if (write_out_draw_game_in_training_data_generation) {
-			  // Write if draw.
-			  flush_psv(0);
-		  }
-        break;
-      }
-
-			// Initialize the Syzygy Ending Tablebase and sort the moves.
-			Search::RootMoves rootMoves;
-			for (const auto& m : MoveList<LEGAL>(pos))
-				rootMoves.emplace_back(m);
-			if (!rootMoves.empty())
-				Tablebases::rank_root_moves(pos, rootMoves);
-
-			// If there is no legal move, terminate the game if position
-			// is mate or a stalemate.
-			else {
-				if (pos.checkers()) // Mate
-					flush_psv(-1);
-				else if (write_out_draw_game_in_training_data_generation) {
-					flush_psv(0); // Stalemate
-				}
-				break;
-			}
-
-			// Adjudicate game to a draw if the last 4 scores of each engine is 0.
-			if (detect_draw_by_consecutive_low_score) {
-				if (ply >= adj_draw_ply) {
-					int draw_cnt = 0;
-					bool is_adj_draw = false;
-
-					for (vector<int>::reverse_iterator it = move_hist_scores.rbegin();
-						it != move_hist_scores.rend(); ++it) 
-					{
-						if (abs(*it) <= adj_draw_score)
-							draw_cnt++;
-						else
-							break;  // score should be successive
-
-						if (draw_cnt >= adj_draw_cnt) {
-							is_adj_draw = true;
-							break;
-						}
-					}
-
-					if (is_adj_draw) {
-						if (write_out_draw_game_in_training_data_generation)
-							flush_psv(0);
-						break;
-					}
-				}
-			}
-
-			// Draw by insufficient mating material
-			if (detect_draw_by_insufficient_mating_material) {
-				if (pos.count<ALL_PIECES>() <= 4) {
-					int pcnt = pos.count<ALL_PIECES>();
-					// (1) KvK
-					if (pcnt == 2) {
-						if (write_out_draw_game_in_training_data_generation)
-							flush_psv(0);
-						break;
-					}
-					// (2) KvK + 1 minor piece
-					if (pcnt == 3) {
-						int minor_pc = pos.count<BISHOP>(WHITE) + pos.count<KNIGHT>(WHITE) +
-							pos.count<BISHOP>(BLACK) + pos.count<KNIGHT>(BLACK);
-						if (minor_pc == 1) {
-							if (write_out_draw_game_in_training_data_generation)
-								flush_psv(0);
-							break;
-						}
-					}
-					// (3) KBvKB, bishops of the same color
-					else if (pcnt == 4) {
-						if (pos.count<BISHOP>(WHITE) == 1 && pos.count<BISHOP>(BLACK) == 1) {
-							// Color of bishops is black.
-							if ((pos.pieces(WHITE, BISHOP) & DarkSquares)
-								&& (pos.pieces(BLACK, BISHOP) & DarkSquares))
-							{
-								if (write_out_draw_game_in_training_data_generation)
-									flush_psv(0);
-								break;
-							}
-							// Color of bishops is white.
-							if ((pos.pieces(WHITE, BISHOP) & ~DarkSquares)
-								&& (pos.pieces(BLACK, BISHOP) & ~DarkSquares))
-							{
-								if (write_out_draw_game_in_training_data_generation)
-									flush_psv(0);
-								break;
-							}
-						}
-					}
-				}
-			}
-
-			//// constant track
-			//if ((m = book.probe(pos)) != MOVE_NONE)
-			//{
-			//  // Hit the constant track.
-			//  // The move was stored in m.
-
-			//  // Do not use the fixed phase for learning.
-			//  a_psv.clear();
-
-			//  if (random_move_minply != -1)
-			// 		// Random move is performed with a certain probability even in the constant phase.
-			// 		goto RANDOM_MOVE;
-			//  else
-			// 		// When -1 is specified as random_move_minply, it points according to the standard until it goes out of the standard.
-			// 		// Prepare an innumerable number of situations that have left the constant as ConsiderationBookMoveCount true using a huge constant
-			// 		// Used for purposes such as performing a random move 5 times from there.
-			// 		goto DO_MOVE;
-			//}
-
-			{
-				// search_depth～search_depth2 Evaluation value of hand reading and PV (best responder row)
-				// There should be no problem if you narrow the search window.
-
-				auto pv_value1 = search(pos, depth, 1, nodes);
-
-				auto value1 = pv_value1.first;
-				auto& pv1 = pv_value1.second;
-
-				// For situations where the absolute evaluation value is greater than or equal to this value
-				// It doesn't make much sense to use that aspect for learning, so this game ends.
-				// Treat this as having won or lost.
-
-				// If you win one move, declarative win, mate_in(2) will be returned here, so it will be the same value as the upper limit of eval_limit,
-				// This if expression is always true. The same applies to resign.
-
-				if (abs(value1) >= eval_limit)
-				{
-					// sync_cout << pos << "eval limit = "<< eval_limit << "over ,move = "<< pv1[0] << sync_endl;
-
-					// If value1 >= eval_limit in this aspect, you win (the turn side of this aspect).
-					flush_psv((value1 >= eval_limit) ? 1 : -1);
-					break;
-				}
-
-				// Verification of a strange move
-				if (pv1.size() > 0
-					&& (pv1[0] == MOVE_NONE || pv1[0] == MOVE_NULL)
-					)
-				{
-					// MOVE_WIN is checking if it is the declaration victory stage before this
-					// The declarative winning move should never come back here.
-					// Also, when MOVE_RESIGN, value1 is a one-stop score, which should be the minimum value of eval_limit (-31998)...
-					cout << "Error! : " << pos.fen() << m << value1 << endl;
-					break;
-				}
-
-				// Save the move score for adjudication.
-				move_hist_scores.push_back(value1);
-
-				// Use PV's move to the leaf node and use the value that evaluated() is called on that leaf node.
-				auto evaluate_leaf = [&](Position& pos , vector<Move>& pv)
-				{
-					auto rootColor = pos.side_to_move();
-
-					int ply2 = ply;
-					for (auto m : pv)
-					{
-						// As a verification for debugging, make sure there are no illegal players in the middle.
-						// NULL_MOVE does not come.
-
-						// I tested it out enough so I can comment it out.
-#if 1
-						// I shouldn't be an illegal player.
-						// declarative win and not mated() are tested above so
-						// It is guaranteed that MOVE_WIN and MOVE_RESIGN do not come as a reader. (Should...)
-						if (!pos.pseudo_legal(m) || !pos.legal(m))
-						{
-							cout << "Error! : " << pos.fen() << m << endl;
-						}
-#endif
-						pos.do_move(m, states[ply2++]);
-						
-						//Because the difference calculation of evaluate() cannot be performed unless each node evaluate() is called!
-						// If the depth is 8 or more, it seems faster not to calculate this difference.
-#if defined(EVAL_NNUE)
-            if (depth < 8)
-              Eval::NNUE::update_eval(pos);
-#endif  // defined(EVAL_NNUE)
-					}
-
-					// reach leaf
-					Value v;
-					if (pos.checkers()) {
-						// Sometime a king is checked.  An example is a case that a checkmate is
-						// found in the search.  If Eval::evaluate() is called whne a king is
-						// checked, classic eval crashes by an assertion.  To avoid crashes, return
-						// value1 instead of the score of the PV leaf.
-						v = value1;
-					}
-					else {
-						v = Eval::evaluate(pos);
-					// evaluate() returns the evaluation value on the turn side, so
-					// If it's a turn different from root_color, you must invert v and return it.
-					if (rootColor != pos.side_to_move())
-						v = -v;
-					}
-
-					// Rewind.
-					// Is it C++x14, and isn't there even foreach to turn in reverse?
-					//  for (auto it : boost::adaptors::reverse(pv))
-
-					for (auto it = pv.rbegin(); it != pv.rend(); ++it)
-						pos.undo_move(*it);
-
-					return v;
-				};
-
-#if 0
-				dbg_hit_on(pv_value1.first == leaf_value);
-				// gensfen depth 3 eval_limit 32000
-				// Total 217749 Hits 203579 hit rate (%) 93.490
-				// gensfen depth 6 eval_limit 32000
-				// Total 78407 Hits 69190 hit rate (%) 88.245
-				// gensfen depth 6 eval_limit 3000
-				// Total 53879 Hits 43713 hit rate (%) 81.132
-
-				// Problems such as pruning with moves in the substitution table.
-				// This is a little uncomfortable as a teacher...
-#endif
-
-				//If depth 0, pv is not obtained, so search again at depth 2.
-				if (search_depth <= 0)
-				{
-					pv_value1 = search(pos, 2);
-					pv1 = pv_value1.second;
-				}
-
-				// The surroundings of the initial stage are all similar
-				// Do not write it out because it can lead to overlearning when used for learning.
-				// → comparative experiment should be done
-				if (ply < write_minply - 1)
-				{
-					a_psv.clear();
-					goto SKIP_SAVE;
-				}
-
-				// Did you just write the same phase?
-				// This may include the same aspect as it is generated in parallel on multiple PCs, so
-				// It is better to do the same process when reading.
-				{
-					auto key = pos.key();
-					auto hash_index = (size_t)(key & (GENSFEN_HASH_SIZE - 1));
-					auto key2 = hash[hash_index];
-					if (key == key2)
-					{
-						// when skipping regarding earlier
-						// Clear the saved situation because the win/loss information will be incorrect.
-						// anyway, when the hash matches, it's likely that the previous phases also match
-						// Not worth writing out.
-						a_psv.clear();
-						goto SKIP_SAVE;
-					}
-					hash[hash_index] = key; // Replace with the current key.
-				}
-
-				// Temporary saving of the situation.
-				{
-					a_psv.emplace_back(PackedSfenValue());
-					auto &psv = a_psv.back();
-
-					// If pack is requested, write the packed sfen and the evaluation value at that time.
-					// The final writing is after winning or losing.
-					pos.sfen_pack(psv.sfen);
-
-          //{
-          //  std::string before_fen = pos.fen();
-          //  pos.set_from_packed_sfen(psv.sfen, &si, th);
-          //  std::string after_fen = pos.fen();
-          //  assert(before_fen == after_fen);
-          //}
-
-					// Get the value of evaluate() as seen from the root color on the leaf node of the PV line.
-					//I don't know the goodness and badness of using the return value of search() as it is.
-					psv.score = evaluate_leaf(pos, pv1);
-					psv.gamePly = ply;
-
-					// Take out the first PV hand. This should be present unless depth 0.
-					assert(pv_value1.second.size() >= 1);
-					Move pv_move1 = pv_value1.second[0];
-					psv.move = pv_move1;
-				}
-
-			SKIP_SAVE:;
-
-				// For some reason, I could not get PV (hit the substitution table etc. and got stuck?) so go to the next game.
-				// It's a rare case, so you can ignore it.
-				if (pv1.size() == 0)
-					break;
-
-				// search_depth Advance the phase by hand reading.
-				m = pv1[0];
-			}
-
-		RANDOM_MOVE:;
-
-			// Phase to randomly choose one from legal hands
-			if (
-				// 1. Random move of random_move_count times from random_move_minply to random_move_maxply
-				(random_move_minply != -1 && ply <(int)random_move_flag.size() && random_move_flag[ply]) ||
-				// 2. A mode to perform random move of random_move_count times after leaving the track
-				(random_move_minply == -1 && random_move_c <random_move_count))
-			{
-				++random_move_c;
-
-				// It's not a mate, so there should be one legal hand...
-				if (random_multi_pv == 0)
-				{
-					// normal random move
-
-					MoveList<LEGAL> list(pos);
-
-					// I don't really know the goodness and badness of making this the Apery method.
-					if (random_move_like_apery == 0
-						|| prng.rand(random_move_like_apery) != 0
-					)
-					{
-						// Normally one move from legal move
-						m = list.at((size_t)prng.rand((uint64_t)list.size()));
-					}
-					else {
-						// if you can move the ball, move the ball
-						Move moves[8]; // Near 8
-						Move* p = &moves[0];
-						for (auto& m : list)
-							if (type_of(pos.moved_piece(m)) == KING)
-								*(p++) = m;
-						size_t n = p - &moves[0];
-						if (n != 0)
-						{
-							// move to move the ball
-							m = moves[prng.rand(n)];
-
-							// In Apery method, at this time there is a 1/2 chance that the opponent will also move randomly
-							if (prng.rand(2) == 0)
-							{
-								// Is it a simple hack to add a "1" next to random_move_flag[ply]?
-								random_move_flag.insert(random_move_flag.begin() + ply + 1, 1, true);
-							}
-						}
-						else
-							// Normally one move from legal move
-							m = list.at((size_t)prng.rand((uint64_t)list.size()));
-					}
-
-					// I put in the code of two handed balls, but if you choose one from legal hands, it should be equivalent to that
-					// I decided it's unnecessary because it just makes the code more complicated.
-				}
-				else {
-					// Since the logic becomes complicated, I'm sorry, I will search again with MultiPV here.
-					Learner::search(pos, random_multi_pv_depth, random_multi_pv);
-					// Select one from the top N hands of root Moves
-
-					auto& rm = pos.this_thread()->rootMoves;
-
-					uint64_t s = min((uint64_t)rm.size(), (uint64_t)random_multi_pv);
-					for (uint64_t i = 1; i < s; ++i)
-					{
-						// The difference from the evaluation value of rm[0] must be within the range of random_multi_pv_diff.
-						// It can be assumed that rm[x].score is arranged in descending order.
-						if (rm[0].score > rm[i].score + random_multi_pv_diff)
-						{
-							s = i;
-							break;
-						}
-					}
-
-					m = rm[prng.rand(s)].pv[0];
-
-					// I haven't written one phase yet, but it ended, so the writing process ends and the next game starts.
-					if (!is_ok(m))
-						break;
-				}
-
-				// When trying to evaluate the move from the outcome of the game,
-				// There is a random move this time, so try not to fall below this.
-				a_psv.clear(); // clear saved aspect
-			}
-
-		DO_MOVE:;
-			pos.do_move(m, states[ply]);
-
-			// Call node evaluate() for each difference calculation.
-			Eval::NNUE::update_eval(pos);
-
-		} // for (int ply = 0; ; ++ply)
-
-	} // while(!quit)
-
-	sw.finalize(thread_id);
-}
-
-// -----------------------------------
-// Command to generate a game record (master thread)
-// -----------------------------------
-
-// Command to generate a game record
-void gen_sfen(Position&, istringstream& is)
-{
-	// number of threads (given by USI setoption)
-	uint32_t thread_num = (uint32_t)Options["Threads"];
-
-	// Number of generated game records default = 8 billion phases (Ponanza specification)
-	uint64_t loop_max = 8000000000UL;
-
-	// Stop the generation when the evaluation value reaches this value.
-	int eval_limit = 3000;
-
-	// search depth
-	int search_depth = 3;
-	int search_depth2 = INT_MIN;
-
-	// Number of nodes to be searched.
-	uint64_t nodes = 0;
-
-	// minimum ply, maximum ply and number of random moves
-	int random_move_minply = 1;
-	int random_move_maxply = 24;
-	int random_move_count = 5;
-	// A function to move the random move mainly like Apery
-	// If this is set to 3, the ball will move with a probability of 1/3.
-	int random_move_like_apery = 0;
-	// If you search with multipv instead of random move and choose from among them randomly, set random_multi_pv = 1 or more.
-	int random_multi_pv = 0;
-	int random_multi_pv_diff = 32000;
-	int random_multi_pv_depth = INT_MIN;
-
-	// The minimum and maximum ply (number of steps from the initial phase) of the phase to write out.
-	int write_minply = 16;
-	int write_maxply = 400;
-
-	// File name to write
-	string output_file_name = "generated_kifu.bin";
-
-	string token;
-
-	// When hit to eval hash, as a evaluation value near the initial stage, if a hash collision occurs and a large value is written
-	// When eval_limit is set small, eval_limit will be exceeded every time in the initial phase, and phase generation will not proceed.
-	// Therefore, eval hash needs to be disabled.
-	// After that, when the hash of the eval hash collides, the evaluation value of a strange value is used, and it may be unpleasant to use it for the teacher.
-	bool use_eval_hash = false;
-
-	// Save to file in this unit.
-	// File names are serialized like file_1.bin, file_2.bin.
-	uint64_t save_every = UINT64_MAX;
-
-	// Add a random number to the end of the file name.
-	bool random_file_name = false;
-
-	while (true)
-	{
-		token = "";
-		is >> token;
-		if (token == "")
-			break;
-
-		if (token == "depth")
-			is >> search_depth;
-		else if (token == "depth2")
-			is >> search_depth2;
-		else if (token == "nodes")
-			is >> nodes;
-		else if (token == "loop")
-			is >> loop_max;
-		else if (token == "output_file_name")
-			is >> output_file_name;
-		else if (token == "eval_limit")
-		{
-			is >> eval_limit;
-			// Limit the maximum to a one-stop score. (Otherwise you might not end the loop)
-			eval_limit = std::min(eval_limit, (int)mate_in(2));
-		}
-		else if (token == "random_move_minply")
-			is >> random_move_minply;
-		else if (token == "random_move_maxply")
-			is >> random_move_maxply;
-		else if (token == "random_move_count")
-			is >> random_move_count;
-		else if (token == "random_move_like_apery")
-			is >> random_move_like_apery;
-		else if (token == "random_multi_pv")
-			is >> random_multi_pv;
-		else if (token == "random_multi_pv_diff")
-			is >> random_multi_pv_diff;
-		else if (token == "random_multi_pv_depth")
-			is >> random_multi_pv_depth;
-		else if (token == "write_minply")
-			is >> write_minply;
-		else if (token == "write_maxply")
-			is >> write_maxply;
-		else if (token == "use_eval_hash")
-			is >> use_eval_hash;
-		else if (token == "save_every")
-			is >> save_every;
-		else if (token == "random_file_name")
-			is >> random_file_name;
-		// Accept also the old option name.
-		else if (token == "use_draw_in_training_data_generation" || token == "write_out_draw_game_in_training_data_generation")
-			is >> write_out_draw_game_in_training_data_generation;
-		// Accept also the old option name.
-		else if (token == "use_game_draw_adjudication" || token == "detect_draw_by_consecutive_low_score")
-			is >> detect_draw_by_consecutive_low_score;
-		else if (token == "detect_draw_by_insufficient_mating_material")
-			is >> detect_draw_by_insufficient_mating_material;
-		else if (token == "use_raw_nnue_eval")
-			is >> use_raw_nnue_eval;
-		else
-			cout << "Error! : Illegal token " << token << endl;
-	}
-
-#if defined(USE_GLOBAL_OPTIONS)
-	// Save it for later restore.
-	auto oldGlobalOptions = GlobalOptions;
-	GlobalOptions.use_eval_hash = use_eval_hash;
-#endif
-
-	// If search depth2 is not set, leave it the same as search depth.
-	if (search_depth2 == INT_MIN)
-		search_depth2 = search_depth;
-	if (random_multi_pv_depth == INT_MIN)
-		random_multi_pv_depth = search_depth;
-
-	if (random_file_name)
-	{
-		// Give a random number to output_file_name at this point.
-		// Do not use std::random_device().  Because it always the same integers on MinGW.
-		PRNG r(std::chrono::system_clock::now().time_since_epoch().count());
-		// Just in case, reassign the random numbers.
-		for(int i=0;i<10;++i)
-			r.rand(1);
-		auto to_hex = [](uint64_t u){
-			std::stringstream ss;
-			ss << std::hex << u;
-			return ss.str();
-		};
-		// I don't want to wear 64bit numbers by accident, so I'm going to make a 64bit number 2 just in case.
-		output_file_name = output_file_name + "_" + to_hex(r.rand<uint64_t>()) + to_hex(r.rand<uint64_t>());
-	}
-
-	std::cout << "gensfen : " << endl
-		<< "  search_depth = " << search_depth << " to " << search_depth2 << endl
-		<< "  nodes = " << nodes << endl
-		<< "  loop_max = " << loop_max << endl
-		<< "  eval_limit = " << eval_limit << endl
-		<< "  thread_num (set by USI setoption) = " << thread_num << endl
-		//<< "  book_moves (set by USI setoption) = " << Options["BookMoves"] << endl
-		<< "  random_move_minply     = " << random_move_minply << endl
-		<< "  random_move_maxply     = " << random_move_maxply << endl
-		<< "  random_move_count      = " << random_move_count << endl
-		<< "  random_move_like_apery = " << random_move_like_apery << endl
-		<< "  random_multi_pv        = " << random_multi_pv << endl
-		<< "  random_multi_pv_diff   = " << random_multi_pv_diff << endl
-		<< "  random_multi_pv_depth  = " << random_multi_pv_depth << endl
-		<< "  write_minply           = " << write_minply << endl
-		<< "  write_maxply           = " << write_maxply << endl
-		<< "  output_file_name       = " << output_file_name << endl
-		<< "  use_eval_hash          = " << use_eval_hash << endl
-		<< "  save_every             = " << save_every << endl
-		<< "  random_file_name       = " << random_file_name << endl
-		<< "  write_out_draw_game_in_training_data_generation = " << write_out_draw_game_in_training_data_generation << endl
-		<< "  detect_draw_by_consecutive_low_score = " << detect_draw_by_consecutive_low_score << endl
-		<< "  detect_draw_by_insufficient_mating_material = " << detect_draw_by_insufficient_mating_material << endl;
-
-	// Show if the training data generator uses NNUE.
-	Eval::verify_NNUE();
-
-	// Create and execute threads as many as Options["Threads"].
-	{
-		SfenWriter sw(output_file_name, thread_num);
-		sw.save_every = save_every;
-
-		MultiThinkGenSfen multi_think(search_depth, search_depth2, sw);
-		multi_think.nodes = nodes;
-		multi_think.set_loop_max(loop_max);
-		multi_think.eval_limit = eval_limit;
-		multi_think.random_move_minply = random_move_minply;
-		multi_think.random_move_maxply = random_move_maxply;
-		multi_think.random_move_count = random_move_count;
-		multi_think.random_move_like_apery = random_move_like_apery;
-		multi_think.random_multi_pv = random_multi_pv;
-		multi_think.random_multi_pv_diff = random_multi_pv_diff;
-		multi_think.random_multi_pv_depth = random_multi_pv_depth;
-		multi_think.write_minply = write_minply;
-		multi_think.write_maxply = write_maxply;
-		multi_think.start_file_write_worker();
-		multi_think.go_think();
-
-		// Since we are joining with the destructor of SfenWriter, please give a message that it has finished after the join
-		// Enclose this in a block because it should be displayed.
-	}
-
-	std::cout << "gensfen finished." << endl;
-
-#if defined(USE_GLOBAL_OPTIONS)
-	// Restore Global Options.
-	GlobalOptions = oldGlobalOptions;
-#endif
-
-}
-
-// -----------------------------------
-// command to learn from the generated game (learn)
-// -----------------------------------
-
-// ordinary sigmoid function
-double sigmoid(double x)
-{
-	return 1.0 / (1.0 + std::exp(-x));
-}
-
-// A function that converts the evaluation value to the winning rate [0,1]
-double winning_percentage(double value)
-{
-	// 1/(1+10^(-Eval/4))
-	// = 1/(1+e^(-Eval/4*ln(10))
-	// = sigmoid(Eval/4*ln(10))
-	return sigmoid(value * winning_probability_coefficient);
-}
-
-// A function that converts the evaluation value to the winning rate [0,1]
-double winning_percentage_wdl(double value, int ply)
-{
-	double wdl_w = UCI::win_rate_model_double( value, ply);
-	double wdl_l = UCI::win_rate_model_double(-value, ply);
-	double wdl_d = 1000.0 - wdl_w - wdl_l;
-
-	return (wdl_w + wdl_d / 2.0) / 1000.0;
-}
-
-// A function that converts the evaluation value to the winning rate [0,1]
-double winning_percentage(double value, int ply)
-{
-	if (use_wdl) {
-		return winning_percentage_wdl(value, ply);
-	}
-	else {
-		return winning_percentage(value);
-	}
-}
-
-double calc_cross_entropy_of_winning_percentage(double deep_win_rate, double shallow_eval, int ply)
-{
-	double p = deep_win_rate;
-	double q = winning_percentage(shallow_eval, ply);
-	return -p * std::log(q) - (1 - p) * std::log(1 - q);
-}
-
-double calc_d_cross_entropy_of_winning_percentage(double deep_win_rate, double shallow_eval, int ply)
-{
-	constexpr double epsilon = 0.000001;
-	double y1 = calc_cross_entropy_of_winning_percentage(deep_win_rate, shallow_eval          , ply);
-	double y2 = calc_cross_entropy_of_winning_percentage(deep_win_rate, shallow_eval + epsilon, ply);
-
-	// Divide by the winning_probability_coefficient to match scale with the sigmoidal win rate
-	return ((y2 - y1) / epsilon) / winning_probability_coefficient;
-}
-
-double dsigmoid(double x)
-{
-	// Sigmoid function
-	// f(x) = 1/(1+exp(-x))
-	// the first derivative is
-	// f'(x) = df/dx = f(x)・{ 1-f(x)}
-	// becomes
-
-	return sigmoid(x) * (1.0 - sigmoid(x));
-}
-
-// When the objective function is the sum of squares of the difference in winning percentage
-#if defined (LOSS_FUNCTION_IS_WINNING_PERCENTAGE)
-// function to calculate the gradient
-double calc_grad(Value deep, Value shallow, PackedSfenValue& psv)
-{
-	// The square of the win rate difference minimizes it in the objective function.
-	// Objective function J = 1/2m Σ (win_rate(shallow)-win_rate(deep) )^2
-	// However, σ is a sigmoid function that converts the evaluation value into the difference in the winning percentage.
-	// m is the number of samples. shallow is the evaluation value for a shallow search (qsearch()). deep is the evaluation value for deep search.
-	// If W is the feature vector (parameter of the evaluation function) and Xi and Yi are teachers
-	// shallow = W*Xi // * is the Hadamard product, transposing W and meaning X
-	// f(Xi) = win_rate(W*Xi)
-	// If σ(i th deep) = Yi,
-	// J = m/2 Σ (f(Xi)-Yi )^2
-	// becomes a common expression.
-	// W is a vector, and if we write the jth element as Wj, from the chain rule
-	// ∂J/∂Wj = ∂J/∂f ・∂f/∂W ・∂W/∂Wj
-	// = 1/m Σ (f(Xi)-y) ・f'(Xi) ・ 1
-
-	// 1/m will be multiplied later, but the contents of Σ can be retained in the array as the value of the gradient.
-	// f'(Xi) = win_rate'(shallow) = sigmoid'(shallow/600) = dsigmoid(shallow / 600) / 600
-	// This /600 at the end is adjusted by the learning rate, so do not write it..
-	// Also, the coefficient of 1/m is unnecessary if you use the update formula that has the automatic gradient adjustment function like Adam and AdaGrad.
-	// Therefore, it is not necessary to save it in memory.
-
-	double p = winning_percentage(deep);
-	double q = winning_percentage(shallow);
-	return (q - p) * dsigmoid(double(shallow) / 600.0);
-}
-#endif
-
-#if defined (LOSS_FUNCTION_IS_CROSS_ENTOROPY)
-double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
-{
-	// Objective function with cross entropy
-
-	// For the concept and nature of cross entropy,
-	// http://nnadl-ja.github.io/nnadl_site_ja/chap3.html#the_cross-entropy_cost_function
-	// http://postd.cc/visual-information-theory-3/
-	// Refer to etc.
-
-	// Objective function design)
-	// We want to make the distribution of p closer to the distribution of q → Think of it as the problem of minimizing the cross entropy between the probability distributions of p and q.
-	// J = H(p,q) =-Σ p(x) log(q(x)) = -p log q-(1-p) log(1-q)
-	// x
-
-	// p is a constant and q is a Wi function (q = σ(W・Xi) ).
-	// ∂J/∂Wi = -p・q'/q-(1-p)(1-q)'/(1-q)
-	// = ...
-	// = q-p.
-
-	double p = winning_percentage(deep);
-	double q = winning_percentage(shallow);
-
-	return q - p;
-}
-#endif
-
-#if defined ( LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE )
-double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
-{
-	// Version that does not pass the winning percentage function
-	// This, unless EVAL_LIMIT is set low, trying to match the evaluation value with the shape of the end stage
-	// eval may exceed the range of eval.
-	return shallow - deep;
-}
-#endif
-
-#if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
-
-// A constant used in elmo (WCSC27). Adjustment required.
-// Since elmo does not internally divide the expression, the value is different.
-// You can set this value with the learn command.
-// 0.33 is equivalent to the constant (0.5) used in elmo (WCSC27)
-double ELMO_LAMBDA = 0.33;
-double ELMO_LAMBDA2 = 0.33;
-double ELMO_LAMBDA_LIMIT = 32000;
-
-double calc_grad(Value teacher_signal, Value shallow , const PackedSfenValue& psv)
-{
-	// elmo (WCSC27) method
-	// Correct with the actual game wins and losses.
-
-	// Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
-	double scaled_teacher_signal = teacher_signal;
-	// Normalize to [0.0, 1.0].
-	scaled_teacher_signal = (scaled_teacher_signal - src_score_min_value) / (src_score_max_value - src_score_min_value);
-	// Scale to [dest_score_min_value, dest_score_max_value].
-	scaled_teacher_signal = scaled_teacher_signal * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
-
-	const double q = winning_percentage(shallow, psv.gamePly);
-	// Teacher winning probability.
-	double p = scaled_teacher_signal;
-	if (convert_teacher_signal_to_winning_probability) {
-		p = winning_percentage(scaled_teacher_signal, psv.gamePly);
-	}
-
-	// Use 1 as the correction term if the expected win rate is 1, 0 if you lose, and 0.5 if you draw.
-	// game_result = 1,0,-1 so add 1 and divide by 2.
-	const double t = double(psv.game_result + 1) / 2;
-
-	// If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT, apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
-	const double lambda = (abs(teacher_signal) >= ELMO_LAMBDA_LIMIT) ? ELMO_LAMBDA2 : ELMO_LAMBDA;
-
-	double grad;
-	if (use_wdl) {
-		double dce_p = calc_d_cross_entropy_of_winning_percentage(p, shallow, psv.gamePly);
-		double dce_t = calc_d_cross_entropy_of_winning_percentage(t, shallow, psv.gamePly);
-		grad = lambda * dce_p + (1.0 - lambda) * dce_t;
-	}
-	else {
-		// Use the actual win rate as a correction term.
-		// This is the idea of ​​elmo (WCSC27), modern O-parts.
-		grad = lambda * (q - p) + (1.0 - lambda) * (q - t);
-	}
-
-	return grad;
-}
-
-// Calculate cross entropy during learning
-// The individual cross entropy of the win/loss term and win rate term of the elmo expression is returned to the arguments cross_entropy_eval and cross_entropy_win.
-void calc_cross_entropy(Value teacher_signal, Value shallow, const PackedSfenValue& psv,
-	double& cross_entropy_eval, double& cross_entropy_win, double& cross_entropy,
-	double& entropy_eval, double& entropy_win, double& entropy)
-{
-	// Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
-	double scaled_teacher_signal = teacher_signal;
-	// Normalize to [0.0, 1.0].
-	scaled_teacher_signal = (scaled_teacher_signal - src_score_min_value) / (src_score_max_value - src_score_min_value);
-	// Scale to [dest_score_min_value, dest_score_max_value].
-	scaled_teacher_signal = scaled_teacher_signal * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
-
-	// Teacher winning probability.
-	double p = scaled_teacher_signal;
-	if (convert_teacher_signal_to_winning_probability) {
-		p = winning_percentage(scaled_teacher_signal);
-	}
-	const double q /* eval_winrate    */ = winning_percentage(shallow);
-	const double t = double(psv.game_result + 1) / 2;
-
-	constexpr double epsilon = 0.000001;
-
-	// If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT, apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
-	const double lambda = (abs(teacher_signal) >= ELMO_LAMBDA_LIMIT) ? ELMO_LAMBDA2 : ELMO_LAMBDA;
-
-	const double m = (1.0 - lambda) * t + lambda * p;
-
-	cross_entropy_eval =
-		(-p * std::log(q + epsilon) - (1.0 - p) * std::log(1.0 - q + epsilon));
-	cross_entropy_win =
-		(-t * std::log(q + epsilon) - (1.0 - t) * std::log(1.0 - q + epsilon));
-	entropy_eval =
-		(-p * std::log(p + epsilon) - (1.0 - p) * std::log(1.0 - p + epsilon));
-	entropy_win =
-		(-t * std::log(t + epsilon) - (1.0 - t) * std::log(1.0 - t + epsilon));
-
-	cross_entropy =
-		(-m * std::log(q + epsilon) - (1.0 - m) * std::log(1.0 - q + epsilon));
-	entropy =
-		(-m * std::log(m + epsilon) - (1.0 - m) * std::log(1.0 - m + epsilon));
-}
-
-#endif
-
-
-// Other variations may be prepared as the objective function..
-
-
-double calc_grad(Value shallow, const PackedSfenValue& psv) {
-	return calc_grad((Value)psv.score, shallow, psv);
-}
-
-// Sfen reader
-struct SfenReader
-{
-	// Do not use std::random_device().  Because it always the same integers on MinGW.
-	SfenReader(int thread_num) : prng(std::chrono::system_clock::now().time_since_epoch().count())
-	{
-		packed_sfens.resize(thread_num);
-		total_read = 0;
-		total_done = 0;
-		last_done = 0;
-		next_update_weights = 0;
-		save_count = 0;
-		end_of_files = false;
-		no_shuffle = false;
-		stop_flag = false;
-
-		hash.resize(READ_SFEN_HASH_SIZE);
-	}
-
-	~SfenReader()
-	{
-		if (file_worker_thread.joinable())
-			file_worker_thread.join();
-
-		for (auto p : packed_sfens)
-			delete p;
-		for (auto p : packed_sfens_pool)
-			delete p;
-	}
-
-	// number of phases used for calculation such as mse
-	// mini-batch size = 1M is standard, so 0.2% of that should be negligible in terms of time.
-	//Since search() is performed with depth = 1 in calculation of move match rate, simple comparison is not possible...
-	const uint64_t sfen_for_mse_size = 2000;
-
-	// Load the phase for calculation such as mse.
-	void read_for_mse()
-	{
-		auto th = Threads.main();
-		Position& pos = th->rootPos;
-		for (uint64_t i = 0; i < sfen_for_mse_size; ++i)
-		{
-			PackedSfenValue ps;
-			if (!read_to_thread_buffer(0, ps))
-			{
-				cout << "Error! read packed sfen , failed." << endl;
-				break;
-			}
-			sfen_for_mse.push_back(ps);
-
-			// Get the hash key.
-			StateInfo si;
-			pos.set_from_packed_sfen(ps.sfen,&si,th);
-			sfen_for_mse_hash.insert(pos.key());
-		}
-	}
-
-	void read_validation_set(const string file_name, int eval_limit)
-	{
-		ifstream fs(file_name, ios::binary);
-
-		while (fs)
-		{
-			PackedSfenValue p;
-			if (fs.read((char*)&p, sizeof(PackedSfenValue)))
-			{
-				if (eval_limit < abs(p.score))
-					continue;
-				if (!use_draw_games_in_validation && p.game_result == 0)
-					continue;
-				sfen_for_mse.push_back(p);
-			} else {
-				break;
-			}
-		}
-	}
-
-	// Number of phases buffered by each thread 0.1M phases. 4M phase at 40HT
-	const size_t THREAD_BUFFER_SIZE = 10 * 1000;
-
-	// Buffer for reading files (If this is made larger, the shuffle becomes larger and the phases may vary.
-	// If it is too large, the memory consumption will increase.
-	// SFEN_READ_SIZE is a multiple of THREAD_BUFFER_SIZE.
-	const size_t SFEN_READ_SIZE = LEARN_SFEN_READ_SIZE;
-
-	// [ASYNC] Thread returns one aspect. Otherwise returns false.
-	bool read_to_thread_buffer(size_t thread_id, PackedSfenValue& ps)
-	{
-		// If there are any positions left in the thread buffer, retrieve one and return it.
-		auto& thread_ps = packed_sfens[thread_id];
-
-		// Fill the read buffer if there is no remaining buffer, but if it doesn't even exist, finish.
-		if ((thread_ps == nullptr || thread_ps->size() == 0) // If the buffer is empty, fill it.
-			&& !read_to_thread_buffer_impl(thread_id))
-			return false;
-
-		// read_to_thread_buffer_impl() returned true,
-		// Since the filling of the thread buffer with the phase has been completed successfully
-		// thread_ps->rbegin() is alive.
-
-		ps = *(thread_ps->rbegin());
-		thread_ps->pop_back();
-
-		// If you've run out of buffers, call delete yourself to free this buffer.
-		if (thread_ps->size() == 0)
-		{
-
-			delete thread_ps;
-			thread_ps = nullptr;
-		}
-
-		return true;
-	}
-
-	// [ASYNC] Read some aspects into thread buffer.
-	bool read_to_thread_buffer_impl(size_t thread_id)
-	{
-		while (true)
-		{
-			{
-				std::unique_lock<std::mutex> lk(mutex);
-				// If you can fill from the file buffer, that's fine.
-				if (packed_sfens_pool.size() != 0)
-				{
-					// It seems that filling is possible, so fill and finish.
-
-					packed_sfens[thread_id] = packed_sfens_pool.front();
-					packed_sfens_pool.pop_front();
-
-					total_read += THREAD_BUFFER_SIZE;
-
-					return true;
-				}
-			}
-
-			// The file to read is already gone. No more use.
-			if (end_of_files)
-				return false;
-
-			// Waiting for file worker to fill packed_sfens_pool.
-			// The mutex isn't locked, so it should fill up soon.
-			sleep(1);
-		}
-
-	}
-
-	// Start a thread that loads the phase file in the background.
-	void start_file_read_worker()
-	{
-		file_worker_thread = std::thread([&] { this->file_read_worker(); });
-	}
-
-	// for file read-only threads
-	void file_read_worker()
-	{
-		auto open_next_file = [&]()
-		{
-			if (fs.is_open())
-				fs.close();
-
-			// no more
-			if (filenames.size() == 0)
-				return false;
-
-			// Get the next file name.
-			string filename = *filenames.rbegin();
-			filenames.pop_back();
-
-			fs.open(filename, ios::in | ios::binary);
-			cout << "open filename = " << filename << endl;
-			assert(fs);
-
-			return true;
-		};
-
-		while (true)
-		{
-			// Wait for the buffer to run out.
-			// This size() is read only, so you don't need to lock it.
-			while (!stop_flag && packed_sfens_pool.size() >= SFEN_READ_SIZE / THREAD_BUFFER_SIZE)
-				sleep(100);
-			if (stop_flag)
-				return;
-
-			PSVector sfens;
-			sfens.reserve(SFEN_READ_SIZE);
-
-			// Read from the file into the file buffer.
-			while (sfens.size() < SFEN_READ_SIZE)
-			{
-				PackedSfenValue p;
-				if (fs.read((char*)&p, sizeof(PackedSfenValue)))
-				{
-					sfens.push_back(p);
-				} else
-				{
-					// read failure
-					if (!open_next_file())
-					{
-						// There was no next file. Abon.
-						cout << "..end of files." << endl;
-						end_of_files = true;
-						return;
-					}
-				}
-			}
-
-			// Shuffle the read phase data.
-			// random shuffle by Fisher-Yates algorithm
-
-			if (!no_shuffle)
-			{
-				auto size = sfens.size();
-				for (size_t i = 0; i < size; ++i)
-					swap(sfens[i], sfens[(size_t)(prng.rand((uint64_t)size - i) + i)]);
-			}
-
-			// Divide this by THREAD_BUFFER_SIZE. There should be size pieces.
-			// SFEN_READ_SIZE shall be a multiple of THREAD_BUFFER_SIZE.
-			assert((SFEN_READ_SIZE % THREAD_BUFFER_SIZE)==0);
-
-			auto size = size_t(SFEN_READ_SIZE / THREAD_BUFFER_SIZE);
-			std::vector<PSVector*> ptrs;
-			ptrs.reserve(size);
-
-			for (size_t i = 0; i < size; ++i)
-			{
-				// Delete this pointer on the receiving side.
-				PSVector* ptr = new PSVector();
-				ptr->resize(THREAD_BUFFER_SIZE);
-				memcpy(&((*ptr)[0]), &sfens[i * THREAD_BUFFER_SIZE], sizeof(PackedSfenValue) * THREAD_BUFFER_SIZE);
-
-				ptrs.push_back(ptr);
-			}
-
-			// Since sfens is ready, look at the occasion and copy
-			{
-				std::unique_lock<std::mutex> lk(mutex);
-
-				// You can ignore this time because you just copy the pointer...
-				// The mutex lock is required because the contents of packed_sfens_pool are changed.
-
-				for (size_t i = 0; i < size; ++i)
-					packed_sfens_pool.push_back(ptrs[i]);
-			}
-		}
-	}
-
-	// sfen files
-	vector<string> filenames;
-
-	// number of phases read (file to memory buffer)
-	atomic<uint64_t> total_read;
-
-	// number of processed phases
-	atomic<uint64_t> total_done;
-
-	// number of cases processed so far
-	uint64_t last_done;
-
-	// If total_read exceeds this value, update_weights() and calculate mse.
-	uint64_t next_update_weights;
-
-	uint64_t save_count;
-
-	// Do not shuffle when reading the phase.
-	bool no_shuffle;
-
-	bool stop_flag;
-
-	// Determine if it is a phase for calculating rmse.
-	// (The computational aspects of rmse should not be used for learning.)
-	bool is_for_rmse(Key key) const
-	{
-			return sfen_for_mse_hash.count(key) != 0;
-	}
-
-	// hash to limit the reading of the same situation
-	// Is there too many 64 million phases? Or Not really..
-	// It must be 2**N because it will be used as the mask to calculate hash_index.
-	static const uint64_t READ_SFEN_HASH_SIZE = 64 * 1024 * 1024;
-	vector<Key> hash; // 64MB*8 = 512MB
-
-	// test phase for mse calculation
-	PSVector sfen_for_mse;
-
-protected:
-
-	// worker thread reading file in background
-	std::thread file_worker_thread;
-
-	// Random number to shuffle when reading the phase
-	PRNG prng;
-
-	// Did you read the files and reached the end?
-	atomic<bool> end_of_files;
-
-
-	// handle of sfen file
-	std::fstream fs;
-
-	// sfen for each thread
-	// (When the thread is used up, the thread should call delete to release it.)
-	std::vector<PSVector*> packed_sfens;
-
-	// Mutex when accessing packed_sfens_pool
-	std::mutex mutex;
-
-	// pool of sfen. The worker thread read from the file is added here.
-	// Each worker thread fills its own packed_sfens[thread_id] from here.
-	// * Lock and access the mutex.
-	std::list<PSVector*> packed_sfens_pool;
-
-	// Hold the hash key so that the mse calculation phase is not used for learning.
-	std::unordered_set<Key> sfen_for_mse_hash;
-};
-
-// Class to generate sfen with multiple threads
-struct LearnerThink: public MultiThink
-{
-	LearnerThink(SfenReader& sr_):sr(sr_),stop_flag(false), save_only_once(false)
-	{
-#if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
-		learn_sum_cross_entropy_eval = 0.0;
-		learn_sum_cross_entropy_win = 0.0;
-		learn_sum_cross_entropy = 0.0;
-		learn_sum_entropy_eval = 0.0;
-		learn_sum_entropy_win = 0.0;
-		learn_sum_entropy = 0.0;
-#endif
-#if defined(EVAL_NNUE)
-		newbob_scale = 1.0;
-		newbob_decay = 1.0;
-		newbob_num_trials = 2;
-		best_loss = std::numeric_limits<double>::infinity();
-		latest_loss_sum = 0.0;
-		latest_loss_count = 0;
-#endif
-	}
-
-	virtual void thread_worker(size_t thread_id);
-
-	// Start a thread that loads the phase file in the background.
-	void start_file_read_worker() { sr.start_file_read_worker(); }
-
-	// save merit function parameters to a file
-	bool save(bool is_final=false);
-
-	// sfen reader
-	SfenReader& sr;
-
-	// Learning iteration counter
-	uint64_t epoch = 0;
-
-	// Mini batch size size. Be sure to set it on the side that uses this class.
-	uint64_t mini_batch_size = 1000*1000;
-
-	bool stop_flag;
-
-	// Discount rate
-	double discount_rate;
-
-	// Option to exclude early stage from learning
-	int reduction_gameply;
-
-	// Option not to learn kk/kkp/kpp/kppp
-	std::array<bool,4> freeze;
-
-	// If the absolute value of the evaluation value of the deep search of the teacher phase exceeds this value, discard the teacher phase.
-	int eval_limit;
-
-	// Flag whether to dig a folder each time the evaluation function is saved.
-	// If true, do not dig the folder.
-	bool save_only_once;
-
-	// --- loss calculation
-
-#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
-	// For calculation of learning data loss
-	atomic<double> learn_sum_cross_entropy_eval;
-	atomic<double> learn_sum_cross_entropy_win;
-	atomic<double> learn_sum_cross_entropy;
-	atomic<double> learn_sum_entropy_eval;
-	atomic<double> learn_sum_entropy_win;
-	atomic<double> learn_sum_entropy;
-#endif
-
-#if defined(EVAL_NNUE)
-	shared_timed_mutex nn_mutex;
-	double newbob_scale;
-	double newbob_decay;
-	int newbob_num_trials;
-	double best_loss;
-	double latest_loss_sum;
-	uint64_t latest_loss_count;
-	std::string best_nn_directory;
-#endif
-
-	uint64_t eval_save_interval;
-	uint64_t loss_output_interval;
-	uint64_t mirror_percentage;
-
-	// Loss calculation.
-	// done: Number of phases targeted this time
-	void calc_loss(size_t thread_id , uint64_t done);
-
-	// Define the loss calculation in ↑ as a task and execute it
-	TaskDispatcher task_dispatcher;
-};
-
-void LearnerThink::calc_loss(size_t thread_id, uint64_t done)
-{
-	// There is no point in hitting the replacement table, so at this timing the generation of the replacement table is updated.
-	// It doesn't matter if you have disabled the substitution table.
-	TT.new_search();
-
-
-#if defined(EVAL_NNUE)
-	std::cout << "PROGRESS: " << now_string() << ", ";
-	std::cout << sr.total_done << " sfens";
-	std::cout << ", iteration " << epoch;
-	std::cout << ", eta = " << Eval::get_eta() << ", ";
-#endif
-
-#if !defined(LOSS_FUNCTION_IS_ELMO_METHOD)
-	double sum_error = 0;
-	double sum_error2 = 0;
-	double sum_error3 = 0;
-#endif
-
-#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
-	// For calculation of verification data loss
-	atomic<double> test_sum_cross_entropy_eval,test_sum_cross_entropy_win,test_sum_cross_entropy;
-	atomic<double> test_sum_entropy_eval,test_sum_entropy_win,test_sum_entropy;
-	test_sum_cross_entropy_eval = 0;
-	test_sum_cross_entropy_win = 0;
-	test_sum_cross_entropy = 0;
-	test_sum_entropy_eval = 0;
-	test_sum_entropy_win = 0;
-	test_sum_entropy = 0;
-
-	// norm for learning
-	atomic<double> sum_norm;
-	sum_norm = 0;
-#endif
-
-	// The number of times the pv first move of deep search matches the pv first move of search(1).
-	atomic<int> move_accord_count;
-	move_accord_count = 0;
-
-	// Display the value of eval() in the initial stage of Hirate and see the shaking.
-	auto th = Threads[thread_id];
-	auto& pos = th->rootPos;
-	StateInfo si;
-  pos.set(StartFEN, false, &si, th);
-  std::cout << "hirate eval = " << Eval::evaluate(pos);
-
-	//Eval::print_eval_stat(pos);
-
-	// It's better to parallelize here, but it's a bit troublesome because the search before slave has not finished.
-	// I created a mechanism to call task, so I will use it.
-
-	// The number of tasks to do.
-	atomic<int> task_count;
-	task_count = (int)sr.sfen_for_mse.size();
-	task_dispatcher.task_reserve(task_count);
-
-	// Create a task to search for the situation and give it to each thread.
-	for (const auto& ps : sr.sfen_for_mse)
-	{
-		// Assign work to each thread using TaskDispatcher.
-		// A task definition for that.
-		// It is not possible to capture pos used in ↑, so specify the variables you want to capture one by one.
-		auto task = [&ps,&test_sum_cross_entropy_eval,&test_sum_cross_entropy_win,&test_sum_cross_entropy,&test_sum_entropy_eval,&test_sum_entropy_win,&test_sum_entropy, &sum_norm,&task_count ,&move_accord_count](size_t thread_id)
-		{
-			// Does C++ properly capture a new ps instance for each loop?.
-			auto th = Threads[thread_id];
-			auto& pos = th->rootPos;
-			StateInfo si;
-			if (pos.set_from_packed_sfen(ps.sfen ,&si, th) != 0)
-			{
-				// Unfortunately, as an sfen for rmse calculation, an invalid sfen was drawn.
-				cout << "Error! : illegal packed sfen " << pos.fen() << endl;
-			}
-
-			// Evaluation value for shallow search
-			// The value of evaluate() may be used, but when calculating loss, learn_cross_entropy and
-			// Use qsearch() because it is difficult to compare the values.
-			// EvalHash has been disabled in advance. (If not, the same value will be returned every time)
-			auto r = qsearch(pos);
-
-			auto shallow_value = r.first;
-			{
-				const auto rootColor = pos.side_to_move();
-				const auto pv = r.second;
-				std::vector<StateInfo,AlignedAllocator<StateInfo>> states(pv.size());
-				for (size_t i = 0; i < pv.size(); ++i)
-				{
-					pos.do_move(pv[i], states[i]);
-					Eval::NNUE::update_eval(pos);
-				}
-				shallow_value = (rootColor == pos.side_to_move()) ? Eval::evaluate(pos) : -Eval::evaluate(pos);
-				for (auto it = pv.rbegin(); it != pv.rend(); ++it)
-					pos.undo_move(*it);
-			}
-
-			// Evaluation value of deep search
-			auto deep_value = (Value)ps.score;
-
-			// Note) This code does not consider when eval_limit is specified in the learn command.
-
-			// --- error calculation
-
-#if !defined(LOSS_FUNCTION_IS_ELMO_METHOD)
-			auto grad = calc_grad(deep_value, shallow_value, ps);
-
-			// something like rmse
-			sum_error += grad*grad;
-			// Add the absolute value of the gradient
-			sum_error2 += abs(grad);
-			// Add the absolute value of the difference between the evaluation values
-			sum_error3 += abs(shallow_value - deep_value);
-#endif
-
-			// --- calculation of cross entropy
-
-			// For the time being, regarding the win rate and loss terms only in the elmo method
-			// Calculate and display the cross entropy.
-
-#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
-			double test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy;
-			double test_entropy_eval, test_entropy_win, test_entropy;
-			calc_cross_entropy(deep_value, shallow_value, ps, test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy, test_entropy_eval, test_entropy_win, test_entropy);
-			// The total cross entropy need not be abs() by definition.
-			test_sum_cross_entropy_eval += test_cross_entropy_eval;
-			test_sum_cross_entropy_win += test_cross_entropy_win;
-			test_sum_cross_entropy += test_cross_entropy;
-			test_sum_entropy_eval += test_entropy_eval;
-			test_sum_entropy_win += test_entropy_win;
-			test_sum_entropy += test_entropy;
-			sum_norm += (double)abs(shallow_value);
-#endif
-
-			// Determine if the teacher's move and the score of the shallow search match
-			{
-				auto r = search(pos,1);
-				if ((uint16_t)r.second[0] == ps.move)
-					move_accord_count.fetch_add(1, std::memory_order_relaxed);
-			}
-
-			// Reduced one task because I did it
-			--task_count;
-		};
-
-		// Throw the defined task to slave.
-		task_dispatcher.push_task_async(task);
-	}
-
-	// join yourself as a slave
-	task_dispatcher.on_idle(thread_id);
-
-	// wait for all tasks to complete
-	while (task_count)
-		sleep(1);
-
-#if !defined(LOSS_FUNCTION_IS_ELMO_METHOD)
-	// rmse = root mean square error: mean square error
-	// mae = mean absolute error: mean absolute error
-	auto dsig_rmse = std::sqrt(sum_error / (sfen_for_mse.size() + epsilon));
-	auto dsig_mae = sum_error2 / (sfen_for_mse.size() + epsilon);
-	auto eval_mae = sum_error3 / (sfen_for_mse.size() + epsilon);
-	cout << " , dsig rmse = " << dsig_rmse << " , dsig mae = " << dsig_mae
-		<< " , eval mae = " << eval_mae;
-#endif
-
-#if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
-#if defined(EVAL_NNUE)
-	latest_loss_sum += test_sum_cross_entropy - test_sum_entropy;
-	latest_loss_count += sr.sfen_for_mse.size();
-#endif
-
-// learn_cross_entropy may be called train cross entropy in the world of machine learning,
-// When omitting the acronym, it is nice to be able to distinguish it from test cross entropy(tce) by writing it as lce.
-
-	if (sr.sfen_for_mse.size() && done)
-	{
-		cout
-			<< " , test_cross_entropy_eval = "  << test_sum_cross_entropy_eval / sr.sfen_for_mse.size()
-			<< " , test_cross_entropy_win = "   << test_sum_cross_entropy_win / sr.sfen_for_mse.size()
-			<< " , test_entropy_eval = "        << test_sum_entropy_eval / sr.sfen_for_mse.size()
-			<< " , test_entropy_win = "         << test_sum_entropy_win / sr.sfen_for_mse.size()
-			<< " , test_cross_entropy = "       << test_sum_cross_entropy / sr.sfen_for_mse.size()
-			<< " , test_entropy = "             << test_sum_entropy / sr.sfen_for_mse.size()
-			<< " , norm = "						<< sum_norm
-			<< " , move accuracy = "			<< (move_accord_count * 100.0 / sr.sfen_for_mse.size()) << "%";
-		if (done != static_cast<uint64_t>(-1))
-		{
-			cout
-				<< " , learn_cross_entropy_eval = " << learn_sum_cross_entropy_eval / done
-				<< " , learn_cross_entropy_win = "  << learn_sum_cross_entropy_win / done
-				<< " , learn_entropy_eval = "       << learn_sum_entropy_eval / done
-				<< " , learn_entropy_win = "        << learn_sum_entropy_win / done
-				<< " , learn_cross_entropy = "      << learn_sum_cross_entropy / done
-				<< " , learn_entropy = "            << learn_sum_entropy / done;
-		}
-		cout << endl;
-	}
-	else {
-		cout << "Error! : sr.sfen_for_mse.size() = " << sr.sfen_for_mse.size() << " ,  done = " << done << endl;
-	}
-
-	// Clear 0 for next time.
-	learn_sum_cross_entropy_eval = 0.0;
-	learn_sum_cross_entropy_win = 0.0;
-	learn_sum_cross_entropy = 0.0;
-	learn_sum_entropy_eval = 0.0;
-	learn_sum_entropy_win = 0.0;
-	learn_sum_entropy = 0.0;
-#else
-	<< endl;
-#endif
-}
-
-
-void LearnerThink::thread_worker(size_t thread_id)
-{
-#if defined(_OPENMP)
-	omp_set_num_threads((int)Options["Threads"]);
-#endif
-
-	auto th = Threads[thread_id];
-	auto& pos = th->rootPos;
-
-	while (true)
-	{
-	// display mse (this is sometimes done only for thread 0)
-	// Immediately after being read from the file...
-
-#if defined(EVAL_NNUE)
-		// Lock the evaluation function so that it is not used during updating.
-		shared_lock<shared_timed_mutex> read_lock(nn_mutex, defer_lock);
-		if (sr.next_update_weights <= sr.total_done ||
-		    (thread_id != 0 && !read_lock.try_lock()))
-#else
-		if (sr.next_update_weights <= sr.total_done)
-#endif
-		{
-			if (thread_id != 0)
-			{
-				// Wait except thread_id == 0.
-
-				if (stop_flag)
-					break;
-
-				// I want to parallelize rmse calculation etc., so if task() is loaded, process it.
-				task_dispatcher.on_idle(thread_id);
-				continue;
-			}
-			else
-			{
-				// Only thread_id == 0 performs the following update process.
-
-				// The weight array is not updated for the first time.
-				if (sr.next_update_weights == 0)
-				{
-					sr.next_update_weights += mini_batch_size;
-					continue;
-				}
-
-#if !defined(EVAL_NNUE)
-				// Output the current time. Output every time.
-				std::cout << sr.total_done << " sfens , at " << now_string() << std::endl;
-
-				// Reflect the gradient in the weight array at this timing. The calculation of the gradient is just right for each 1M phase in terms of mini-batch.
-				Eval::update_weights(epoch , freeze);
-
-				// Display epoch and current eta for debugging.
-				std::cout << "epoch = " << epoch << " , eta = " << Eval::get_eta() << std::endl;
-#else
-				{
-					// update parameters
-
-					// Lock the evaluation function so that it is not used during updating.
-					lock_guard<shared_timed_mutex> write_lock(nn_mutex);
-					Eval::NNUE::UpdateParameters(epoch);
-				}
-#endif
-				++epoch;
-
-				// Save once every 1 billion phases.
-
-				// However, the elapsed time during update_weights() and calc_rmse() is ignored.
-				if (++sr.save_count * mini_batch_size >= eval_save_interval)
-				{
-					sr.save_count = 0;
-
-					// During this time, as the gradient calculation proceeds, the value becomes too large and I feel annoyed, so stop other threads.
-					const bool converged = save();
-					if (converged)
-					{
-						stop_flag = true;
-						sr.stop_flag = true;
-						break;
-					}
-				}
-
-				// Calculate rmse. This is done for samples of 10,000 phases.
-				// If you do with 40 cores, update_weights every 1 million phases
-				// I don't think it's so good to be tiring.
-				static uint64_t loss_output_count = 0;
-				if (++loss_output_count * mini_batch_size >= loss_output_interval)
-				{
-					loss_output_count = 0;
-
-					// Number of cases processed this time
-					uint64_t done = sr.total_done - sr.last_done;
-
-					// loss calculation
-					calc_loss(thread_id , done);
-
-#if defined(EVAL_NNUE)
-					Eval::NNUE::CheckHealth();
-#endif
-
-					// Make a note of how far you have totaled.
-					sr.last_done = sr.total_done;
-				}
-
-				// Next time, I want you to do this series of processing again when you process only mini_batch_size.
-				sr.next_update_weights += mini_batch_size;
-
-				// Since I was waiting for the update of this sr.next_update_weights except the main thread,
-				// Once this value is updated, it will start moving again.
-			}
-		}
-
-		PackedSfenValue ps;
-	RetryRead:;
-		if (!sr.read_to_thread_buffer(thread_id, ps))
-		{
-			// ran out of thread pool for my thread.
-			// Because there are almost no phases left,
-			// Terminate all other threads.
-
-			stop_flag = true;
-			break;
-		}
-
-		// The evaluation value exceeds the learning target value.
-		// Ignore this aspect information.
-		if (eval_limit <abs(ps.score))
-			goto RetryRead;
-
-
-		if (!use_draw_games_in_training && ps.game_result == 0)
-			goto RetryRead;
-
-
-		// Skip over the opening phase
-		if (ps.gamePly < prng.rand(reduction_gameply))
-			goto RetryRead;
-
-#if 0
-		auto sfen = pos.sfen_unpack(ps.data);
-		pos.set(sfen);
-#endif
-		// ↑ Since it is slow when passing through sfen, I made a dedicated function.
-		StateInfo si;
-		const bool mirror = prng.rand(100) < mirror_percentage;
-		if (pos.set_from_packed_sfen(ps.sfen,&si,th,mirror) != 0)
-		{
-			// I got a strange sfen. Should be debugged!
-			// Since it is an illegal sfen, it may not be displayed with pos.sfen(), but it is better than not.
-			cout << "Error! : illigal packed sfen = " << pos.fen() << endl;
-			goto RetryRead;
-		}
-#if !defined(EVAL_NNUE)
-		{
-			auto key = pos.key();
-			// Exclude the phase used for rmse calculation.
-			if (sr.is_for_rmse(key) && skip_duplicated_positions_in_training)
-				goto RetryRead;
-
-			// Exclude the most recently used aspect.
-			auto hash_index = size_t(key & (sr.READ_SFEN_HASH_SIZE - 1));
-			auto key2 = sr.hash[hash_index];
-			if (key == key2 && skip_duplicated_positions_in_training)
-				goto RetryRead;
-			sr.hash[hash_index] = key; // Replace with the current key.
-		}
-#endif
-
-		// There is a possibility that all the pieces are blocked and stuck.
-		// Also, the declaration win phase is excluded from learning because you cannot go to leaf with PV moves.
-		// (shouldn't write out such teacher aspect itself, but may have written it out with an old generation routine)
-	// Skip the position if there are no legal moves (=checkmated or stalemate).
-		if (MoveList<LEGAL>(pos).size() == 0)
-			goto RetryRead;
-
-		// I can read it, so try displaying it.
-		//		cout << pos << value << endl;
-
-		// Evaluation value of shallow search (qsearch)
-		auto r = qsearch(pos);
-		auto pv = r.second;
-
-		// Evaluation value of deep search
-		auto deep_value = (Value)ps.score;
-
-		// I feel that the mini batch has a better gradient.
-		// Go to the leaf node as it is, add only to the gradient array, and later try AdaGrad at the time of rmse aggregation.
-
-		auto rootColor = pos.side_to_move();
-
-		// If the initial PV is different, it is better not to use it for learning.
-		// If it is the result of searching a completely different place, it may become noise.
-		// It may be better not to study where the difference in evaluation values ​​is too large.
-
-#if 0
-		// If you do this, about 13% of the phases will be excluded from the learning target. Good and bad are subtle.
-		if (pv.size() >= 1 && (uint16_t)pv[0] != ps.move)
-		{
-			// dbg_hit_on(false);
-			continue;
-		}
-#endif
-
-#if 0
-		// It may be better not to study where the difference in evaluation values ​​is too large.
-		// → It's okay because it passes the win rate function... About 30% of the phases are out of the scope of learning...
-		if (abs((int16_t)r.first - ps.score) >= Eval::PawnValue * 4)
-		{
-//			dbg_hit_on(false);
-			continue;
-		}
-		//		dbg_hit_on(true);
-#endif
-
-		int ply = 0;
-
-		// A helper function that adds the gradient to the current phase.
-		auto pos_add_grad = [&]() {
-			// Use the value of evaluate in leaf as shallow_value.
-			// Using the return value of qsearch() as shallow_value,
-			// If PV is interrupted in the middle, the phase where evaluate() is called to calculate the gradient, and
-			// I don't think this is a very desirable property, as the aspect that gives that gradient will be different.
-			// I have turned off the substitution table, but since the pv array has not been updated due to one stumbling block etc...
-
-			Value shallow_value = (rootColor == pos.side_to_move()) ? Eval::evaluate(pos) : -Eval::evaluate(pos);
-
-#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
-			// Calculate loss for training data
-			double learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy;
-			double learn_entropy_eval, learn_entropy_win, learn_entropy;
-			calc_cross_entropy(deep_value, shallow_value, ps, learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy, learn_entropy_eval, learn_entropy_win, learn_entropy);
-			learn_sum_cross_entropy_eval += learn_cross_entropy_eval;
-			learn_sum_cross_entropy_win += learn_cross_entropy_win;
-			learn_sum_cross_entropy += learn_cross_entropy;
-			learn_sum_entropy_eval += learn_entropy_eval;
-			learn_sum_entropy_win += learn_entropy_win;
-			learn_sum_entropy += learn_entropy;
-#endif
-
-#if !defined(EVAL_NNUE)
-			// Slope
-			double dj_dw = calc_grad(deep_value, shallow_value, ps);
-
-			// Add jd_dw as the gradient (∂J/∂Wj) for the feature vector currently appearing in the leaf node.
-
-			// If it is not PV termination, apply a discount rate.
-			if (discount_rate != 0 && ply != (int)pv.size())
-				dj_dw *= discount_rate;
-
-			// Since we have reached leaf, add the gradient to the features that appear in this phase.
-			// Update based on gradient later.
-			Eval::add_grad(pos, rootColor, dj_dw, freeze);
-#else
-			const double example_weight =
-			    (discount_rate != 0 && ply != (int)pv.size()) ? discount_rate : 1.0;
-			Eval::NNUE::AddExample(pos, rootColor, ps, example_weight);
-#endif
-
-			// Since the processing is completed, the counter of the processed number is incremented
-			sr.total_done++;
-		};
-
-		StateInfo state[MAX_PLY]; // PV of qsearch cannot be so long.
-		bool illegal_move = false;
-		for (auto m : pv)
-		{
-			// I shouldn't be an illegal player.
-			// An illegal move sometimes comes here...
-			if (!pos.pseudo_legal(m) || !pos.legal(m))
-			{
-				//cout << pos << m << endl;
-				//assert(false);
-				illegal_move = true;
-				break;
-			}
-
-			// Processing when adding the gradient to the node on each PV.
-			//If discount_rate is 0, this process is not performed.
-			if (discount_rate != 0)
-				pos_add_grad();
-
-			pos.do_move(m, state[ply++]);
-
-			// Since the value of evaluate in leaf is used, the difference is updated.
-			Eval::NNUE::update_eval(pos);
-		}
-
-		if (illegal_move) {
-			sync_cout << "An illical move was detected... Excluded the position from the learning data..." << sync_endl;
-			continue;
-		}
-
-		// Since we have reached the end phase of PV, add the slope here.
-		pos_add_grad();
-
-		// rewind the phase
-		for (auto it = pv.rbegin(); it != pv.rend(); ++it)
-			pos.undo_move(*it);
-
-#if 0
-		// When adding the gradient to the root phase
-		shallow_value = (rootColor == pos.side_to_move()) ? Eval::evaluate(pos) : -Eval::evaluate(pos);
-		dj_dw = calc_grad(deep_value, shallow_value, ps);
-		Eval::add_grad(pos, rootColor, dj_dw , without_kpp);
-#endif
-
-	}
-
-}
-
-// Write evaluation function file.
-bool LearnerThink::save(bool is_final)
-{
-	// Each time you save, change the extension part of the file name like "0","1","2",..
-	// (Because I want to compare the winning rate for each evaluation function parameter later)
-
-	if (save_only_once)
-	{
-		// When EVAL_SAVE_ONLY_ONCE is defined,
-		// Do not dig a subfolder because I want to save it only once.
-		Eval::save_eval("");
-	}
-	else if (is_final) {
-		Eval::save_eval("final");
-		return true;
-	}
-	else {
-		static int dir_number = 0;
-		const std::string dir_name = std::to_string(dir_number++);
-		Eval::save_eval(dir_name);
-#if defined(EVAL_NNUE)
-		if (newbob_decay != 1.0 && latest_loss_count > 0) {
-			static int trials = newbob_num_trials;
-			const double latest_loss = latest_loss_sum / latest_loss_count;
-			latest_loss_sum = 0.0;
-			latest_loss_count = 0;
-			cout << "loss: " << latest_loss;
-			if (latest_loss < best_loss) {
-				cout << " < best (" << best_loss << "), accepted" << endl;
-				best_loss = latest_loss;
-				best_nn_directory = Path::Combine((std::string)Options["EvalSaveDir"], dir_name);
-				trials = newbob_num_trials;
-			} else {
-				cout << " >= best (" << best_loss << "), rejected" << endl;
-				if (best_nn_directory.empty()) {
-					cout << "WARNING: no improvement from initial model" << endl;
-				} else {
-					cout << "restoring parameters from " << best_nn_directory << endl;
-					Eval::NNUE::RestoreParameters(best_nn_directory);
-				}
-				if (--trials > 0 && !is_final) {
-					cout << "reducing learning rate scale from " << newbob_scale
-					     << " to " << (newbob_scale * newbob_decay)
-					     << " (" << trials << " more trials)" << endl;
-					newbob_scale *= newbob_decay;
-					Eval::NNUE::SetGlobalLearningRateScale(newbob_scale);
-				}
-			}
-			if (trials == 0) {
-				cout << "converged" << endl;
-				return true;
-			}
-		}
-#endif
-	}
-	return false;
-}
-
-// Shuffle_files(), shuffle_files_quick() subcontracting, writing part.
-// output_file_name: Name of the file to write
-// prng: random number
-// afs: fstream of each teacher phase file
-// a_count: The number of teacher positions inherent in each file.
-void shuffle_write(const string& output_file_name , PRNG& prng , vector<fstream>& afs , vector<uint64_t>& a_count)
-{
-	uint64_t total_sfen_count = 0;
-	for (auto c : a_count)
-		total_sfen_count += c;
-
-	// number of exported phases
-	uint64_t write_sfen_count = 0;
-
-	// Output the progress on the screen for each phase.
-	const uint64_t buffer_size = 10000000;
-
-	auto print_status = [&]()
-	{
-		// Output progress every 10M phase or when all writing is completed
-		if (((write_sfen_count % buffer_size) == 0) ||
-			(write_sfen_count == total_sfen_count))
-			cout << write_sfen_count << " / " << total_sfen_count << endl;
-	};
-
-
-	cout << endl <<  "write : " << output_file_name << endl;
-
-	fstream fs(output_file_name, ios::out | ios::binary);
-
-	// total teacher positions
-	uint64_t sum = 0;
-	for (auto c : a_count)
-		sum += c;
-
-	while (sum != 0)
-	{
-		auto r = prng.rand(sum);
-
-		// Aspects stored in fs[0] file ... Aspects stored in fs[1] file ...
-		//Think of it as a series like, and determine in which file r is pointing.
-		// The contents of the file are shuffled, so you can take the next element from that file.
-		// Each file has a_count[x] phases, so this process can be written as follows.
-
-		uint64_t n = 0;
-		while (a_count[n] <= r)
-			r -= a_count[n++];
-
-		// This confirms n. Before you forget it, reduce the remaining number.
-
-		--a_count[n];
-		--sum;
-
-		PackedSfenValue psv;
-		// It's better to read and write all at once until the performance is not so good...
-		if (afs[n].read((char*)&psv, sizeof(PackedSfenValue)))
-		{
-			fs.write((char*)&psv, sizeof(PackedSfenValue));
-			++write_sfen_count;
-			print_status();
-		}
-	}
-	print_status();
-	fs.close();
-	cout << "done!" << endl;
-}
-
-// Subcontracting the teacher shuffle "learn shuffle" command.
-// output_file_name: name of the output file where the shuffled teacher positions will be written
-void shuffle_files(const vector<string>& filenames , const string& output_file_name , uint64_t buffer_size )
-{
-	// The destination folder is
-	// tmp/ for temporary writing
-
-	// Temporary file is written to tmp/ folder for each buffer_size phase.
-	// For example, if buffer_size = 20M, you need a buffer of 20M*40bytes = 800MB.
-	// In a PC with a small memory, it would be better to reduce this.
-	// However, if the number of files increases too much, it will not be possible to open at the same time due to OS restrictions.
-	// There should have been a limit of 512 per process on Windows, so you can open here as 500,
-	// The current setting is 500 files x 20M = 10G = 10 billion phases.
-
-	PSVector buf;
-	buf.resize(buffer_size);
-	// ↑ buffer, a marker that indicates how much you have used
-	uint64_t buf_write_marker = 0;
-
-	// File name to write (incremental counter because it is a serial number)
-	uint64_t write_file_count = 0;
-
-	// random number to shuffle
-	// Do not use std::random_device().  Because it always the same integers on MinGW.
-	PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
-
-	// generate the name of the temporary file
-	auto make_filename = [](uint64_t i)
-	{
-		return "tmp/" + to_string(i) + ".bin";
-	};
-
-	// Exported files in tmp/ folder, number of teacher positions stored in each
-	vector<uint64_t> a_count;
-
-	auto write_buffer = [&](uint64_t size)
-	{
-		// shuffle from buf[0] to buf[size-1]
-		for (uint64_t i = 0; i < size; ++i)
-			swap(buf[i], buf[(uint64_t)(prng.rand(size - i) + i)]);
-
-		// write to a file
-		fstream fs;
-		fs.open(make_filename(write_file_count++), ios::out | ios::binary);
-		fs.write((char*)&buf[0], size * sizeof(PackedSfenValue));
-		fs.close();
-		a_count.push_back(size);
-
-		buf_write_marker = 0;
-		cout << ".";
-	};
-
-	Dependency::mkdir("tmp");
-
-	// Shuffle and export as a 10M phase shredded file.
-	for (auto filename : filenames)
-	{
-		fstream fs(filename, ios::in | ios::binary);
-		cout << endl << "open file = " << filename;
-		while (fs.read((char*)&buf[buf_write_marker], sizeof(PackedSfenValue)))
-			if (++buf_write_marker == buffer_size)
-				write_buffer(buffer_size);
-
-		// Read in units of sizeof(PackedSfenValue),
-		// Ignore the last remaining fraction. (Fails in fs.read, so exit while)
-		// (The remaining fraction seems to be half-finished data that was created because it was stopped halfway during teacher generation.)
-
-	}
-
-	if (buf_write_marker != 0)
-		write_buffer(buf_write_marker);
-
-	// Only shuffled files have been written write_file_count.
-	// As a second pass, if you open all of them at the same time, select one at random and load one phase at a time
-	// Now you have shuffled.
-
-	// Original file for shirt full + tmp file + file to write requires 3 times the storage capacity of the original file.
-	// 1 billion SSD is not enough for shuffling because it is 400GB for 10 billion phases.
-	// If you want to delete (or delete by hand) the original file at this point after writing to tmp,
-	// The storage capacity is about twice that of the original file.
-	// So, maybe we should have an option to delete the original file.
-
-	// Files are opened at the same time. It is highly possible that this will exceed FOPEN_MAX.
-	// In that case, rather than adjusting buffer_size to reduce the number of files.
-
-	vector<fstream> afs;
-	for (uint64_t i = 0; i < write_file_count; ++i)
-		afs.emplace_back(fstream(make_filename(i),ios::in | ios::binary));
-
-	// Throw to the subcontract function and end.
-	shuffle_write(output_file_name, prng, afs, a_count);
-}
-
-// Implements the "learn shuffleq" command: a shuffle done in a single pass.
-// filenames        : input files holding packed teacher positions
-// output_file_name : name of the output file where the shuffled teacher positions will be written
-void shuffle_files_quick(const vector<string>& filenames, const string& output_file_name)
-{
-	// random number to shuffle
-	// Seeded from the clock instead of std::random_device(), which always
-	// produces the same integers on MinGW.
-	PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
-
-	const size_t file_count = filenames.size();
-
-	// One stream per input file and the number of teacher positions it holds.
-	vector<fstream> afs(file_count);
-	vector<uint64_t> a_count(file_count);
-
-	for (size_t idx = 0; idx < file_count; ++idx)
-	{
-		auto filename = filenames[idx];
-		auto& stream = afs[idx];
-		stream.open(filename, ios::in | ios::binary);
-
-		// Measure the file by seeking to its end and back to its beginning.
-		stream.seekg(0, fstream::end);
-		const uint64_t end_pos = (uint64_t)stream.tellg();
-		stream.clear(); // Otherwise, the next seek may fail.
-		stream.seekg(0, fstream::beg);
-		const uint64_t start_pos = (uint64_t)stream.tellg();
-
-		// Only whole records are counted; a trailing fraction is ignored.
-		a_count[idx] = (end_pos - start_pos) / sizeof(PackedSfenValue);
-
-		// Output the number of sfen stored in each file.
-		cout << filename << " = " << a_count[idx] << " sfens." << endl;
-	}
-
-	// The streams are left open and rewound; shuffle_write() does the
-	// actual random selection and writing.
-	shuffle_write(output_file_name, prng, afs, a_count);
-}
-
-// Implements the "learn shufflem" command.
-// Reads every input file into one in-memory buffer, shuffles all records and
-// writes the result to output_file_name. (Requires memory for all input files.)
-void shuffle_files_on_memory(const vector<string>& filenames,const string output_file_name)
-{
-	PSVector buf;
-
-	for (const auto& filename : filenames)
-	{
-		std::cout << "read : " << filename << std::endl;
-		read_file_to_memory(filename, [&buf](uint64_t size) {
-			assert((size % sizeof(PackedSfenValue)) == 0);
-			// Expand the buffer and read after the last end.
-			uint64_t last = buf.size();
-			buf.resize(last + size / sizeof(PackedSfenValue));
-			// data() + last is valid even when nothing was appended (empty file);
-			// &buf[last] would index one past the end in that case.
-			return (void*)(buf.data() + last);
-		});
-	}
-
-	// shuffle from buf[0] to buf[size-1]
-	// Do not use std::random_device(), because it always yields the same integers on MinGW.
-	PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
-	uint64_t size = (uint64_t)buf.size();
-	std::cout << "shuffle buf.size() = " << size << std::endl;
-	// Swap each slot with a randomly chosen slot at or after it.
-	for (uint64_t i = 0; i < size; ++i)
-		swap(buf[i], buf[(uint64_t)(prng.rand(size - i) + i)]);
-
-	std::cout << "write : " << output_file_name << endl;
-
-	// If the file to be written exceeds 2GB, it cannot be written in one shot with fstream::write, so use wrapper.
-	// buf.data() instead of &buf[0]: indexing an empty buffer is undefined.
-	write_memory_to_file(output_file_name, (void*)buf.data(), (uint64_t)sizeof(PackedSfenValue)*(uint64_t)buf.size());
-
-	std::cout << "..shuffle_on_memory done." << std::endl;
-}
-
-// Returns true when the "Piece placement field" (the first whitespace-delimited
-// token) of input_fen is identical to that of pos.fen(). Nothing else is compared.
-bool fen_is_ok(Position& pos, std::string input_fen) {
-	// example : "2r4r/4kpp1/nb1np3/p2p3p/B2P1BP1/PP6/4NPKP/2R1R3 w - h6 0 24"
-	//       --> "2r4r/4kpp1/nb1np3/p2p3p/B2P1BP1/PP6/4NPKP/2R1R3"
-	const auto first_field = [](const std::string& fen) {
-		std::istringstream reader(fen);
-		std::string field;
-		reader >> field;
-		return field;
-	};
-
-	const std::string placement_pos = first_field(pos.fen());
-	const std::string placement_input = first_field(input_fen);
-	return placement_input == placement_pos;
-}
-
-// Converts plain-text teacher data into packed binary records and appends them to output_file_name.
-//
-// The input is read line by line; the first token of a line selects the field
-// ("fen", "move", "score", "ply", "result") and a line "e" ends the record, which
-// is then written unless one of the filters below rejected it.
-//   ply_minimum, ply_maximum : records whose ply lies outside this range are filtered
-//   interpolate_eval         : if non-zero, the score becomes min(3000, interpolate_eval * ply)
-//                              and is multiplied by the game result when "result" is read
-//   check_invalid_fen        : filter records for which fen_is_ok() fails
-//   check_illegal_move       : filter records whose move parses to MOVE_NONE
-void convert_bin(const vector<string>& filenames, const string& output_file_name, const int ply_minimum, const int ply_maximum, const int interpolate_eval, const bool check_invalid_fen, const bool check_illegal_move)
-{
-	std::cout << "check_invalid_fen=" << check_invalid_fen << std::endl;
-	std::cout << "check_illegal_move=" << check_illegal_move << std::endl;
-
-	std::fstream fs;
-	uint64_t data_size=0;
-	uint64_t filtered_size = 0;
-	uint64_t filtered_size_fen = 0;
-	uint64_t filtered_size_move = 0;
-	uint64_t filtered_size_ply = 0;
-	auto th = Threads.main();
-	auto &tpos = th->rootPos;
-	// convert plain text to packed sfenvalue
-	fs.open(output_file_name, ios::app | ios::binary);
-	// Owns the StateInfo handed to tpos.set(); replaced on every "fen" line.
-	StateListPtr states;
-	for (const auto& filename : filenames) {
-		std::cout << "convert " << filename << " ... ";
-		std::string line;
-		ifstream ifs;
-		ifs.open(filename);
-		PackedSfenValue p;
-		// Start from an all-zero record so that fields the input never sets (and any
-		// padding bytes) are written as zeros rather than as indeterminate data.
-		// convert_bin_from_pgn_extract() clears its records the same way.
-		memset((char*)&p, 0, sizeof(PackedSfenValue));
-		// The statistics are per input file.
-		data_size = 0;
-		filtered_size = 0;
-		filtered_size_fen = 0;
-		filtered_size_move = 0;
-		filtered_size_ply = 0;
-		p.gamePly = 1; // Not included in apery format. Should be initialized
-		// Set while reading a record, evaluated and reset at its "e" line.
-		bool ignore_flag_fen = false;
-		bool ignore_flag_move = false;
-		bool ignore_flag_ply = false;
-		while (std::getline(ifs, line)) {
-			std::stringstream ss(line);
-			std::string token;
-			std::string value;
-			ss >> token;
-			if (token == "fen") {
-				states = StateListPtr(new std::deque<StateInfo>(1)); // Drop old and create a new one
-				// Everything after "fen " is the FEN string.
-				std::string input_fen = line.substr(4);
-				tpos.set(input_fen, false, &states->back(), Threads.main());
-				if (check_invalid_fen && !fen_is_ok(tpos, input_fen)) {
-					ignore_flag_fen = true;
-					filtered_size_fen++;
-				}
-				else {
-					tpos.sfen_pack(p.sfen);
-				}
-			}
-			else if (token == "move") {
-				ss >> value;
-				Move move = UCI::to_move(tpos, value);
-				if (check_illegal_move && move == MOVE_NONE) {
-					ignore_flag_move = true;
-					filtered_size_move++;
-				}
-				else {
-					p.move = move;
-				}
-			}
-			else if (token == "score") {
-				double score;
-				ss >> score;
-				// Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
-				// Normalize to [0.0, 1.0].
-				score = (score - src_score_min_value) / (src_score_max_value - src_score_min_value);
-				// Scale to [dest_score_min_value, dest_score_max_value].
-				score = score * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
-				p.score = Math::clamp((int32_t)std::round(score) , -(int32_t)VALUE_MATE , (int32_t)VALUE_MATE);
-			}
-			else if (token == "ply") {
-				int temp;
-				ss >> temp;
-				if(temp < ply_minimum || temp > ply_maximum){
-					ignore_flag_ply = true;
-					filtered_size_ply++;
-				}
-				p.gamePly = uint16_t(temp); // No cast here?
-				if (interpolate_eval != 0){
-					// Overrides whatever "score" set: the score grows with the ply, capped at 3000.
-					p.score = min(3000, interpolate_eval * temp);
-				}
-			}
-			else if (token == "result") {
-				int temp;
-				ss >> temp;
-				p.game_result = int8_t(temp); // Do you need a cast here?
-				if (interpolate_eval){
-					// Gives the interpolated score the sign of the game result (0 for a draw).
-					p.score = p.score * p.game_result;
-				}
-			}
-			else if (token == "e") {
-				if (!(ignore_flag_fen || ignore_flag_move || ignore_flag_ply)) {
-					fs.write((char*)&p, sizeof(PackedSfenValue));
-					data_size+=1;
-					// debug
-					// std::cout<<tpos<<std::endl;
-					// std::cout<<p.score<<","<<int(p.gamePly)<<","<<int(p.game_result)<<std::endl;
-				}
-				else {
-					filtered_size++;
-				}
-				ignore_flag_fen = false;
-				ignore_flag_move = false;
-				ignore_flag_ply = false;
-			}
-		}
-		std::cout << "done " << data_size << " parsed " << filtered_size << " is filtered"
-				  << " (invalid fen:" << filtered_size_fen << ", illegal move:" << filtered_size_move << ", invalid ply:" << filtered_size_ply << ")" << std::endl;
-		ifs.close();
-	}
-	std::cout << "all done" << std::endl;
-	fs.close();
-}
-
-// Removes leading whitespace from s in place.
-static inline void ltrim(std::string &s) {
-	// The character is taken as unsigned char: handing std::isspace a negative
-	// value (a non-ASCII byte where char is signed) is undefined behaviour.
-	s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) {
-		return !std::isspace(ch);
-	}));
-}
-
-// Removes trailing whitespace from s in place.
-static inline void rtrim(std::string &s) {
-	// The character is taken as unsigned char: handing std::isspace a negative
-	// value (a non-ASCII byte where char is signed) is undefined behaviour.
-	// base() turns the reverse iterator of the last non-space character into
-	// the forward iterator just past it, i.e. the start of the trailing run.
-	s.erase(std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) {
-		return !std::isspace(ch);
-	}).base(), s.end());
-}
-
-// Strips whitespace from both ends of s in place.
-static inline void trim(std::string &s) {
-	rtrim(s);
-	ltrim(s);
-}
-
-// Maps the value of a PGN Result tag (surrounding double quotes included) to a number:
-//   "1-0" -> 1 (white win), "0-1" -> -1 (black win), anything else -> 0 (draw).
-int parse_game_result_from_pgn_extract(std::string result) {
-	if (result == "\"1-0\"")
-		return 1;   // White Win
-
-	if (result == "\"0-1\"")
-		return -1;  // Black Win
-
-	return 0;       // Draw
-}
-
-// Converts an eval string taken from a PGN comment into a Value.
-// 0.25 -->  0.25 * PawnValueEg
-// #-4  --> -mate_in(4)
-// #3   -->  mate_in(3)
-// -M4  --> -mate_in(4)
-// +M3  -->  mate_in(3)
-// success is set to false (and VALUE_ZERO returned) when eval is empty, is not
-// entirely a number, or is a mate marker without a usable move count.
-Value parse_score_from_pgn_extract(std::string eval, bool& success) {
-	success = true;
-
-	// stoi() throws when the text after the mate marker holds no number (or one
-	// that does not fit an int); report that as a parse failure instead of
-	// letting the exception abort the whole conversion.
-	try {
-		if (eval.substr(0, 1) == "#") {
-			if (eval.substr(1, 1) == "-") {
-				return -mate_in(stoi(eval.substr(2, eval.length() - 2)));
-			}
-			else {
-				return mate_in(stoi(eval.substr(1, eval.length() - 1)));
-			}
-		}
-		else if (eval.substr(0, 2) == "-M") {
-			//std::cout << "eval=" << eval << std::endl;
-			return -mate_in(stoi(eval.substr(2, eval.length() - 2)));
-		}
-		else if (eval.substr(0, 2) == "+M") {
-			//std::cout << "eval=" << eval << std::endl;
-			return mate_in(stoi(eval.substr(2, eval.length() - 2)));
-		}
-	}
-	catch (const std::exception&) {
-		success = false;
-		return VALUE_ZERO;
-	}
-
-	char *endptr;
-	double value = strtod(eval.c_str(), &endptr);
-
-	// endptr == start: nothing was converted (e.g. an empty string), which must
-	// not be mistaken for a score of 0. *endptr != '\0': trailing junk.
-	if (endptr == eval.c_str() || *endptr != '\0') {
-		success = false;
-		return VALUE_ZERO;
-	}
-
-	// The number is in pawns; scale it to internal units.
-	return Value(value * static_cast<double>(PawnValueEg));
-}
-
-// for Debug
-//#define DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT
-
-// Cheap heuristic used to tell a FEN comment from other PGN comments:
-// the string must contain exactly 5 spaces and exactly 7 slashes.
-bool is_like_fen(std::string fen) {
-	int spaces = 0;
-	int slashes = 0;
-
-	for (const char c : fen) {
-		if (c == ' ')
-			++spaces;
-		else if (c == '/')
-			++slashes;
-	}
-
-	return spaces == 5 && slashes == 7;
-}
-
-// Converts PGN text in which every position is given as a "{ fen }" comment (optionally
-// followed, after the move, by an eval comment) into packed binary records written to
-// output_file_name.
-//   pgn_eval_side_to_move              : if false, the score of positions with black to move is negated
-//   convert_no_eval_fens_as_score_zero : if true, positions without a parsable eval are written
-//                                        with score 0 instead of being skipped
-void convert_bin_from_pgn_extract(const vector<string>& filenames, const string& output_file_name, const bool pgn_eval_side_to_move, const bool convert_no_eval_fens_as_score_zero)
-{
-	std::cout << "pgn_eval_side_to_move=" << pgn_eval_side_to_move << std::endl;
-	std::cout << "convert_no_eval_fens_as_score_zero=" << convert_no_eval_fens_as_score_zero << std::endl;
-
-	auto th = Threads.main();
-	auto &pos = th->rootPos;
-
-	std::fstream ofs;
-	ofs.open(output_file_name, ios::out | ios::binary);
-
-	// Compiled once up front; constructing a std::regex for every line and every
-	// position is needlessly expensive.
-	const std::regex pattern_result(R"(\[Result (.+?)\])");
-	const std::regex pattern_bracket(R"(\{(.+?)\})");
-	const std::regex pattern_move(R"(\}(.+?)\{)");
-	const std::regex pattern_eval1(R"(\[\%eval (.+?)\])");
-	const std::regex pattern_eval2(R"((.+?)\/)");
-
-	int game_count = 0;
-	int fen_count = 0;
-
-	for (const auto& filename : filenames) {
-		std::cout << now_string() << " convert " << filename << std::endl;
-		ifstream ifs;
-		ifs.open(filename);
-
-		// Result of the game whose tag section was read last; reset after its move text.
-		int game_result = 0;
-
-		std::string line;
-		while (std::getline(ifs, line)) {
-
-			if (line.empty()) {
-				continue;
-			}
-
-			else if (line.substr(0, 1) == "[") {
-				// Tag line. Only the Result tag is of interest.
-				std::smatch match;
-
-				// example: [Result "1-0"]
-				if (std::regex_search(line, match, pattern_result)) {
-					game_result = parse_game_result_from_pgn_extract(match.str(1));
-#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
-					std::cout << "game_result=" << game_result << std::endl;
-#endif
-					game_count++;
-					if (game_count % 10000 == 0) {
-						std::cout << now_string() << " game_count=" << game_count << ", fen_count=" << fen_count << std::endl;
-					}
-				}
-
-				continue;
-			}
-
-			else {
-				// Move text line: a sequence of "{ fen } move { eval }" groups.
-				int gamePly = 1;
-				// Search position inside the line; advanced past each consumed comment.
-				auto itr = line.cbegin();
-
-				while (true) {
-					gamePly++;
-
-					PackedSfenValue psv;
-					memset((char*)&psv, 0, sizeof(PackedSfenValue));
-
-					// pos is handed &si by pos.set() below and is still used afterwards
-					// (UCI::to_move, side_to_move), so si has to live for the whole
-					// iteration rather than only inside the block that calls pos.set().
-					StateInfo si;
-
-					// fen
-					{
-						bool fen_found = false;
-
-						// Skip comments until one looks like a FEN.
-						while (!fen_found) {
-							std::smatch match;
-							if (!std::regex_search(itr, line.cend(), match, pattern_bracket)) {
-								break;
-							}
-
-							// Leave itr on the closing '}' so that pattern_move can match from it.
-							itr += match.position(0) + match.length(0) - 1;
-							std::string str_fen = match.str(1);
-							trim(str_fen);
-
-							if (is_like_fen(str_fen)) {
-								fen_found = true;
-
-								pos.set(str_fen, false, &si, th);
-								pos.sfen_pack(psv.sfen);
-							}
-
-#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
-							std::cout << "str_fen=" << str_fen << std::endl;
-							std::cout << "fen_found=" << fen_found << std::endl;
-#endif
-						}
-
-						if (!fen_found) {
-							break;
-						}
-					}
-
-					// move
-					{
-						// The move is the text between this comment's '}' and the next '{'.
-						std::smatch match;
-						if (!std::regex_search(itr, line.cend(), match, pattern_move)) {
-							break;
-						}
-
-						// Leave itr on the opening '{' of the next comment.
-						itr += match.position(0) + match.length(0) - 1;
-						std::string str_move = match.str(1);
-						trim(str_move);
-#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
-						std::cout << "str_move=" << str_move << std::endl;
-#endif
-						psv.move = UCI::to_move(pos, str_move);
-					}
-
-					// eval
-					bool eval_found = false;
-					{
-						std::smatch match;
-						if (!std::regex_search(itr, line.cend(), match, pattern_bracket)) {
-							break;
-						}
-
-						std::string str_eval_clk = match.str(1);
-						trim(str_eval_clk);
-#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
-						std::cout << "str_eval_clk=" << str_eval_clk << std::endl;
-#endif
-
-						// example: { [%eval 0.25] [%clk 0:10:00] }
-						// example: { [%eval #-4] [%clk 0:10:00] }
-						// example: { [%eval #3] [%clk 0:10:00] }
-						// example: { +0.71/22 1.2s }
-						// example: { -M4/7 0.003s }
-						// example: { M3/245 0.017s }
-						// example: { +M1/245 0.010s, White mates }
-						// example: { 0.60 }
-						// example: { book }
-						// example: { rnbqkb1r/pp3ppp/2p1pn2/3p4/2PP4/2N2N2/PP2PPPP/R1BQKB1R w KQkq - 0 5 }
-
-						// Considering the absence of eval:
-						// if the comment is already the next FEN it is not consumed here,
-						// so the next iteration picks it up.
-						if (!is_like_fen(str_eval_clk)) {
-							itr += match.position(0) + match.length(0) - 1;
-
-							if (str_eval_clk != "book") {
-								// Take the "[%eval ...]" payload, else the text before the first '/',
-								// else the whole comment.
-								std::string str_eval;
-								if (std::regex_search(str_eval_clk, match, pattern_eval1) ||
-									std::regex_search(str_eval_clk, match, pattern_eval2)) {
-									str_eval = match.str(1);
-									trim(str_eval);
-								}
-								else {
-									str_eval = str_eval_clk;
-								}
-
-								bool success = false;
-								Value value = parse_score_from_pgn_extract(str_eval, success);
-								if (success) {
-									eval_found = true;
-									psv.score = Math::clamp(value, -VALUE_MATE , VALUE_MATE);
-								}
-
-#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
-								std::cout << "str_eval=" << str_eval << std::endl;
-								std::cout << "success=" << success << ", psv.score=" << psv.score << std::endl;
-#endif
-							}
-						}
-					}
-
-					// write
-					if (eval_found || convert_no_eval_fens_as_score_zero) {
-						if (!eval_found && convert_no_eval_fens_as_score_zero) {
-							psv.score = 0;
-						}
-
-						psv.gamePly = gamePly;
-						psv.game_result = game_result;
-
-						// With black to move the result is always negated; the score is
-						// negated only when pgn_eval_side_to_move is false.
-						if (pos.side_to_move() == BLACK) {
-							if (!pgn_eval_side_to_move) {
-								psv.score *= -1;
-							}
-							psv.game_result *= -1;
-						}
-
-						ofs.write((char*)&psv, sizeof(PackedSfenValue));
-
-						fen_count++;
-					}
-				}
-
-				game_result = 0;
-			}
-		}
-	}
-
-	std::cout << now_string() << " game_count=" << game_count << ", fen_count=" << fen_count << std::endl;
-	std::cout << now_string() << " all done" << std::endl;
-	ofs.close();
-}
-
-// Converts packed binary teacher files into plain text: for every record the lines
-// "fen", "move", "score", "ply", "result" and a terminating "e" are appended to
-// output_file_name.
-void convert_plain(const vector<string>& filenames, const string& output_file_name)
-{
-	Position tpos;
-	std::ofstream ofs;
-	ofs.open(output_file_name, ios::app);
-	auto th = Threads.main();
-	for (const auto& filename : filenames) {
-		std::cout << "convert " << filename << " ... ";
-
-		// Just convert packedsfenvalue to text
-		std::fstream fs;
-		fs.open(filename, ios::in | ios::binary);
-		PackedSfenValue p;
-		// Ends at the first read that does not deliver a whole record
-		// (end of file or a trailing fraction).
-		while (fs.read((char*)&p, sizeof(PackedSfenValue))) {
-			StateInfo si;
-			tpos.set_from_packed_sfen(p.sfen, &si, th, false);
-
-			// write as plain text
-			// '\n' rather than std::endl: same bytes in the file, but without
-			// forcing a flush six times per record.
-			ofs << "fen " << tpos.fen() << '\n';
-			ofs << "move " << UCI::move(Move(p.move), false) << '\n';
-			ofs << "score " << p.score << '\n';
-			ofs << "ply " << int(p.gamePly) << '\n';
-			ofs << "result " << int(p.game_result) << '\n';
-			ofs << "e" << '\n';
-		}
-		fs.close();
-		std::cout << "done" << std::endl;
-	}
-	ofs.close();
-	std::cout << "all done" << std::endl;
-}
-
-// Learning from the generated game record
-//
-// Entry point of the "learn" command. Reads options from `is`; every token that is
-// not a known option is taken as an input file name. If one of the utility modes
-// (shuffle, shuffleq, shufflem, convert_plain, convert_bin,
-// convert_bin_from_pgn-extract) was requested, it is run and the function returns.
-// Otherwise the evaluation function is trained from the listed files.
-void learn(Position&, istringstream& is)
-{
-	auto thread_num = (int)Options["Threads"];
-	SfenReader sr(thread_num);
-
-	LearnerThink learn_think(sr);
-	vector<string> filenames;
-
-	// mini_batch_size 1M aspect by default. This can be increased.
-	auto mini_batch_size = LEARN_MINI_BATCH_SIZE;
-
-	// Number of loops (read the game record file this number of times)
-	int loop = 1;
-
-	// Game file storage folder (get game file with relative path from here)
-	string base_dir;
-
-	string target_dir;
-
-	// If 0, it will be the default value.
-	double eta1 = 0.0;
-	double eta2 = 0.0;
-	double eta3 = 0.0;
-	uint64_t eta1_epoch = 0; // eta2 is not applied by default
-	uint64_t eta2_epoch = 0; // eta3 is not applied by default
-
-#if defined(USE_GLOBAL_OPTIONS)
-	// Save it for later restore.
-	auto oldGlobalOptions = GlobalOptions;
-	// If you hit the eval hash, you can not calculate rmse etc. so turn it off.
-	GlobalOptions.use_eval_hash = false;
-	// If you hit the replacement table, pruning may occur at the previous evaluation value, so turn it off.
-	GlobalOptions.use_hash_probe = false;
-#endif
-
-	// --- Function that only shuffles the teacher aspect
-
-	// normal shuffle
-	bool shuffle_normal = false;
-	uint64_t buffer_size = 20000000;
-	// fast shuffling assuming each file is shuffled
-	bool shuffle_quick = false;
-	// A function to read the entire file in memory and shuffle it. (Requires file size memory)
-	bool shuffle_on_memory = false;
-	// Conversion of packed sfen. In plain, it consists of sfen(string), evaluation value (integer), move (eg 7g7f, string), result (loss-1, win 1, draw 0)
-	bool use_convert_plain = false;
-	// convert plain format teacher to Yaneura King's bin
-	bool use_convert_bin = false;
-	// NOTE(review): no option below sets ply_minimum/ply_maximum, so convert_bin
-	// always receives these defaults - confirm whether that is intended.
-	int ply_minimum = 0;
-	int ply_maximum = 114514;
-	// An int, not a bool: convert_bin() takes it as an int and multiplies it by the
-	// ply. As a bool, "interpolate_eval 5" would fail to parse and put the option
-	// stream into a failed state, cutting off every option that follows.
-	int interpolate_eval = 0;
-	bool check_invalid_fen = false;
-	bool check_illegal_move = false;
-	// convert teacher in pgn-extract format to Yaneura King's bin
-	bool use_convert_bin_from_pgn_extract = false;
-	bool pgn_eval_side_to_move = false;
-	bool convert_no_eval_fens_as_score_zero = false;
-	// File name to write in those cases (default is "shuffled_sfen.bin")
-	string output_file_name = "shuffled_sfen.bin";
-
-	// If the absolute value of the evaluation value in the deep search of the teacher phase exceeds this value, that phase is discarded.
-	int eval_limit = 32000;
-
-	// Flag to save the evaluation function file only once near the end.
-	bool save_only_once = false;
-
-	// Shuffle about what you are pre-reading on the teacher aspect. (Shuffle of about 10 million phases)
-	// Turn on if you want to pass a pre-shuffled file.
-	bool no_shuffle = false;
-
-#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
-	// elmo lambda
-	ELMO_LAMBDA = 0.33;
-	ELMO_LAMBDA2 = 0.33;
-	ELMO_LAMBDA_LIMIT = 32000;
-#endif
-
-	// Discount rate. If this is set to a value other than 0, the slope will be added even at other than the PV termination. (At that time, apply this discount rate)
-	double discount_rate = 0;
-
-	// if (gamePly <rand(reduction_gameply)) continue;
-	// An option to exclude the early stage from the learning target moderately like
-	// If set to 1, rand(1)==0, so nothing is excluded.
-	int reduction_gameply = 1;
-
-	// Optional item that does not let you learn KK/KKP/KPP/KPPP
-	array<bool,4> freeze = {};
-
-#if defined(EVAL_NNUE)
-	uint64_t nn_batch_size = 1000;
-	double newbob_decay = 1.0;
-	int newbob_num_trials = 2;
-	string nn_options;
-#endif
-
-	uint64_t eval_save_interval = LEARN_EVAL_SAVE_INTERVAL;
-	uint64_t loss_output_interval = 0;
-	uint64_t mirror_percentage = 0;
-
-	string validation_set_file_name;
-
-	// Assume the filenames are staggered.
-	while (true)
-	{
-		string option;
-		is >> option;
-
-		if (option == "")
-			break;
-
-		// specify the number of phases of mini-batch
-		if (option == "bat")
-		{
-			is >> mini_batch_size;
-			mini_batch_size *= 10000; // Unit is ten thousand
-		}
-
-		// Specify the folder in which the game record is stored and make it the rooting target.
-		else if (option == "targetdir") is >> target_dir;
-
-		// Specify the number of loops
-		else if (option == "loop")      is >> loop;
-
-		// Game file storage folder (get game file with relative path from here)
-		else if (option == "basedir")   is >> base_dir;
-
-		// Mini batch size
-		else if (option == "batchsize") is >> mini_batch_size;
-
-		// learning rate
-		else if (option == "eta")        is >> eta1;
-		else if (option == "eta1")       is >> eta1; // alias
-		else if (option == "eta2")       is >> eta2;
-		else if (option == "eta3")       is >> eta3;
-		else if (option == "eta1_epoch") is >> eta1_epoch;
-		else if (option == "eta2_epoch") is >> eta2_epoch;
-		// Accept also the old option name.
-		else if (option == "use_draw_in_training" || option == "use_draw_games_in_training") is >> use_draw_games_in_training;
-		// Accept also the old option name.
-		else if (option == "use_draw_in_validation" || option == "use_draw_games_in_validation") is >> use_draw_games_in_validation;
-		// Accept also the old option name.
-		else if (option == "use_hash_in_training" || option == "skip_duplicated_positions_in_training") is >> skip_duplicated_positions_in_training;
-		else if (option == "winning_probability_coefficient") is >> winning_probability_coefficient;
-		// Discount rate
-		else if (option == "discount_rate") is >> discount_rate;
-		// Using WDL with win rate model instead of sigmoid
-		else if (option == "use_wdl") is >> use_wdl;
-
-		// No learning of KK/KKP/KPP/KPPP.
-		else if (option == "freeze_kk")    is >> freeze[0];
-		else if (option == "freeze_kkp")   is >> freeze[1];
-		else if (option == "freeze_kpp")   is >> freeze[2];
-
-#if defined(EVAL_KPPT) || defined(EVAL_KPP_KKPT) || defined(EVAL_KPP_KKPT_FV_VAR) || defined(EVAL_NABLA)
-
-#elif defined(EVAL_KPPPT) || defined(EVAL_KPPP_KKPT) || defined(EVAL_HELICES)
-		else if (option == "freeze_kppp")  is >> freeze[3];
-#elif defined(EVAL_KKPP_KKPT) || defined(EVAL_KKPPT)
-		else if (option == "freeze_kkpp")  is >> freeze[3];
-#endif
-
-#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
-		// LAMBDA
-		else if (option == "lambda")       is >> ELMO_LAMBDA;
-		else if (option == "lambda2")      is >> ELMO_LAMBDA2;
-		else if (option == "lambda_limit") is >> ELMO_LAMBDA_LIMIT;
-
-#endif
-		else if (option == "reduction_gameply") is >> reduction_gameply;
-
-		// shuffle related
-		else if (option == "shuffle")	shuffle_normal = true;
-		else if (option == "buffer_size") is >> buffer_size;
-		else if (option == "shuffleq")	shuffle_quick = true;
-		else if (option == "shufflem")	shuffle_on_memory = true;
-		else if (option == "output_file_name") is >> output_file_name;
-
-		else if (option == "eval_limit") is >> eval_limit;
-		else if (option == "save_only_once") save_only_once = true;
-		else if (option == "no_shuffle") no_shuffle = true;
-
-#if defined(EVAL_NNUE)
-		else if (option == "nn_batch_size") is >> nn_batch_size;
-		else if (option == "newbob_decay") is >> newbob_decay;
-		else if (option == "newbob_num_trials") is >> newbob_num_trials;
-		else if (option == "nn_options") is >> nn_options;
-#endif
-		else if (option == "eval_save_interval") is >> eval_save_interval;
-		else if (option == "loss_output_interval") is >> loss_output_interval;
-		else if (option == "mirror_percentage") is >> mirror_percentage;
-		else if (option == "validation_set_file_name") is >> validation_set_file_name;
-
-		// Rabbit convert related
-		else if (option == "convert_plain") use_convert_plain = true;
-		else if (option == "convert_bin") use_convert_bin = true;
-		else if (option == "interpolate_eval") is >> interpolate_eval;
-		else if (option == "check_invalid_fen") is >> check_invalid_fen;
-		else if (option == "check_illegal_move") is >> check_illegal_move;
-		else if (option == "convert_bin_from_pgn-extract") use_convert_bin_from_pgn_extract = true;
-		else if (option == "pgn_eval_side_to_move") is >> pgn_eval_side_to_move;
-		else if (option == "convert_no_eval_fens_as_score_zero") is >> convert_no_eval_fens_as_score_zero;
-		else if (option == "src_score_min_value") is >> src_score_min_value;
-		else if (option == "src_score_max_value") is >> src_score_max_value;
-		else if (option == "dest_score_min_value") is >> dest_score_min_value;
-		else if (option == "dest_score_max_value") is >> dest_score_max_value;
-		else if (option == "convert_teacher_signal_to_winning_probability") is >> convert_teacher_signal_to_winning_probability;
-		else if (option == "use_raw_nnue_eval") is >> use_raw_nnue_eval;
-
-		// Otherwise, it's a filename.
-		else
-			filenames.push_back(option);
-	}
-	// 0 means "not given": derive the interval from the mini-batch size.
-	if (loss_output_interval == 0)
-		loss_output_interval = LEARN_RMSE_OUTPUT_INTERVAL * mini_batch_size;
-
-	cout << "learn command , ";
-
-	// Issue a warning if OpenMP is disabled.
-#if !defined(_OPENMP)
-	cout << "Warning! OpenMP disabled." << endl;
-#endif
-
-	// Display learning game file
-	if (target_dir != "")
-	{
-		string kif_base_dir = Path::Combine(base_dir, target_dir);
-
-		// Remove this folder. Keep it relative to base_dir.
-#if defined(_MSC_VER)
-		// If you use std::tr2, warning C4996 will appear, so suppress it.
-		// * std::tr2 issued a deprecation warning by default under std:c++14, and was deleted by default in /std:c++17.
-		#pragma warning(push)
-		#pragma warning(disable:4996)
-
-		namespace sys = std::filesystem;
-		sys::path p(kif_base_dir); // Origin of enumeration
-		std::for_each(sys::directory_iterator(p), sys::directory_iterator(),
-			[&](const sys::path& p) {
-			if (sys::is_regular_file(p))
-				filenames.push_back(Path::Combine(target_dir, p.filename().generic_string()));
-		});
-		#pragma warning(pop)
-
-#elif defined(__GNUC__)
-
-		auto ends_with = [](std::string const & value, std::string const & ending)
-		{
-			if (ending.size() > value.size()) return false;
-			return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
-		};
-
-		// It can't be helped, so read it using dirent.h.
-		DIR *dp; // pointer to directory
-		dirent* entry; // entry point returned by readdir()
-
-		dp = opendir(kif_base_dir.c_str());
-		if (dp != NULL)
-		{
-			do {
-				entry = readdir(dp);
-				// Only list files ending with ".bin"
-				// →I hate this restriction when generating files with serial numbers...
-				if (entry != NULL  && ends_with(entry->d_name, ".bin")  )
-				{
-					//cout << entry->d_name << endl;
-					filenames.push_back(Path::Combine(target_dir, entry->d_name));
-				}
-			} while (entry != NULL);
-			closedir(dp);
-		}
-#endif
-	}
-
-	cout << "learn from ";
-	for (auto s : filenames)
-		cout << s << " , ";
-	cout << endl;
-	if (!validation_set_file_name.empty())
-	{
-		cout << "validation set  : " << validation_set_file_name << endl;
-	}
-
-	cout << "base dir        : " << base_dir   << endl;
-	cout << "target dir      : " << target_dir << endl;
-
-	// --- Utility modes: each one runs and returns without any training.
-
-	// shuffle mode
-	if (shuffle_normal)
-	{
-		cout << "buffer_size     : " << buffer_size << endl;
-		cout << "shuffle mode.." << endl;
-		shuffle_files(filenames,output_file_name , buffer_size);
-		return;
-	}
-	if (shuffle_quick)
-	{
-		cout << "quick shuffle mode.." << endl;
-		shuffle_files_quick(filenames, output_file_name);
-		return;
-	}
-	if (shuffle_on_memory)
-	{
-		cout << "shuffle on memory.." << endl;
-		shuffle_files_on_memory(filenames,output_file_name);
-		return;
-	}
-	if (use_convert_plain)
-	{
-		Eval::init_NNUE();
-		cout << "convert_plain.." << endl;
-		convert_plain(filenames, output_file_name);
-		return;
-	}
-	if (use_convert_bin)
-	{
-		Eval::init_NNUE();
-		cout << "convert_bin.." << endl;
-		convert_bin(filenames,output_file_name, ply_minimum, ply_maximum, interpolate_eval, check_invalid_fen, check_illegal_move);
-		return;
-
-	}
-	if (use_convert_bin_from_pgn_extract)
-	{
-		Eval::init_NNUE();
-		cout << "convert_bin_from_pgn-extract.." << endl;
-		convert_bin_from_pgn_extract(filenames, output_file_name, pgn_eval_side_to_move, convert_no_eval_fens_as_score_zero);
-		return;
-	}
-
-	cout << "loop              : " << loop << endl;
-	cout << "eval_limit        : " << eval_limit << endl;
-	cout << "save_only_once    : " << (save_only_once ? "true" : "false") << endl;
-	cout << "no_shuffle        : " << (no_shuffle ? "true" : "false") << endl;
-
-	// Insert the file name for the number of loops.
-	for (int i = 0; i < loop; ++i)
-		// sfen reader, I'll read it in reverse order so I'll reverse it here. I'm sorry.
-		for (auto it = filenames.rbegin(); it != filenames.rend(); ++it)
-			sr.filenames.push_back(Path::Combine(base_dir, *it));
-
-#if !defined(EVAL_NNUE)
-	cout << "Gradient Method   : " << LEARN_UPDATE      << endl;
-#endif
-	cout << "Loss Function     : " << LOSS_FUNCTION     << endl;
-	cout << "mini-batch size   : " << mini_batch_size   << endl;
-#if defined(EVAL_NNUE)
-	cout << "nn_batch_size     : " << nn_batch_size     << endl;
-	cout << "nn_options        : " << nn_options        << endl;
-#endif
-	cout << "learning rate     : " << eta1 << " , " << eta2 << " , " << eta3 << endl;
-	cout << "eta_epoch         : " << eta1_epoch << " , " << eta2_epoch << endl;
-	cout << "use_draw_games_in_training : " << use_draw_games_in_training << endl;
-	cout << "use_draw_games_in_validation : " << use_draw_games_in_validation << endl;
-	cout << "skip_duplicated_positions_in_training : " << skip_duplicated_positions_in_training << endl;
-#if defined(EVAL_NNUE)
-	if (newbob_decay != 1.0) {
-		cout << "scheduling        : newbob with decay = " << newbob_decay
-		     << ", " << newbob_num_trials << " trials" << endl;
-	} else {
-		cout << "scheduling        : default" << endl;
-	}
-#endif
-	cout << "discount rate     : " << discount_rate     << endl;
-
-	// If reduction_gameply is set to 0, rand(0) will be divided by 0, so correct it to 1.
-	reduction_gameply = max(reduction_gameply, 1);
-	cout << "reduction_gameply : " << reduction_gameply << endl;
-
-#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
-	cout << "LAMBDA            : " << ELMO_LAMBDA       << endl;
-	cout << "LAMBDA2           : " << ELMO_LAMBDA2      << endl;
-	cout << "LAMBDA_LIMIT      : " << ELMO_LAMBDA_LIMIT << endl;
-#endif
-	cout << "mirror_percentage : " << mirror_percentage << endl;
-	cout << "eval_save_interval  : " << eval_save_interval << " sfens" << endl;
-	cout << "loss_output_interval: " << loss_output_interval << " sfens" << endl;
-
-#if defined(EVAL_KPPT) || defined(EVAL_KPP_KKPT) || defined(EVAL_KPP_KKPT_FV_VAR) || defined(EVAL_NABLA)
-	cout << "freeze_kk/kkp/kpp      : " << freeze[0] << " , " << freeze[1] << " , " << freeze[2] << endl;
-#elif defined(EVAL_KPPPT) || defined(EVAL_KPPP_KKPT) || defined(EVAL_HELICES)
-	cout << "freeze_kk/kkp/kpp/kppp : " << freeze[0] << " , " << freeze[1] << " , " << freeze[2] << " , " << freeze[3] << endl;
-#elif defined(EVAL_KKPP_KKPT) || defined(EVAL_KKPPT)
-	cout << "freeze_kk/kkp/kpp/kkpp : " << freeze[0] << " , " << freeze[1] << " , " << freeze[2] << " , " << freeze[3] << endl;
-#endif
-
-	// -----------------------------------
-	// various initialization
-	// -----------------------------------
-
-	cout << "init.." << endl;
-
-	// Read evaluation function parameters
-	Eval::init_NNUE();
-
-#if !defined(EVAL_NNUE)
-	cout << "init_grad.." << endl;
-
-	// Initialize gradient array of merit function parameters
-	Eval::init_grad(eta1,eta1_epoch,eta2,eta2_epoch,eta3);
-#else
-	cout << "init_training.." << endl;
-	Eval::NNUE::InitializeTraining(eta1,eta1_epoch,eta2,eta2_epoch,eta3);
-	Eval::NNUE::SetBatchSize(nn_batch_size);
-	Eval::NNUE::SetOptions(nn_options);
-	if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
-		learn_think.best_nn_directory = std::string(Options["EvalDir"]);
-	}
-#endif
-
-#if 0
-	// A test to give a gradient of 1.0 to the initial stage of Hirate.
-	pos.set_hirate();
-	cout << Eval::evaluate(pos) << endl;
-	//Eval::print_eval_stat(pos);
-	Eval::add_grad(pos, BLACK, 32.0 , false);
-	Eval::update_weights(1);
-	pos.state()->sum.p[2][0] = VALUE_NOT_EVALUATED;
-	cout << Eval::evaluate(pos) << endl;
-	//Eval::print_eval_stat(pos);
-#endif
-
-	cout << "init done." << endl;
-
-	// Reflect other option settings.
-	learn_think.discount_rate = discount_rate;
-	learn_think.eval_limit = eval_limit;
-	learn_think.save_only_once = save_only_once;
-	learn_think.sr.no_shuffle = no_shuffle;
-	learn_think.freeze = freeze;
-	learn_think.reduction_gameply = reduction_gameply;
-#if defined(EVAL_NNUE)
-	learn_think.newbob_scale = 1.0;
-	learn_think.newbob_decay = newbob_decay;
-	learn_think.newbob_num_trials = newbob_num_trials;
-#endif
-	learn_think.eval_save_interval = eval_save_interval;
-	learn_think.loss_output_interval = loss_output_interval;
-	learn_think.mirror_percentage = mirror_percentage;
-
-	// Start a thread that loads the phase file in the background
-	// (If this is not started, mse cannot be calculated.)
-	learn_think.start_file_read_worker();
-
-	learn_think.mini_batch_size = mini_batch_size;
-
-	if (validation_set_file_name.empty()) {
-	// Get about 10,000 data for mse calculation.
-		sr.read_for_mse();
-	} else {
-		sr.read_validation_set(validation_set_file_name, eval_limit);
-	}
-
-	// Calculate rmse once at this point (timing of 0 sfen)
-	// sr.calc_rmse();
-#if defined(EVAL_NNUE)
-	if (newbob_decay != 1.0) {
-		// Record the loss before any training as the initial best loss.
-		learn_think.calc_loss(0, -1);
-		learn_think.best_loss = learn_think.latest_loss_sum / learn_think.latest_loss_count;
-		learn_think.latest_loss_sum = 0.0;
-		learn_think.latest_loss_count = 0;
-		cout << "initial loss: " << learn_think.best_loss << endl;
-	}
-#endif
-
-	// -----------------------------------
-	// start learning evaluation function parameters
-	// -----------------------------------
-
-	// Start learning.
-	learn_think.go_think();
-
-	// Save once at the end.
-	learn_think.save(true);
-
-#if defined(USE_GLOBAL_OPTIONS)
-	// Restore Global Options.
-	GlobalOptions = oldGlobalOptions;
-#endif
-}
-
-
-} // namespace Learner
-
-#if defined(GENSFEN2019)
-#include "gensfen2019.cpp"
-#endif
-
-
-#endif // EVAL_LEARN
diff --git a/src/learn/learning_tools.cpp b/src/learn/learning_tools.cpp
deleted file mode 100644
index de6da9c5..00000000
--- a/src/learn/learning_tools.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-﻿#include "learning_tools.h"
-
-#if defined (EVAL_LEARN)
-
-#if defined(_OPENMP)
-#include <omp.h>
-#endif
-#include "../misc.h"
-
-using namespace Eval;
-
-namespace EvalLearningTools
-{
-
-	// --- static variables
-
-	double Weight::eta;
-	double Weight::eta1;
-	double Weight::eta2;
-	double Weight::eta3;
-	uint64_t Weight::eta1_epoch;
-	uint64_t Weight::eta2_epoch;
-}
-
-#endif
diff --git a/src/learn/learning_tools.h b/src/learn/learning_tools.h
deleted file mode 100644
index 3c4be08a..00000000
--- a/src/learn/learning_tools.h
+++ /dev/null
@@ -1,200 +0,0 @@
-﻿#ifndef __LEARN_WEIGHT_H__
-#define __LEARN_WEIGHT_H__
-
-// A set of machine learning tools related to the weight array used for machine learning of evaluation functions
-
-#include "learn.h"
-#if defined (EVAL_LEARN)
-#include <array>
-
-#if defined(SGD_UPDATE) || defined(USE_KPPP_MIRROR_WRITE)
-#include "../misc.h"  // PRNG , my_insertion_sort
-#endif
-
-#include <cmath>	// std::sqrt()
-
-namespace EvalLearningTools
-{
-	// -------------------------------------------------
-	//   Array for learning that stores gradients etc.
-	// -------------------------------------------------
-
-#if defined(_MSC_VER)
-#pragma pack(push,2)
-#elif defined(__GNUC__)
-#pragma pack(2)
-#endif
-	struct Weight
-	{
-		// cumulative value of one mini-batch gradient
-		LearnFloatType g = LearnFloatType(0);
-
-		// When ADA_GRAD_UPDATE. LearnFloatType == float,
-		// total 4*2 + 4*2 + 1*2 = 18 bytes
-		// It suffices to secure a Weight array that is 4.5 times the size of the evaluation function parameter of 1GB.
-		// However, sizeof(Weight)==20 code is generated if the structure alignment is in 4-byte units, so
-		// Specify pragma pack(2).
-
-		// For SGD_UPDATE, this structure is reduced by 10 bytes to 8 bytes.
-
-		// Learning rate η(eta) such as AdaGrad.
-		// It is assumed that eta1,2,3,eta1_epoch,eta2_epoch have been set by the time updateFV() is called.
-		// The epoch of update_weights() gradually changes from eta1 to eta2 until eta1_epoch.
-		// After eta2_epoch, gradually change from eta2 to eta3.
-		static double eta;
-		static double eta1;
-		static double eta2;
-		static double eta3;
-		static uint64_t eta1_epoch;
-		static uint64_t eta2_epoch;
-
-		// Batch initialization of eta. If 0 is passed, the default value will be set.
-		static void init_eta(double eta1, double eta2, double eta3, uint64_t eta1_epoch, uint64_t eta2_epoch)
-		{
-			Weight::eta1 = (eta1 != 0) ? eta1 : 30.0;
-			Weight::eta2 = (eta2 != 0) ? eta2 : 30.0;
-			Weight::eta3 = (eta3 != 0) ? eta3 : 30.0;
-			Weight::eta1_epoch = (eta1_epoch != 0) ? eta1_epoch : 0;
-			Weight::eta2_epoch = (eta2_epoch != 0) ? eta2_epoch : 0;
-		}
-
-		// Set eta according to epoch.
-		static void calc_eta(uint64_t epoch)
-		{
-			if (Weight::eta1_epoch == 0) // Exclude eta2
-				Weight::eta = Weight::eta1;
-			else if (epoch < Weight::eta1_epoch)
-				// apportion
-				Weight::eta = Weight::eta1 + (Weight::eta2 - Weight::eta1) * epoch / Weight::eta1_epoch;
-			else if (Weight::eta2_epoch == 0) // Exclude eta3
-				Weight::eta = Weight::eta2;
-			else if (epoch < Weight::eta2_epoch)
-				Weight::eta = Weight::eta2 + (Weight::eta3 - Weight::eta2) * (epoch - Weight::eta1_epoch) / (Weight::eta2_epoch - Weight::eta1_epoch);
-			else
-				Weight::eta = Weight::eta3;
-		}
-
-		template <typename T> void updateFV(T& v) { updateFV(v, 1.0); }
-
-#if defined (ADA_GRAD_UPDATE)
-
-		// Since the maximum value that can be accurately calculated with float is INT16_MAX*256-1
-		// Keep the small value as a marker.
-		const LearnFloatType V0_NOT_INIT = (INT16_MAX * 128);
-
-		// What holds v internally. The previous implementation kept a fixed decimal with only a fractional part to save memory,
-		// Since it is doubtful in accuracy and the visibility is bad, it was abolished.
-		LearnFloatType v0 = LearnFloatType(V0_NOT_INIT);
-
-		// AdaGrad g2
-		LearnFloatType g2 = LearnFloatType(0);
-
-		// update with AdaGrad
-		// When executing this function, the value of g and the member do not change
-		// Guaranteed by the caller. It does not have to be an atomic operation.
-		// k is a coefficient for eta. 1.0 is usually sufficient. If you want to lower eta for your turn item, set this to 1/8.0 etc.
-		template <typename T>
-		void updateFV(T& v,double k)
-		{
-			// AdaGrad update formula
-			// Gradient vector is g, vector to be updated is v, η(eta) is a constant,
-			//     g2 = g2 + g^2
-			//     v = v - ηg/sqrt(g2)
-
-			constexpr double epsilon = 0.000001;
-
-			if (g == LearnFloatType(0))
-				return;
-
-			g2 += g * g;
-
-			// If v0 is V0_NOT_INIT, it means that the value is not initialized with the value of KK/KKP/KPP array,
-			// In this case, read the value of v from the one passed in the argument.
-			double V = (v0 == V0_NOT_INIT) ? v : v0;
-
-			V -= k * eta * (double)g / sqrt((double)g2 + epsilon);
-
-			// Limit the value of V to be within the range of types.
-			// By the way, windows.h defines the min and max macros, so to avoid it,
-			// Here, it is enclosed in parentheses so that it is not treated as a function-like macro.
-			V = (std::min)((double)(std::numeric_limits<T>::max)() , V);
-			V = (std::max)((double)(std::numeric_limits<T>::min)() , V);
-
-			v0 = (LearnFloatType)V;
-			v = (T)round(V);
-
-			// Clear g because one update of mini-batch for this element is over
-			// g[i] = 0;
-			// → There is a problem of dimension reduction, so this will be done by the caller.
-		}
-
-#elif defined(SGD_UPDATE)
-
-		// See only the sign of the gradient Update with SGD
-		// When executing this function, the value of g and the member do not change
-		// Guaranteed by the caller. It does not have to be an atomic operation.
-		template <typename T>
-		void updateFV(T & v , double k)
-		{
-			if (g == 0)
-				return;
-
-			// See only the sign of g and update.
-			// If g <0, add v a little.
-			// If g> 0, subtract v slightly.
-
-			// Since we only add integers, no decimal part is required.
-
-			// It's a good idea to move around 0-5.
-			// It is better to have a Gaussian distribution, so generate a 5-bit random number (each bit has a 1/2 probability of 1),
-			// Pop_count() it. At this time, it has a binomial distribution.
-			//int16_t diff = (int16_t)POPCNT32((u32)prng.rand(31));
-			// → If I do this with 80 threads, this AsyncPRNG::rand() locks, so I slowed down. This implementation is not good.
-			int16_t diff = 1;
-
-			double V = v;
-			if (g > 0.0)
-				V-= diff;
-			else
-				V+= diff;
-
-			V = (std::min)((double)(std::numeric_limits<T>::max)(), V);
-			V = (std::max)((double)(std::numeric_limits<T>::min)(), V);
-
-			v = (T)V;
-		}
-
-#endif
-
-		// grad setting
-		template <typename T> void set_grad(const T& g_) { g = g_; }
-
-		// Add grad
-		template <typename T> void add_grad(const T& g_) { g += g_; }
-
-		LearnFloatType get_grad() const { return g; }
-	};
-#if defined(_MSC_VER)
-#pragma pack(pop)
-#elif defined(__GNUC__)
-#pragma pack(0)
-#endif
-
-	// Turned weight array
-	// In order to be able to handle it transparently, let's have the same member as Weight.
-	struct Weight2
-	{
-		Weight w[2];
-
-		//Evaluate your turn, eta 1/8.
-		template <typename T> void updateFV(std::array<T, 2>& v) { w[0].updateFV(v[0] , 1.0); w[1].updateFV(v[1],1.0/8.0); }
-
-		template <typename T> void set_grad(const std::array<T, 2>& g) { for (int i = 0; i<2; ++i) w[i].set_grad(g[i]); }
-		template <typename T> void add_grad(const std::array<T, 2>& g) { for (int i = 0; i<2; ++i) w[i].add_grad(g[i]); }
-
-		std::array<LearnFloatType, 2> get_grad() const { return std::array<LearnFloatType, 2>{w[0].get_grad(), w[1].get_grad()}; }
-	};
-}
-
-#endif // defined (EVAL_LEARN)
-#endif
diff --git a/src/learn/multi_think.cpp b/src/learn/multi_think.cpp
deleted file mode 100644
index 82ebeabb..00000000
--- a/src/learn/multi_think.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-﻿#include "../types.h"
-
-#if defined(EVAL_LEARN)
-
-#include "multi_think.h"
-#include "../tt.h"
-#include "../uci.h"
-
-#include <thread>
-
-void MultiThink::go_think()
-{
-	// Keep a copy to restore the Options settings later.
-	auto oldOptions = Options;
-
-	// When using the constant track, it takes a lot of time to perform on the fly & the part to access the file is
-	// Since it is not thread safe, it is guaranteed here that it is being completely read in memory.
-	Options["BookOnTheFly"] = std::string("false");
-
-	// Read evaluation function, etc.
-	// In the case of the learn command, the value of the evaluation function may be corrected after reading the evaluation function, so
-	// Skip memory corruption check.
-	Eval::init_NNUE();
-
-	// Call the derived class's init().
-	init();
-
-	// The loop upper limit is set with set_loop_max().
-	loop_count = 0;
-	done_count = 0;
-
-	// Create threads as many as Options["Threads"] and start thinking.
-	std::vector<std::thread> threads;
-	auto thread_num = (size_t)Options["Threads"];
-
-	// Secure end flag of worker thread
-	thread_finished.resize(thread_num);
-	
-	// start worker thread
-	for (size_t i = 0; i < thread_num; ++i)
-	{
-		thread_finished[i] = 0;
-		threads.push_back(std::thread([i, this]
-		{ 
-			// exhaust all processor threads.
-			WinProcGroup::bindThisThread(i);
-
-			// execute the overridden process
-			this->thread_worker(i);
-
-			// Set the end flag because the thread has ended
-			this->thread_finished[i] = 1;
-		}));
-	}
-
-	// wait for all threads to finish
-	// for (auto& th :threads)
-	// th.join();
-	// If you write like, the thread will rush here while it is still working,
-	// During that time, callback_func() cannot be called and you cannot save.
-	// Therefore, you need to check the end flag yourself.
-
-	// function to determine if all threads have finished
-	auto threads_done = [&]()
-	{
-		// returns false if no one is finished
-		for (auto& f : thread_finished)
-			if (!f)
-				return false;
-		return true;
-	};
-
-	// Call back if the callback function is set.
-	auto do_a_callback = [&]()
-	{
-		if (callback_func)
-			callback_func();
-	};
-
-
-	for (uint64_t i = 0 ; ; )
-	{
-		// If all threads have finished, exit the loop.
-		if (threads_done())
-			break;
-
-		sleep(1000);
-
-		// callback_func() is called every callback_seconds.
-		if (++i == callback_seconds)
-		{
-			do_a_callback();
-			// Since I am returning from ↑, I reset the counter, so
-			// no matter how long it takes to save() etc. in do_a_callback()
-			// The next call will take a certain amount of time.
-			i = 0;
-		}
-	}
-
-	// Last save.
-	std::cout << std::endl << "finalize..";
-
-	// do_a_callback();
-	// → It should be saved by the caller, so I feel that it is not necessary here.
-
-	// It is possible that the exit code of the thread is running but the exit code of the thread is running, so
-	// We need to wait for the end with join().
-	for (auto& th : threads)
-		th.join();
-
-	// The file writing thread etc. are still running only when all threads are finished
-	// Since the work itself may not have completed, output only that all threads have finished.
-	std::cout << "all threads are joined." << std::endl;
-
-	// Restored because Options were rewritten.
-	// Restore the handler because the handler will not start unless you assign a value.
-	for (auto& s : oldOptions)
-		Options[s.first] = std::string(s.second);
-
-}
-
-
-#endif // defined(EVAL_LEARN)
diff --git a/src/learn/multi_think.h b/src/learn/multi_think.h
deleted file mode 100644
index 6e6c695c..00000000
--- a/src/learn/multi_think.h
+++ /dev/null
@@ -1,152 +0,0 @@
-﻿#ifndef _MULTI_THINK_
-#define _MULTI_THINK_
-
-#if defined(EVAL_LEARN)
-
-#include <functional>
-#include <mutex>
-
-#include "../misc.h"
-#include "../learn/learn.h"
-#include "../thread_win32_osx.h"
-
-#include <atomic>
-
-// Learning from a game record, when making yourself think and generating a fixed track, etc.
-// Helper class used when multiple threads want to call Search::think() individually.
-// Derive and use this class.
-struct MultiThink
-{
-	MultiThink() : prng(std::chrono::system_clock::now().time_since_epoch().count())
-	{
-		loop_count = 0;
-	}
-
-	// Call this function from the master thread, each thread will think,
-	// Return control when the thought ending condition is satisfied.
-	// Do something else.
-	// ・It is safe for each thread to call Learner::search(),qsearch()
-	// Separates the substitution table for each thread. (It will be restored after the end.)
-	// ・Book is not thread safe when in on the fly mode, so temporarily change this mode.
-	// Turn it off.
-	// [Requirements]
-	// 1) Override thread_worker()
-	// 2) Set the loop count with set_loop_max()
-	// 3) set a function to be called back periodically (if necessary)
-	// callback_func and callback_interval
-	void go_think();
-
-	// If there is something you want to initialize on the derived class side, override this,
-	// Called when initialization is completed with go_think().
-	// It is better to read the fixed trace at that timing.
-	virtual void init() {}
-
-	// A thread worker that is called by creating a thread when you go_think()
-	// Override and use this.
-	virtual void thread_worker(size_t thread_id) = 0;
-
-	// Called back every callback_seconds [seconds] when go_think().
-	std::function<void()> callback_func;
-	uint64_t callback_seconds = 600;
-
-	// Set the number of times worker processes (calls Search::think()).
-	void set_loop_max(uint64_t loop_max_) { loop_max = loop_max_; }
-
-	// Get the value set by set_loop_max().
-	uint64_t get_loop_max() const { return loop_max; }
-
-	// [ASYNC] Take the value of the loop counter and add the loop counter after taking it out.
-	// If the loop counter has reached loop_max, return UINT64_MAX.
-	// If you want to generate a phase, you must call this function at the time of generating the phase,
-	// Please note that the number of generated phases and the value of the counter will not match.
-	uint64_t get_next_loop_count() {
-		std::unique_lock<std::mutex> lk(loop_mutex);
-		if (loop_count >= loop_max)
-			return UINT64_MAX;
-		return loop_count++;
-	}
-
-	// [ASYNC] For returning the processed number. Each time it is called, it returns a counter that is incremented.
-	uint64_t get_done_count() {
-		std::unique_lock<std::mutex> lk(loop_mutex);
-		return ++done_count;
-	}
-
-	// Mutex when worker thread accesses I/O
-	std::mutex io_mutex;
-
-protected:
-	// Random number generator body
-	AsyncPRNG prng;
-
-private:
-	// number of times worker processes (calls Search::think())
-	std::atomic<uint64_t> loop_max;
-	// number of times the worker has processed (calls Search::think())
-	std::atomic<uint64_t> loop_count;
-	// To return the number of times it has been processed.
-	std::atomic<uint64_t> done_count;
-
-	// Mutex when changing the variables in ↑
-	std::mutex loop_mutex;
-
-	// Thread end flag.
-	// vector<bool> may not be reflected properly when trying to rewrite from multiple threads...
-	typedef uint8_t Flag;
-	std::vector<Flag> thread_finished;
-
-};
-
-// Mechanism to process task during idle time.
-// master passes the task with push_task_async() whenever you like.
-// When slave executes on_idle() in its spare time, it retrieves one task and continues execution until there is no queue.
-// Convenient to use when you want to write MultiThink thread worker in master-slave method.
-struct TaskDispatcher
-{
-	typedef std::function<void(size_t /* thread_id */)> Task;
-
-	// slave calls this function during idle.
-	void on_idle(size_t thread_id)
-	{
-		Task task;
-		while ((task = get_task_async()) != nullptr)
-			task(thread_id);
-
-		sleep(1);
-	}
-
-	// Stack [ASYNC] task.
-	void push_task_async(Task task)
-	{
-		std::unique_lock<std::mutex> lk(task_mutex);
-		tasks.push_back(task);
-	}
-
-	// Allocate size array elements for task in advance.
-	void task_reserve(size_t size)
-	{
-		tasks.reserve(size);
-	}
-
-protected:
-	// set of tasks
-	std::vector<Task> tasks;
-
-	// Take out one [ASYNC] task. Called from on_idle().
-	Task get_task_async()
-	{
-		std::unique_lock<std::mutex> lk(task_mutex);
-		if (tasks.size() == 0)
-			return nullptr;
-		Task task = *tasks.rbegin();
-		tasks.pop_back();
-		return task;
-	}
-
-	// a mutex for accessing tasks
-	std::mutex task_mutex;
-};
-
-#endif // defined(EVAL_LEARN) && defined(YANEURAOU_2018_OTAFUKU_ENGINE)
-
-#endif
diff --git a/src/learn/opening_book.cpp b/src/learn/opening_book.cpp
new file mode 100644
index 00000000..fb569bda
--- /dev/null
+++ b/src/learn/opening_book.cpp
@@ -0,0 +1,43 @@
+#include "opening_book.h"
+
+#include <fstream>
+
+namespace Learner {
+    // Builds an EPD opening book: reads file line by line, then shuffles the entries using prng.
+    EpdOpeningBook::EpdOpeningBook(const std::string& file, PRNG& prng) :
+        OpeningBook(file)
+    {
+        std::ifstream in(file);
+        if (!in)
+        {
+            return; // file could not be opened: the book is silently left empty
+        }
+
+        std::string line;
+        while (std::getline(in, line))
+        {
+            if (line.empty())
+                continue; // skip blank lines
+
+            fens.emplace_back(line); // the whole line is stored verbatim, nothing is parsed here
+        }
+
+        Algo::shuffle(fens, prng); // NOTE(review): defined elsewhere; presumably permutes fens in place
+    }
+    // Returns true when lhs ends with the string end (false if end is longer than lhs).
+    static bool ends_with(const std::string& lhs, const std::string& end)
+    {
+        if (end.size() > lhs.size()) return false;
+
+        return std::equal(end.rbegin(), end.rend(), lhs.rbegin());
+    }
+    // Opens an opening book chosen by file extension: ".epd" gives an EpdOpeningBook, anything else nullptr.
+    std::unique_ptr<OpeningBook> open_opening_book(const std::string& filename, PRNG& prng)
+    {
+        if (ends_with(filename, ".epd"))
+            return std::make_unique<EpdOpeningBook>(filename, prng);
+
+        return nullptr;
+    }
+
+}
diff --git a/src/learn/opening_book.h b/src/learn/opening_book.h
new file mode 100644
index 00000000..16207f13
--- /dev/null
+++ b/src/learn/opening_book.h
@@ -0,0 +1,56 @@
+#ifndef LEARN_OPENING_BOOK_H
+#define LEARN_OPENING_BOOK_H
+
+#include "misc.h"
+#include "position.h"
+#include "thread.h"
+
+#include <vector>
+#include <random>
+#include <optional>
+#include <string>
+#include <cstdint>
+#include <memory>
+
+namespace Learner {
+
+    struct OpeningBook { // List of opening strings that is handed out one entry at a time.
+        // Returns the entry at the cursor and advances the cursor, wrapping to 0 after the last entry.
+        const std::string& next_fen()
+        {
+            assert(fens.size() > 0); // calling this on an empty book is a precondition violation
+
+            auto& fen = fens[current_index++];
+            if (current_index >= fens.size())
+                current_index = 0;
+
+            return fen;
+        }
+
+        std::size_t size() const { return fens.size(); } // number of stored entries
+
+        const std::string& get_filename() const { return filename; }
+
+    protected:
+        OpeningBook(const std::string& file) : // only records the name; fens starts out empty
+            filename(file),
+            current_index(0)
+        {
+        }
+
+        // filename: name given at construction; fens: stored entries; current_index: next entry to return.
+        std::string filename;
+        std::vector<std::string> fens;
+        std::size_t current_index;
+    };
+
+    struct EpdOpeningBook : OpeningBook {
+
+        EpdOpeningBook(const std::string& file, PRNG& prng);
+    };
+
+    std::unique_ptr<OpeningBook> open_opening_book(const std::string& filename, PRNG& prng);
+
+}
+
+#endif
diff --git a/src/learn/packed_sfen.h b/src/learn/packed_sfen.h
new file mode 100644
index 00000000..3aa4fcac
--- /dev/null
+++ b/src/learn/packed_sfen.h
@@ -0,0 +1,46 @@
+#ifndef _PACKED_SFEN_H_
+#define _PACKED_SFEN_H_
+
+#include <vector>
+#include <cstdint>
+
+namespace Learner {
+
+    // packed sfen
+    struct PackedSfen { std::uint8_t data[32]; };
+
+    // One training record: a packed position together with its evaluation value and game metadata.
+    // Writing different contents depending on build options would make teacher data hard to reuse,
+    // so for the time being all of the following members are written regardless of the options.
+    struct PackedSfenValue
+    {
+        // The position, in packed form.
+        PackedSfen sfen;
+
+        // Evaluation value returned from Learner::search()
+        std::int16_t score;
+
+        // First move of the PV.
+        // Used when measuring the move match rate against the teacher.
+        std::uint16_t move;
+
+        // Ply of this position, counted from the initial position.
+        std::uint16_t gamePly;
+
+        // 1 if the player on this side (presumably the side to move) ultimately wins the game, -1 if it loses.
+        // 0 if a draw is reached.
+        // Draws are written by the teacher position generation command (gensfen)
+        // only if LEARN_GENSFEN_DRAW_RESULT is enabled.
+        std::int8_t game_result;
+
+        // Explicit padding byte so that the structure is 40 bytes in any environment;
+        // this matters when files of teacher positions are exchanged with other people.
+        std::uint8_t padding;
+
+        // 32 + 2 + 2 + 2 + 1 + 1 = 40bytes
+    };
+
+    // Phase array: PSVector stands for packed sfen vector.
+    using PSVector = std::vector<PackedSfenValue>;
+}
+#endif
diff --git a/src/learn/sfen_packer.cpp b/src/learn/sfen_packer.cpp
new file mode 100644
index 00000000..777b5943
--- /dev/null
+++ b/src/learn/sfen_packer.cpp
@@ -0,0 +1,386 @@
+﻿#include "sfen_packer.h"
+
+#include "packed_sfen.h"
+
+#include "misc.h"
+#include "position.h"
+
+#include <sstream>
+#include <fstream>
+#include <cstring> // std::memset()
+
+using namespace std;
+
+namespace Learner {
+
+    // Minimal bit-level cursor over a caller-provided byte buffer.
+    // Used below to encode and decode packed positions.
+    struct BitStream
+    {
+        // Attach the buffer to read from / write to and rewind the cursor.
+        // For writing, the buffer must already be zeroed: write_one_bit() only ever sets bits.
+        void set_data(std::uint8_t* data_) { data = data_; reset(); }
+
+        // Get the pointer passed in set_data().
+        uint8_t* get_data() const { return data; }
+
+        // Get the cursor (index of the next bit, i.e. the number of bits read/written so far).
+        int get_cursor() const { return bit_cursor; }
+
+        // Rewind the cursor to bit 0.
+        void reset() { bit_cursor = 0; }
+
+        // Write 1 bit to the stream and advance the cursor.
+        // If b is non-zero a 1 is stored; if 0, the bit is left as it is in the buffer.
+        void write_one_bit(int b)
+        {
+            if (b)
+                data[bit_cursor / 8] |= 1 << (bit_cursor & 7);
+
+            ++bit_cursor;
+        }
+
+        // Get 1 bit from the stream (bits are taken from the low end of each byte) and advance the cursor.
+        int read_one_bit()
+        {
+            int b = (data[bit_cursor / 8] >> (bit_cursor & 7)) & 1;
+            ++bit_cursor;
+
+            return b;
+        }
+
+        // Write the low n bits of d,
+        // least significant bit first.
+        void write_n_bit(int d, int n)
+        {
+            for (int i = 0; i <n; ++i)
+                write_one_bit(d & (1 << i));
+        }
+
+        // Read n bits and assemble them into an integer; the first bit read becomes bit 0.
+        // Inverse of write_n_bit().
+        int read_n_bit(int n)
+        {
+            int result = 0;
+            for (int i = 0; i < n; ++i)
+                result |= read_one_bit() ? (1 << i) : 0;
+
+            return result;
+        }
+
+    private:
+        // Next bit position to read/write.
+        int bit_cursor;
+
+        // Buffer being read/written; never freed here and never bounds-checked.
+        std::uint8_t* data;
+    };
+
+    // Class for compressing/decompressing sfen
+    // sfen can be packed to 256bit (32bytes) by Huffman coding.
+    // (The original notes credit this bound to "mini".)
+    //
+    // Internal format, in the order pack() writes it:
+    // Side to move (White = 0, Black = 1) (1bit)
+    // White King Position (6 bits)
+    // Black King Position (6 bits)
+    // Huffman Encoding of the board
+    // Castling availability (1 bit x 4)
+    // En passant square (1 or 1 + 6 bits)
+    // Rule 50 (low 6 bits)
+    // Fullmove number (low 8 bits, then high 8 bits), then bit 6 of rule 50 (1 bit)
+    //
+    // TODO(someone): Rename SFEN to FEN.
+    //
+    struct SfenPacker
+    {
+        void pack(const Position& pos);
+
+        // Destination buffer for pack() (256bit = 32bytes).
+        // pack() zeroes and fills it, so it must point at 32 writable bytes before pack() is called.
+        uint8_t *data; // uint8_t[32];
+
+        BitStream stream;
+
+        // Output the board pieces to stream.
+        void write_board_piece_to_stream(Piece pc);
+
+        // Read one board piece from stream
+        Piece read_board_piece_from_stream();
+    };
+
+
+    // Huffman coding of board pieces
+    // (the original notes say it is simplified from the "mini" encoding to make conversion easier).
+    //
+    // Code words as listed in huffman_table (they are written to the stream low bit first):
+    //
+    // Empty  xxxxxxx0
+    // Pawn   xxxx0001 + 1 bit (Color)
+    // Knight xxxx0011 + 1 bit (Color)
+    // Bishop xxxx0101 + 1 bit (Color)
+    // Rook   xxxx0111 + 1 bit (Color)
+    // Queen  xxxx1001 + 1 bit (Color)
+    //
+    // Worst case:
+    // - 32 empty squares    32 bits
+    // - 30 pieces           150 bits
+    // - side to move + 2 kings  13 bits
+    // - castling rights     4 bits
+    // - ep square           7 bits
+    // - rule50              7 bits
+    // - game ply            16 bits
+    // - TOTAL               229 bits < 256 bits
+
+    struct HuffmanedPiece
+    {
+        int code; // code word value
+        int bits; // length of the code word in bits
+    };
+
+    constexpr HuffmanedPiece huffman_table[] =
+    {
+        {0b0000,1}, // NO_PIECE
+        {0b0001,4}, // PAWN
+        {0b0011,4}, // KNIGHT
+        {0b0101,4}, // BISHOP
+        {0b0111,4}, // ROOK
+        {0b1001,4}, // QUEEN
+    };
+
+    // Encode pos into the 32 bytes at data (the buffer is zeroed first).
+    void SfenPacker::pack(const Position& pos)
+    {
+        memset(data, 0, 32 /* 256bit */);
+        stream.set_data(data);
+
+        // Fields are written in a fixed order; set_from_packed_sfen() reads them back in the same order.
+        // Side to move.
+        stream.write_one_bit((int)(pos.side_to_move()));
+
+        // King squares, one per color in Colors:
+        // White king and black king, 6 bits for each.
+        for(auto c: Colors)
+            stream.write_n_bit(pos.king_square(c), 6);
+
+        // Write the pieces on the board other than the kings (rank 8 down to rank 1, file A to H).
+        for (Rank r = RANK_8; r >= RANK_1; --r)
+        {
+            for (File f = FILE_A; f <= FILE_H; ++f)
+            {
+                Piece pc = pos.piece_on(make_square(f, r));
+                if (type_of(pc) == KING)
+                    continue;
+                write_board_piece_to_stream(pc);
+            }
+        }
+
+        // TODO(someone): Support chess960.
+        stream.write_one_bit(pos.can_castle(WHITE_OO));
+        stream.write_one_bit(pos.can_castle(WHITE_OOO));
+        stream.write_one_bit(pos.can_castle(BLACK_OO));
+        stream.write_one_bit(pos.can_castle(BLACK_OOO));
+
+        if (pos.ep_square() == SQ_NONE) {
+            stream.write_one_bit(0);
+        }
+        else {
+            stream.write_one_bit(1);
+            stream.write_n_bit(static_cast<int>(pos.ep_square()), 6);
+        }
+
+        stream.write_n_bit(pos.state()->rule50, 6);
+
+        const int fm = 1 + (pos.game_ply()-(pos.side_to_move() == BLACK)) / 2;
+        stream.write_n_bit(fm, 8);
+
+        // Write the high 8 bits of the fullmove number (fm). This is a fix for
+        // the limited range of the 8-bit field written just above.
+        // This is backwards compatible.
+        stream.write_n_bit(fm >> 8, 8);
+
+        // Write the highest bit of rule50 at the end. This is a backwards
+        // compatible fix for rule50 having only 6 bits stored.
+        // This bit is just ignored by the old parsers.
+        stream.write_n_bit(pos.state()->rule50 >> 6, 1);
+
+        assert(stream.get_cursor() <= 256);
+    }
+
+    // Write one board square to the stream: the Huffman code word, then a color bit unless the square is empty.
+    void SfenPacker::write_board_piece_to_stream(Piece pc)
+    {
+        // The code word is looked up by piece type.
+        PieceType pr = type_of(pc);
+        auto c = huffman_table[pr];
+        stream.write_n_bit(c.code, c.bits);
+
+        if (pc == NO_PIECE)
+            return;
+
+        // Color of the piece (1 bit).
+        stream.write_one_bit(color_of(pc));
+    }
+
+    // Read one board piece from stream (inverse of write_board_piece_to_stream()).
+    Piece SfenPacker::read_board_piece_from_stream()
+    {
+        PieceType pr = NO_PIECE_TYPE;
+        int code = 0, bits = 0;
+        while (true) // accumulate bits until (code, bits) matches a huffman_table entry
+        {
+            code |= stream.read_one_bit() << bits;
+            ++bits;
+            // NOTE(review): with asserts compiled out, a bit pattern matching no entry has no exit from this loop.
+            assert(bits <= 6);
+
+            for (pr = NO_PIECE_TYPE; pr <KING; ++pr)
+                if (huffman_table[pr].code == code
+                    && huffman_table[pr].bits == bits)
+                    goto Found;
+        }
+    Found:;
+        if (pr == NO_PIECE_TYPE)
+            return NO_PIECE;
+
+        // Color of the piece (1 bit).
+        Color c = (Color)stream.read_one_bit();
+
+        return make_piece(c, pr);
+    }
+    // Rebuild pos from sfen, using si as its state and th as its thread. Returns 0 on success, 1 if the board data overruns 256 bits.
+    int set_from_packed_sfen(Position& pos, const PackedSfen& sfen, StateInfo* si, Thread* th)
+    {
+        SfenPacker packer;
+        auto& stream = packer.stream;
+
+        // TODO: separate streams for writing and reading. Here we actually have to
+        // const_cast which is not safe in the long run.
+        stream.set_data(const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(&sfen)));
+
+        pos.clear();
+        std::memset(si, 0, sizeof(StateInfo));
+        std::fill_n(&pos.pieceList[0][0], sizeof(pos.pieceList) / sizeof(Square), SQ_NONE);
+        pos.st = si;
+
+        // Active color
+        pos.sideToMove = (Color)stream.read_one_bit();
+
+        pos.pieceList[W_KING][0] = SQUARE_NB;
+        pos.pieceList[B_KING][0] = SQUARE_NB;
+
+        // First the king squares: mark them on the board so the placement loop below can recognize them.
+        for (auto c : Colors)
+            pos.board[stream.read_n_bit(6)] = make_piece(c, KING);
+
+        // Piece placement
+        for (Rank r = RANK_8; r >= RANK_1; --r)
+        {
+            for (File f = FILE_A; f <= FILE_H; ++f)
+            {
+                auto sq = make_square(f, r);
+
+                // A king may already be marked on this square; every other square is read from the stream.
+                Piece pc;
+                if (type_of(pos.board[sq]) != KING)
+                {
+                    assert(pos.board[sq] == NO_PIECE);
+                    pc = packer.read_board_piece_from_stream();
+                }
+                else
+                {
+                    pc = pos.board[sq];
+                    // Clear the marker before put_piece() (the original note says put_piece() asserts otherwise).
+                    pos.board[sq] = NO_PIECE;
+                }
+
+                // Empty square: nothing to place.
+                if (pc == NO_PIECE)
+                    continue;
+
+                pos.put_piece(Piece(pc), sq);
+
+                if (stream.get_cursor()> 256)
+                    return 1; // read past the end of the 32-byte record
+            }
+        }
+
+        // Castling availability: four flag bits, read in the order pack() writes them.
+        // TODO(someone): Support chess960. NOTE(review): the rook scans below have no bound if the rook is absent.
+        pos.st->castlingRights = 0;
+        if (stream.read_one_bit()) {
+            Square rsq;
+            for (rsq = relative_square(WHITE, SQ_H1); pos.piece_on(rsq) != W_ROOK; --rsq) {}
+            pos.set_castling_right(WHITE, rsq);
+        }
+        if (stream.read_one_bit()) {
+            Square rsq;
+            for (rsq = relative_square(WHITE, SQ_A1); pos.piece_on(rsq) != W_ROOK; ++rsq) {}
+            pos.set_castling_right(WHITE, rsq);
+        }
+        if (stream.read_one_bit()) {
+            Square rsq;
+            for (rsq = relative_square(BLACK, SQ_H1); pos.piece_on(rsq) != B_ROOK; --rsq) {}
+            pos.set_castling_right(BLACK, rsq);
+        }
+        if (stream.read_one_bit()) {
+            Square rsq;
+            for (rsq = relative_square(BLACK, SQ_A1); pos.piece_on(rsq) != B_ROOK; ++rsq) {}
+            pos.set_castling_right(BLACK, rsq);
+        }
+
+        // En passant square. Ignore if no pawn capture is possible
+        if (stream.read_one_bit()) {
+            Square ep_square = static_cast<Square>(stream.read_n_bit(6));
+            pos.st->epSquare = ep_square;
+
+            if (!(pos.attackers_to(pos.st->epSquare) & pos.pieces(pos.sideToMove, PAWN))
+                || !(pos.pieces(~pos.sideToMove, PAWN) & (pos.st->epSquare + pawn_push(~pos.sideToMove))))
+                pos.st->epSquare = SQ_NONE;
+        }
+        else {
+            pos.st->epSquare = SQ_NONE;
+        }
+
+        // Halfmove clock
+        pos.st->rule50 = stream.read_n_bit(6);
+
+        // Fullmove number
+        pos.gamePly = stream.read_n_bit(8);
+
+        // Read the high 8 bits of the fullmove number. This was added as a fix for
+        // the fullmove number having only 8 bits stored.
+        // In older entries these will just be zero bits.
+        pos.gamePly |= stream.read_n_bit(8) << 8;
+
+        // Read the highest bit of rule50. This was added as a fix for rule50
+        // counter having only 6 bits stored.
+        // In older entries this will just be a zero bit.
+        pos.st->rule50 |= stream.read_n_bit(1) << 6;
+
+        // Convert from fullmove starting from 1 to gamePly starting from 0,
+        // handle also common incorrect FEN with fullmove = 0.
+        pos.gamePly = std::max(2 * (pos.gamePly - 1), 0) + (pos.sideToMove == BLACK);
+
+        assert(stream.get_cursor() <= 256);
+
+        pos.chess960 = false;
+        pos.thisThread = th;
+        pos.set_state(pos.st);
+
+        assert(pos.pos_is_ok());
+
+        return 0;
+    }
+    // Packs pos into a new PackedSfen and returns it by value.
+    PackedSfen sfen_pack(Position& pos)
+    {
+        PackedSfen sfen;
+
+        SfenPacker sp;
+        sp.data = (uint8_t*)&sfen; // pack() zeroes and then fills these 32 bytes
+        sp.pack(pos);
+
+        return sfen;
+    }
+}
diff --git a/src/learn/sfen_packer.h b/src/learn/sfen_packer.h
new file mode 100644
index 00000000..5f232fed
--- /dev/null
+++ b/src/learn/sfen_packer.h
@@ -0,0 +1,20 @@
+#ifndef _SFEN_PACKER_H_
+#define _SFEN_PACKER_H_
+
+#include "types.h"
+
+#include "learn/packed_sfen.h"
+
+#include <cstdint>
+
+class Position;
+struct StateInfo;
+class Thread;
+
+namespace Learner {
+
+    int set_from_packed_sfen(Position& pos, const PackedSfen& sfen, StateInfo* si, Thread* th);
+    PackedSfen sfen_pack(Position& pos);
+}
+
+#endif
\ No newline at end of file
diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
new file mode 100644
index 00000000..512f1165
--- /dev/null
+++ b/src/learn/sfen_reader.h
@@ -0,0 +1,365 @@
+#include "sfen_stream.h"
+
+#include "packed_sfen.h"
+
+#include "misc.h"
+
+#include <string>
+#include <vector>
+#include <deque>
+#include <memory>
+#include <mutex>
+#include <list>
+#include <atomic>
+#include <optional>
+#include <iostream>
+#include <cstdint>
+#include <thread>
+
+namespace Learner{
+
+    enum struct SfenReaderMode
+    {
+        Sequential,
+        Cyclic
+    };
+
+    // Sfen reader
+    struct SfenReader
+    {
+        // Default number of positions (sfens) in each per-thread buffer.
+        static constexpr size_t DEFAULT_THREAD_BUFFER_SIZE = 10 * 1000;
+
+        // Number of positions read from the files per chunk. A larger chunk
+        // widens the shuffle window, so positions get mixed more thoroughly,
+        // but it also increases memory consumption.
+        // The read size must be a multiple of the thread buffer size.
+        static constexpr const size_t DEFAULT_SFEN_READ_SIZE = 1000 * 1000 * 10;
+
+        // Do not use std::random_device() to seed the PRNG,
+        // because it always yields the same integers on MinGW.
+        SfenReader(
+            const std::vector<std::string>& filenames_,
+            bool do_shuffle,
+            SfenReaderMode mode_,
+            int thread_num,
+            const std::string& seed,
+            size_t read_size = DEFAULT_SFEN_READ_SIZE,
+            size_t buffer_size = DEFAULT_THREAD_BUFFER_SIZE
+        ) :
+            filenames(filenames_.begin(), filenames_.end()),
+            mode(mode_),
+            sfen_read_size(read_size),
+            thread_buffer_size(buffer_size),
+            prng(seed)
+        {
+            packed_sfens.resize(thread_num);
+            total_read = 0;
+            end_of_files = false;
+            shuffle = do_shuffle;
+            stop_flag = false;
+
+            file_worker_thread = std::thread([&] {
+                this->file_read_worker();
+            });
+        }
+
+        ~SfenReader()
+        {
+            stop_flag = true;
+
+            if (file_worker_thread.joinable())
+                file_worker_thread.join();
+        }
+
+        // Loads `count` positions for calculations such as MSE.
+        PSVector read_for_mse(uint64_t count)
+        {
+            PSVector sfen_for_mse;
+            sfen_for_mse.reserve(count);
+
+            for (uint64_t i = 0; i < count; ++i)
+            {
+                PackedSfenValue ps;
+                if (!read_to_thread_buffer(0, ps))
+                {
+                    std::cout << "ERROR (sfen_reader): Reading failed." << std::endl;
+                    return sfen_for_mse;
+                }
+
+                sfen_for_mse.push_back(ps);
+            }
+
+            return sfen_for_mse;
+        }
+
+        // Reads every entry of `file_name` that passes the filters: entries with
+        // |score| above `eval_limit` are dropped, and so are drawn games
+        // (game_result == 0) unless `use_draw_games` is set.
+        PSVector read_validation_set(const std::string& file_name, int eval_limit, bool use_draw_games)
+        {
+            PSVector sfen_for_mse;
+            auto input = open_sfen_input_file(file_name);
+            if (input == nullptr)
+            {
+                // Unknown extension: there is no stream to read from.
+                std::cout << "ERROR (sfen_reader): Unsupported file type: " << file_name << std::endl;
+                return sfen_for_mse;
+            }
+
+            while (!input->eof())
+            {
+                std::optional<PackedSfenValue> p_opt = input->next();
+                if (!p_opt.has_value())
+                    break;
+
+                const auto& p = *p_opt;
+                if (eval_limit < abs(p.score) || (!use_draw_games && p.game_result == 0))
+                    continue;
+
+                sfen_for_mse.push_back(p);
+            }
+
+            return sfen_for_mse;
+        }
+
+        // [ASYNC] Fetches one position for thread `thread_id` into ps. Returns false if none is left.
+        bool read_to_thread_buffer(size_t thread_id, PackedSfenValue& ps)
+        {
+            // If there are any positions left in the thread buffer
+            // then retrieve one and return it.
+            auto& thread_ps = packed_sfens[thread_id];
+
+            // Fill the read buffer if there is no remaining buffer,
+            // but if it doesn't even exist, finish.
+            // If the buffer is empty, fill it.
+            if ((thread_ps == nullptr || thread_ps->empty())
+                && !read_to_thread_buffer_impl(thread_id))
+                return false;
+
+            // read_to_thread_buffer_impl() returned true,
+            // Since the filling of the thread buffer with the
+            // phase has been completed successfully
+            // thread_ps->rbegin() is alive.
+
+            ps = thread_ps->back();
+            thread_ps->pop_back();
+
+            // The buffer is now exhausted; release it so the next call refills it.
+            if (thread_ps->empty())
+            {
+                thread_ps.reset();
+            }
+
+            return true;
+        }
+
+        // [ASYNC] Moves one buffer from packed_sfens_pool into the slot of thread
+        // `thread_id`. Polls until a buffer is available; returns false once the
+        // file worker has reported end of files and the pool is empty.
+        bool read_to_thread_buffer_impl(size_t thread_id)
+        {
+            while (true)
+            {
+                // Sample the flag BEFORE looking at the pool. The file worker
+                // queues buffers first and sets end_of_files afterwards, so the
+                // opposite order could see an empty pool, miss a late push and
+                // then give up although buffers are still waiting in the pool.
+                const bool no_more_input = end_of_files;
+
+                {
+                    std::unique_lock<std::mutex> lk(mutex);
+                    if (!packed_sfens_pool.empty())
+                    {
+                        packed_sfens[thread_id] = std::move(packed_sfens_pool.front());
+                        packed_sfens_pool.pop_front();
+                        total_read += thread_buffer_size;
+                        return true;
+                    }
+                }
+
+                if (no_more_input)
+                    return false;
+
+                // Waiting for the file worker to fill packed_sfens_pool.
+                // Poor man's condition variable.
+                sleep(1);
+            }
+        }
+
+        void file_read_worker()
+        {
+            std::string currentFilename;
+            uint64_t numEntriesReadFromCurrentFile = 0;
+
+            auto open_next_file = [&]() {
+                // no more
+                for(;;)
+                {
+                    sfen_input_stream.reset();
+
+                    if (filenames.empty())
+                        return false;
+
+                    // Get the next file name.
+                    currentFilename = filenames.front();
+                    filenames.pop_front();
+
+                    numEntriesReadFromCurrentFile = 0;
+
+                    sfen_input_stream = open_sfen_input_file(currentFilename);
+
+                    auto out = sync_region_cout.new_region();
+                    if (sfen_input_stream == nullptr)
+                    {
+                        out << "INFO (sfen_reader): File does not exist: " << currentFilename << '\n';
+                    }
+                    else
+                    {
+                        out << "INFO (sfen_reader): Opened file for reading: " << currentFilename << '\n';
+
+                        // in case the file is empty or was deleted.
+                        if (sfen_input_stream->eof())
+                        {
+                            out << "  - File empty, nothing to read.\n";
+                        }
+                        else
+                        {
+                            return true;
+                        }
+                    }
+                }
+            };
+
+            if (sfen_input_stream == nullptr && !open_next_file())
+            {
+                auto out = sync_region_cout.new_region();
+                out << "INFO (sfen_reader): End of files." << std::endl;
+                end_of_files = true;
+                return;
+            }
+
+            while (true)
+            {
+                // Wait until the pool has room for another chunk of buffers.
+                // NOTE(review): size() is read here without holding `mutex`; confirm.
+                while (!stop_flag && packed_sfens_pool.size() >= sfen_read_size / thread_buffer_size)
+                    sleep(100);
+
+                if (stop_flag)
+                    return;
+
+                PSVector sfens;
+                sfens.reserve(sfen_read_size);
+
+                // Read from the file into the file buffer.
+                while (sfens.size() < sfen_read_size)
+                {
+                    std::optional<PackedSfenValue> p = sfen_input_stream->next();
+                    if (p.has_value())
+                    {
+                        sfens.push_back(*p);
+                        ++numEntriesReadFromCurrentFile;
+                    }
+                    else
+                    {
+                        if (mode == SfenReaderMode::Cyclic
+                            && numEntriesReadFromCurrentFile > 0)
+                        {
+                            // The file contained data so we add it again to the end of the queue.
+                            filenames.emplace_back(currentFilename);
+                        }
+
+                        if(!open_next_file())
+                        {
+                            // There was no next file. Stop; the partial chunk in sfens is dropped.
+                            auto out = sync_region_cout.new_region();
+                            out << "INFO (sfen_reader): End of files." << std::endl;
+                            end_of_files = true;
+                            return;
+                        }
+                    }
+                }
+
+                // Shuffle the read phase data.
+                if (shuffle)
+                {
+                    Algo::shuffle(sfens, prng);
+                }
+
+                // Divide this by thread_buffer_size. There should be size pieces.
+                // sfen_read_size shall be a multiple of thread_buffer_size.
+                assert((sfen_read_size % thread_buffer_size) == 0);
+
+                auto size = size_t(sfen_read_size / thread_buffer_size);
+                std::vector<std::unique_ptr<PSVector>> buffers;
+                buffers.reserve(size);
+
+                for (size_t i = 0; i < size; ++i)
+                {
+                    // Delete this pointer on the receiving side.
+                    auto buf = std::make_unique<PSVector>();
+                    buf->resize(thread_buffer_size);
+                    memcpy(
+                        buf->data(),
+                        &sfens[i * thread_buffer_size],
+                        sizeof(PackedSfenValue) * thread_buffer_size);
+
+                    buffers.emplace_back(std::move(buf));
+                }
+
+                {
+                    std::unique_lock<std::mutex> lk(mutex);
+
+                    // The mutex lock is required because the
+                    // contents of packed_sfens_pool are changed.
+
+                    for (auto& buf : buffers)
+                        packed_sfens_pool.emplace_back(std::move(buf));
+                }
+            }
+        }
+
+    protected:
+
+        // worker thread reading file in background
+        std::thread file_worker_thread;
+
+        // sfen files
+        std::deque<std::string> filenames;
+
+        std::atomic<bool> stop_flag;
+
+        // number of phases read (file to memory buffer)
+        std::atomic<uint64_t> total_read;
+
+        // Do not shuffle when reading the phase.
+        bool shuffle;
+
+        SfenReaderMode mode;
+
+        size_t sfen_read_size;
+        size_t thread_buffer_size;
+
+        // Random number to shuffle when reading the phase
+        PRNG prng;
+
+        // Did you read the files and reached the end?
+        std::atomic<bool> end_of_files;
+
+        // handle of sfen file
+        std::unique_ptr<BasicSfenInputStream> sfen_input_stream;
+
+        // sfen for each thread
+        // (When the thread is used up, the thread should call delete to release it.)
+        std::vector<std::unique_ptr<PSVector>> packed_sfens;
+
+        // Mutex when accessing packed_sfens_pool
+        std::mutex mutex;
+
+        // pool of sfen. The worker thread read from the file is added here.
+        // Each worker thread fills its own packed_sfens[thread_id] from here.
+        // * Lock and access the mutex.
+        std::list<std::unique_ptr<PSVector>> packed_sfens_pool;
+    };
+}
diff --git a/src/learn/sfen_stream.h b/src/learn/sfen_stream.h
new file mode 100644
index 00000000..da411346
--- /dev/null
+++ b/src/learn/sfen_stream.h
@@ -0,0 +1,222 @@
+#ifndef _SFEN_STREAM_H_
+#define _SFEN_STREAM_H_
+
+#include "packed_sfen.h"
+
+#include "extra/nnue_data_binpack_format.h"
+
+#include <optional>
+#include <fstream>
+#include <string>
+#include <memory>
+
+namespace Learner {
+
+    enum struct SfenOutputType
+    {
+        Bin,
+        Binpack
+    };
+
+    // Returns true if `lhs` ends with the character sequence `end`.
+    static bool ends_with(const std::string& lhs, const std::string& end)
+    {
+        if (end.size() > lhs.size()) return false;
+        return std::equal(end.rbegin(), end.rend(), lhs.rbegin());
+    }
+
+    // Returns true if `filename` ends with "." followed by `extension`.
+    static bool has_extension(const std::string& filename, const std::string& extension)
+    {
+        return ends_with(filename, "." + extension);
+    }
+
+    // Returns `filename` unchanged if it already carries the extension `ext`,
+    // otherwise appends "." + ext. The dot is part of the check, so a name
+    // like "cabin" still becomes "cabin.bin" for ext == "bin".
+    static std::string filename_with_extension(const std::string& filename, const std::string& ext)
+    {
+        if (has_extension(filename, ext))
+            return filename;
+
+        return filename + "." + ext;
+    }
+
+    struct BasicSfenInputStream
+    {
+        virtual std::optional<PackedSfenValue> next() = 0;
+        virtual bool eof() const = 0;
+        virtual ~BasicSfenInputStream() {}
+    };
+
+    struct BinSfenInputStream : BasicSfenInputStream
+    {
+        static constexpr auto openmode = std::ios::in | std::ios::binary;
+        static inline const std::string extension = "bin";
+
+        BinSfenInputStream(std::string filename) :
+            m_stream(filename, openmode),
+            m_eof(!m_stream)
+        {
+        }
+
+        std::optional<PackedSfenValue> next() override
+        {
+            PackedSfenValue e;
+            if(m_stream.read(reinterpret_cast<char*>(&e), sizeof(PackedSfenValue)))
+            {
+                return e;
+            }
+            else
+            {
+                m_eof = true;
+                return std::nullopt;
+            }
+        }
+
+        bool eof() const override
+        {
+            return m_eof;
+        }
+
+        ~BinSfenInputStream() override {}
+
+    private:
+        std::fstream m_stream;
+        bool m_eof;
+    };
+
+    struct BinpackSfenInputStream : BasicSfenInputStream
+    {
+        static constexpr auto openmode = std::ios::in | std::ios::binary;
+        static inline const std::string extension = "binpack";
+
+        BinpackSfenInputStream(std::string filename) :
+            m_stream(filename, openmode),
+            m_eof(!m_stream.hasNext())
+        {
+        }
+
+        std::optional<PackedSfenValue> next() override
+        {
+            static_assert(sizeof(binpack::nodchip::PackedSfenValue) == sizeof(PackedSfenValue));
+
+            if (!m_stream.hasNext())
+            {
+                m_eof = true;
+                return std::nullopt;
+            }
+
+            auto training_data_entry = m_stream.next();
+            auto v = binpack::trainingDataEntryToPackedSfenValue(training_data_entry);
+            PackedSfenValue psv;
+            // same layout, different types. One is from generic library.
+            std::memcpy(&psv, &v, sizeof(PackedSfenValue));
+
+            return psv;
+        }
+
+        bool eof() const override
+        {
+            return m_eof;
+        }
+
+        ~BinpackSfenInputStream() override {}
+
+    private:
+        binpack::CompressedTrainingDataEntryReader m_stream;
+        bool m_eof;
+    };
+
+    struct BasicSfenOutputStream
+    {
+        virtual void write(const PSVector& sfens) = 0;
+        virtual ~BasicSfenOutputStream() {}
+    };
+
+    struct BinSfenOutputStream : BasicSfenOutputStream
+    {
+        static constexpr auto openmode = std::ios::out | std::ios::binary | std::ios::app;
+        static inline const std::string extension = "bin";
+
+        BinSfenOutputStream(std::string filename) :
+            m_stream(filename_with_extension(filename, extension), openmode)
+        {
+        }
+
+        void write(const PSVector& sfens) override
+        {
+            m_stream.write(reinterpret_cast<const char*>(sfens.data()), sizeof(PackedSfenValue) * sfens.size());
+        }
+
+        ~BinSfenOutputStream() override {}
+
+    private:
+        std::fstream m_stream;
+    };
+
+    struct BinpackSfenOutputStream : BasicSfenOutputStream
+    {
+        static constexpr auto openmode = std::ios::out | std::ios::binary | std::ios::app;
+        static inline const std::string extension = "binpack";
+
+        BinpackSfenOutputStream(std::string filename) :
+            m_stream(filename_with_extension(filename, extension), openmode)
+        {
+        }
+
+        void write(const PSVector& sfens) override
+        {
+            static_assert(sizeof(binpack::nodchip::PackedSfenValue) == sizeof(PackedSfenValue));
+
+            for(auto& sfen : sfens)
+            {
+                // The library uses a type that's different but layout-compatibile.
+                binpack::nodchip::PackedSfenValue e;
+                std::memcpy(&e, &sfen, sizeof(binpack::nodchip::PackedSfenValue));
+                m_stream.addTrainingDataEntry(binpack::packedSfenValueToTrainingDataEntry(e));
+            }
+        }
+
+        ~BinpackSfenOutputStream() override {}
+
+    private:
+        binpack::CompressedTrainingDataEntryWriter m_stream;
+    };
+
+    // Picks the input stream type from the extension; nullptr if unknown.
+    inline std::unique_ptr<BasicSfenInputStream> open_sfen_input_file(const std::string& filename)
+    {
+        if (has_extension(filename, BinSfenInputStream::extension))
+            return std::make_unique<BinSfenInputStream>(filename);
+        if (has_extension(filename, BinpackSfenInputStream::extension))
+            return std::make_unique<BinpackSfenInputStream>(filename);
+        return nullptr;
+    }
+
+    // Creates an output stream of the requested format writing to `filename`.
+    inline std::unique_ptr<BasicSfenOutputStream> create_new_sfen_output(const std::string& filename, SfenOutputType sfen_output_type)
+    {
+        if (sfen_output_type == SfenOutputType::Bin)
+            return std::make_unique<BinSfenOutputStream>(filename);
+
+        if (sfen_output_type == SfenOutputType::Binpack)
+            return std::make_unique<BinpackSfenOutputStream>(filename);
+
+        // Not a known SfenOutputType value.
+        assert(false);
+        return nullptr;
+    }
+
+    // Picks the output stream type from the extension; nullptr if unknown.
+    inline std::unique_ptr<BasicSfenOutputStream> create_new_sfen_output(const std::string& filename)
+    {
+        if (has_extension(filename, BinSfenOutputStream::extension))
+            return std::make_unique<BinSfenOutputStream>(filename);
+        if (has_extension(filename, BinpackSfenOutputStream::extension))
+            return std::make_unique<BinpackSfenOutputStream>(filename);
+        return nullptr;
+    }
+}
+
+#endif
\ No newline at end of file
diff --git a/src/learn/sfen_writer.h b/src/learn/sfen_writer.h
new file mode 100644
index 00000000..1bbd916c
--- /dev/null
+++ b/src/learn/sfen_writer.h
@@ -0,0 +1,206 @@
+#include "packed_sfen.h"
+#include "sfen_stream.h"
+
+#include "misc.h"
+
+#include "extra/nnue_data_binpack_format.h"
+
+#include "syzygy/tbprobe.h"
+
+#include <cstring>
+#include <filesystem>
+#include <fstream>
+#include <limits>
+#include <list>
+#include <memory>
+#include <optional>
+#include <shared_mutex>
+#include <thread>
+#include <atomic>
+
+using namespace std;
+
+namespace Learner {
+
+    // Helper class for exporting Sfen
+    struct SfenWriter
+    {
+        // Amount of sfens required to flush the buffer.
+        static constexpr size_t SFEN_WRITE_SIZE = 5000;
+
+        // File name to write and number of threads to create
+        SfenWriter(string filename_, int thread_num, uint64_t save_count, SfenOutputType sfen_output_type)
+        {
+            sfen_buffers_pool.reserve((size_t)thread_num * 10);
+            sfen_buffers.resize(thread_num);
+
+            auto out = sync_region_cout.new_region();
+            out << "INFO (sfen_writer): Creating new data file at " << filename_ << endl;
+
+            sfen_format = sfen_output_type;
+            output_file_stream = create_new_sfen_output(filename_, sfen_format);
+            filename = filename_;
+            save_every = save_count;
+
+            finished = false;
+
+            file_worker_thread = std::thread([&] { this->file_write_worker(); });
+        }
+
+        ~SfenWriter()
+        {
+            flush();
+
+            finished = true;
+            file_worker_thread.join();
+            output_file_stream.reset();
+
+#if !defined(NDEBUG)
+            {
+                // All buffers should be empty since file_worker_thread
+                // should have written everything before exiting.
+                for (const auto& p : sfen_buffers) { assert(p == nullptr); (void)p ; }
+                assert(sfen_buffers_pool.empty());
+            }
+#endif
+        }
+
+        void write(size_t thread_id, const PackedSfenValue& psv)
+        {
+            // We have a buffer for each thread and add it there.
+            // If the buffer overflows, write it to a file.
+
+            // This buffer is prepared for each thread.
+            auto& buf = sfen_buffers[thread_id];
+
+            // Secure since there is no buf at the first time
+            // and immediately after writing the thread buffer.
+            if (!buf)
+            {
+                buf = std::make_unique<PSVector>();
+                buf->reserve(SFEN_WRITE_SIZE);
+            }
+
+            // Buffer is exclusive to this thread.
+            // There is no need for a critical section.
+            buf->push_back(psv);
+
+            if (buf->size() >= SFEN_WRITE_SIZE)
+            {
+                // If you load it in sfen_buffers_pool, the worker will do the rest.
+
+                // Critical section since sfen_buffers_pool is shared among threads.
+                std::unique_lock<std::mutex> lk(mutex);
+                sfen_buffers_pool.emplace_back(std::move(buf));
+            }
+        }
+
+        void flush()
+        {
+            for (size_t i = 0; i < sfen_buffers.size(); ++i)
+            {
+                flush(i);
+            }
+        }
+
+        // Move what remains in the buffer for your thread to a buffer for writing to a file.
+        void flush(size_t thread_id)
+        {
+            std::unique_lock<std::mutex> lk(mutex);
+
+            auto& buf = sfen_buffers[thread_id];
+
+            // There is a case that buf==nullptr, so that check is necessary.
+            if (buf && buf->size() != 0)
+            {
+                sfen_buffers_pool.emplace_back(std::move(buf));
+            }
+        }
+
+        // Dedicated thread to write to file. Runs until `finished` is set and
+        // every buffer queued in sfen_buffers_pool has been written out.
+        void file_write_worker()
+        {
+            for (;;)
+            {
+                // Sample the flag before draining: the destructor flushes all
+                // buffers into the pool before setting it, so an empty drain
+                // after the flag was seen means there is nothing left to write.
+                const bool was_finished = finished;
+                vector<std::unique_ptr<PSVector>> buffers;
+                {
+                    // Access the pool only while holding the mutex.
+                    // Take the filled buffers and leave a fresh pool behind.
+                    std::unique_lock<std::mutex> lk(mutex);
+                    buffers = std::move(sfen_buffers_pool);
+                    sfen_buffers_pool = std::vector<std::unique_ptr<PSVector>>();
+                }
+
+                if (buffers.empty())
+                {
+                    if (was_finished)
+                        return;
+                    // Poor man's condition variable.
+                    sleep(100);
+                    continue;
+                }
+
+                for (auto& buf : buffers)
+                {
+                    output_file_stream->write(*buf);
+                    sfen_write_count += buf->size();
+
+                    // Count towards the current file; once save_every is
+                    // reached, switch to a new file and reset the counter.
+                    sfen_write_count_current_file += buf->size();
+                    if (sfen_write_count_current_file < save_every)
+                        continue;
+
+                    sfen_write_count_current_file = 0;
+
+                    // Sequential number attached to the file name.
+                    int n = (int)(sfen_write_count / save_every);
+                    string new_filename = filename + "_" + std::to_string(n);
+                    output_file_stream = create_new_sfen_output(new_filename, sfen_format);
+
+                    auto out = sync_region_cout.new_region();
+                    out << "INFO (sfen_writer): Creating new data file at " << new_filename << endl;
+                }
+            }
+        }
+
+    private:
+
+        std::unique_ptr<BasicSfenOutputStream> output_file_stream;
+
+        // A new net is saved after every save_every sfens are processed.
+        uint64_t save_every = std::numeric_limits<uint64_t>::max();
+
+        // File name passed in the constructor
+        std::string filename;
+
+        // Thread to write to the file
+        std::thread file_worker_thread;
+
+        // Flag that all threads have finished
+        atomic<bool> finished;
+
+        SfenOutputType sfen_format;
+
+        // buffer before writing to file
+        // sfen_buffers is the buffer for each thread
+        // sfen_buffers_pool is a buffer for writing.
+        // After loading the phase in the former buffer by SFEN_WRITE_SIZE,
+        // transfer it to the latter.
+        std::vector<std::unique_ptr<PSVector>> sfen_buffers;
+        std::vector<std::unique_ptr<PSVector>> sfen_buffers_pool;
+
+        // Mutex required to access sfen_buffers_pool
+        std::mutex mutex;
+
+        // Number of sfens written in total, and the
+        // number of sfens written in the current file.
+        uint64_t sfen_write_count = 0;
+        uint64_t sfen_write_count_current_file = 0;
+    };
+}
diff --git a/src/learn/transform.cpp b/src/learn/transform.cpp
new file mode 100644
index 00000000..5687b48b
--- /dev/null
+++ b/src/learn/transform.cpp
@@ -0,0 +1,242 @@
+#include "transform.h"
+
+#include "sfen_stream.h"
+#include "packed_sfen.h"
+
+#include "thread.h"
+#include "position.h"
+#include "evaluate.h"
+
+#include "nnue/evaluate_nnue.h"
+
+#include <string>
+#include <map>
+#include <iostream>
+#include <cmath>
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+
+namespace Learner
+{
+    using CommandFunc = void(*)(std::istringstream&);
+
+    enum struct NudgedStaticMode
+    {
+        Absolute,
+        Relative,
+        Interpolate
+    };
+
+    struct NudgedStaticParams
+    {
+        std::string input_filename = "in.binpack";
+        std::string output_filename = "out.binpack";
+        NudgedStaticMode mode = NudgedStaticMode::Absolute;
+        int absolute_nudge = 5;
+        float relative_nudge = 0.1;
+        float interpolate_nudge = 0.1;
+
+        void enforce_constraints()
+        {
+            relative_nudge = std::max(relative_nudge, 0.0f);
+            absolute_nudge = std::max(absolute_nudge, 0);
+        }
+    };
+
+    // Moves the static eval towards the deep eval according to params.mode
+    // and returns the result saturated to the int16 range.
+    [[nodiscard]] std::int16_t nudge(NudgedStaticParams& params, std::int16_t static_eval_i16, std::int16_t deep_eval_i16)
+    {
+        constexpr int i16_min = std::numeric_limits<std::int16_t>::min();
+        constexpr int i16_max = std::numeric_limits<std::int16_t>::max();
+        auto saturate_i32_to_i16 = [=](int v) {
+            return static_cast<std::int16_t>(std::clamp(v, i16_min, i16_max));
+        };
+
+        // Clamp in the float domain before converting: casting an out-of-range
+        // float to int is undefined behaviour. Both bounds are exact in float.
+        auto saturate_f32_to_i16 = [=](float v) {
+            return static_cast<std::int16_t>(std::clamp(v, (float)i16_min, (float)i16_max));
+        };
+
+        int static_eval = static_eval_i16;
+        int deep_eval = deep_eval_i16;
+
+        switch(params.mode)
+        {
+            case NudgedStaticMode::Absolute:
+                return saturate_i32_to_i16(
+                    static_eval
+                    + std::clamp(deep_eval - static_eval, -params.absolute_nudge, params.absolute_nudge));
+
+            case NudgedStaticMode::Relative:
+                // A relative nudge around zero stays zero; returning early
+                // also avoids 0/0 = NaN when both evals are zero.
+                if (static_eval == 0)
+                    return 0;
+                return saturate_f32_to_i16(
+                    (float)static_eval * std::clamp(
+                        (float)deep_eval / (float)static_eval,
+                        (1.0f - params.relative_nudge),
+                        (1.0f + params.relative_nudge)
+                    )
+                );
+
+            case NudgedStaticMode::Interpolate:
+                return saturate_f32_to_i16(
+                    (float)static_eval * (1.0f - params.interpolate_nudge)
+                    + (float)deep_eval * params.interpolate_nudge
+                );
+
+            default:
+                assert(false);
+                return 0;
+        }
+    }
+
+    void do_nudged_static(NudgedStaticParams& params)
+    {
+        Thread* th = Threads.main();
+        Position& pos = th->rootPos;
+        StateInfo si;
+
+        auto in = Learner::open_sfen_input_file(params.input_filename);
+        auto out = Learner::create_new_sfen_output(params.output_filename);
+
+        if (in == nullptr)
+        {
+            std::cerr << "Invalid input file type.\n";
+            return;
+        }
+
+        if (out == nullptr)
+        {
+            std::cerr << "Invalid output file type.\n";
+            return;
+        }
+
+        PSVector buffer;
+        uint64_t batch_size = 1'000'000;
+
+        buffer.reserve(batch_size);
+
+        uint64_t num_processed = 0;
+        for (;;)
+        {
+            auto v = in->next();
+            if (!v.has_value())
+                break;
+
+            auto& ps = v.value();
+
+            pos.set_from_packed_sfen(ps.sfen, &si, th);
+            auto static_eval = Eval::evaluate(pos);
+            auto deep_eval = ps.score;
+            ps.score = nudge(params, static_eval, deep_eval);
+
+            buffer.emplace_back(ps);
+            if (buffer.size() >= batch_size)
+            {
+                num_processed += buffer.size();
+
+                out->write(buffer);
+                buffer.clear();
+
+                std::cout << "Processed " << num_processed << " positions.\n";
+            }
+        }
+
+        if (!buffer.empty())
+        {
+            num_processed += buffer.size();
+
+            out->write(buffer);
+            buffer.clear();
+
+            std::cout << "Processed " << num_processed << " positions.\n";
+        }
+
+        std::cout << "Finished.\n";
+    }
+
+    // Parses the "nudged_static" arguments from `is`, prints them and runs the transform.
+    void nudged_static(std::istringstream& is)
+    {
+        NudgedStaticParams params{};
+
+        while(true)
+        {
+            std::string token;
+            is >> token;
+            if (token == "")
+                break;
+
+            if (token == "absolute")
+            {
+                params.mode = NudgedStaticMode::Absolute;
+                is >> params.absolute_nudge;
+            }
+            else if (token == "relative")
+            {
+                params.mode = NudgedStaticMode::Relative;
+                is >> params.relative_nudge;
+            }
+            else if (token == "interpolate")
+            {
+                params.mode = NudgedStaticMode::Interpolate;
+                is >> params.interpolate_nudge;
+            }
+            else if (token == "input_file")
+                is >> params.input_filename;
+            else if (token == "output_file")
+                is >> params.output_filename;
+        }
+
+        // Enforce constraints first so the log shows the values actually used.
+        params.enforce_constraints();
+        std::cout << "Performing transform nudged_static with parameters:\n";
+        std::cout << "input_file          : " << params.input_filename << '\n';
+        std::cout << "output_file         : " << params.output_filename << '\n';
+        std::cout << "\n";
+        if (params.mode == NudgedStaticMode::Absolute)
+        {
+            std::cout << "mode                : absolute\n";
+            std::cout << "absolute_nudge      : " << params.absolute_nudge << '\n';
+        }
+        else if (params.mode == NudgedStaticMode::Relative)
+        {
+            std::cout << "mode                : relative\n";
+            std::cout << "relative_nudge      : " << params.relative_nudge << '\n';
+        }
+        else if (params.mode == NudgedStaticMode::Interpolate)
+        {
+            std::cout << "mode                : interpolate\n";
+            std::cout << "interpolate_nudge   : " << params.interpolate_nudge << '\n';
+        }
+        std::cout << '\n';
+        do_nudged_static(params);
+    }
+
+    void transform(std::istringstream& is)
+    {
+        const std::map<std::string, CommandFunc> subcommands = {
+            { "nudged_static", &nudged_static }
+        };
+
+        Eval::NNUE::init();
+
+        std::string subcommand;
+        is >> subcommand;
+
+        auto func = subcommands.find(subcommand);
+        if (func == subcommands.end())
+        {
+            std::cout << "Invalid subcommand " << subcommand << ". Exiting...\n";
+            return;
+        }
+
+        func->second(is);
+    }
+
+}
diff --git a/src/learn/transform.h b/src/learn/transform.h
new file mode 100644
index 00000000..8a6921a0
--- /dev/null
+++ b/src/learn/transform.h
@@ -0,0 +1,12 @@
+#ifndef LEARN_TRANSFORM_H_INCLUDED
+#define LEARN_TRANSFORM_H_INCLUDED
+
+#include <sstream>
+
+namespace Learner {
+    // Reads a subcommand name from `is` and runs it on the rest of the stream.
+    // Defined in transform.cpp.
+    void transform(std::istringstream& is);
+}
+
+#endif // #ifndef LEARN_TRANSFORM_H_INCLUDED
diff --git a/src/main.cpp b/src/main.cpp
index fbad6622..1a13dc62 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -18,6 +18,8 @@
 
 #include <iostream>
 
+#include "nnue/evaluate_nnue.h"
+
 #include "bitboard.h"
 #include "endgame.h"
 #include "position.h"
@@ -35,6 +37,7 @@ int main(int argc, char* argv[]) {
 
   std::cout << engine_info() << std::endl;
 
+  CommandLine::init(argc, argv);
   UCI::init(Options);
   Tune::init();
   PSQT::init();
@@ -44,7 +47,7 @@ int main(int argc, char* argv[]) {
   Endgames::init();
   Threads.set(size_t(Options["Threads"]));
   Search::clear(); // After threads are up
-  Eval::init_NNUE();
+  Eval::NNUE::init();
 
   UCI::loop(argc, argv);
 
diff --git a/src/misc.cpp b/src/misc.cpp
index 851280fe..eb68e842 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -61,6 +61,8 @@ typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
 
 using namespace std;
 
+SynchronizedRegionLogger sync_region_cout(std::cout);
+
 namespace {
 
 /// Version number. If Version is left empty, then compile date in the format
@@ -132,6 +134,7 @@ public:
 
 } // namespace
 
+
 /// engine_info() returns the full name of the current Stockfish version. This
 /// will be either "Stockfish <Tag> DD-MM-YY" (where DD-MM-YY is the date when
 /// the program was compiled) or "Stockfish <Version>", depending on whether
@@ -356,27 +359,11 @@ void std_aligned_free(void* ptr) {
 #endif
 }
 
-/// aligned_ttmem_alloc() will return suitably aligned memory, if possible using large pages.
-/// The returned pointer is the aligned one, while the mem argument is the one that needs
-/// to be passed to free. With c++17 some of this functionality could be simplified.
+/// aligned_large_pages_alloc() will return suitably aligned memory, if possible using large pages.
 
-#if defined(__linux__) && !defined(__ANDROID__)
+#if defined(_WIN32)
 
-void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
-
-  constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page sizes
-  size_t size = ((allocSize + alignment - 1) / alignment) * alignment; // multiple of alignment
-  if (posix_memalign(&mem, alignment, size))
-     mem = nullptr;
-#if defined(MADV_HUGEPAGE)
-  madvise(mem, allocSize, MADV_HUGEPAGE);
-#endif
-  return mem;
-}
-
-#elif defined(_WIN64)
-
-static void* aligned_ttmem_alloc_large_pages(size_t allocSize) {
+static void* aligned_large_pages_alloc_win(size_t allocSize) {
 
   HANDLE hProcessToken { };
   LUID luid { };
@@ -421,23 +408,10 @@ static void* aligned_ttmem_alloc_large_pages(size_t allocSize) {
   return mem;
 }
 
-void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
-
-  static bool firstCall = true;
+void* aligned_large_pages_alloc(size_t allocSize) {
 
   // Try to allocate large pages
-  mem = aligned_ttmem_alloc_large_pages(allocSize);
-
-  // Suppress info strings on the first call. The first call occurs before 'uci'
-  // is received and in that case this output confuses some GUIs.
-  if (!firstCall)
-  {
-      if (mem)
-          sync_cout << "info string Hash table allocation: Windows large pages used." << sync_endl;
-      else
-          sync_cout << "info string Hash table allocation: Windows large pages not used." << sync_endl;
-  }
-  firstCall = false;
+  void* mem = aligned_large_pages_alloc_win(allocSize);
 
   // Fall back to regular, page aligned, allocation if necessary
   if (!mem)
@@ -448,23 +422,31 @@ void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
 
 #else
 
-void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
+void* aligned_large_pages_alloc(size_t allocSize) {
 
-  constexpr size_t alignment = 64; // assumed cache line size
-  size_t size = allocSize + alignment - 1; // allocate some extra space
-  mem = malloc(size);
-  void* ret = reinterpret_cast<void*>((uintptr_t(mem) + alignment - 1) & ~uintptr_t(alignment - 1));
-  return ret;
+#if defined(__linux__)
+  constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page size
+#else
+  constexpr size_t alignment = 4096; // assumed small page size
+#endif
+
+  // round up to multiples of alignment
+  size_t size = ((allocSize + alignment - 1) / alignment) * alignment;
+  void *mem = std_aligned_alloc(alignment, size);
+#if defined(MADV_HUGEPAGE)
+  madvise(mem, size, MADV_HUGEPAGE);
+#endif
+  return mem;
 }
 
 #endif
 
 
-/// aligned_ttmem_free() will free the previously allocated ttmem
+/// aligned_large_pages_free() will free the previously allocated ttmem
 
-#if defined(_WIN64)
+#if defined(_WIN32)
 
-void aligned_ttmem_free(void* mem) {
+void aligned_large_pages_free(void* mem) {
 
   if (mem && !VirtualFree(mem, 0, MEM_RELEASE))
   {
@@ -477,8 +459,8 @@ void aligned_ttmem_free(void* mem) {
 
 #else
 
-void aligned_ttmem_free(void *mem) {
-  free(mem);
+void aligned_large_pages_free(void *mem) {
+  std_aligned_free(mem);
 }
 
 #endif
@@ -590,6 +572,62 @@ void bindThisThread(size_t idx) {
 
 } // namespace WinProcGroup
 
+#ifdef _WIN32
+#include <direct.h>
+#define GETCWD _getcwd
+#else
+#include <unistd.h>
+#define GETCWD getcwd
+#endif
+
+namespace CommandLine {
+
+string argv0;            // path+name of the executable binary, as given by argv[0]
+string binaryDirectory;  // path of the executable directory
+string workingDirectory; // path of the working directory
+
+void init(int argc, char* argv[]) {  // fills argv0, workingDirectory and binaryDirectory
+    (void)argc;  // unused; the cast silences the unused-parameter warning
+    string pathSeparator;
+
+    // extract the path+name of the executable binary
+    argv0 = argv[0];
+
+#ifdef _WIN32
+    pathSeparator = "\\";
+  #ifdef _MSC_VER
+    // Under windows argv[0] may not have the extension. Also _get_pgmptr() had
+    // issues in some windows 10 versions, so check returned values carefully.
+    char* pgmptr = nullptr;
+    if (!_get_pgmptr(&pgmptr) && pgmptr != nullptr && *pgmptr)
+        argv0 = pgmptr;
+  #endif
+#else
+    pathSeparator = "/";
+#endif
+
+    // extract the working directory
+    workingDirectory = "";  // stays empty if the lookup below fails
+    char buff[40000];  // scratch buffer for the current-directory path
+    char* cwd = GETCWD(buff, 40000);
+    if (cwd)
+        workingDirectory = cwd;
+
+    // extract the binary directory path from argv0
+    binaryDirectory = argv0;
+    size_t pos = binaryDirectory.find_last_of("\\/");
+    if (pos == std::string::npos)
+        binaryDirectory = "." + pathSeparator;  // no separator in argv0: use "." + separator
+    else
+        binaryDirectory.resize(pos + 1);  // keep everything up to and including the last separator
+
+    // A path starting with "." + separator gets that "." replaced by the working directory
+    if (binaryDirectory.find("." + pathSeparator) == 0)
+        binaryDirectory.replace(0, 1, workingDirectory);  // only the '.' is replaced; the separator stays
+}
+
+
+} // namespace CommandLine
 // Returns a string that represents the current time. (Used when learning evaluation functions)
 std::string now_string()
 {
@@ -627,18 +665,27 @@ void* aligned_malloc(size_t size, size_t align)
     return p;
 }
 
+std::uint64_t get_file_size(std::fstream& fs)  // distance between the begin and end positions of `fs`
+{
+    auto pos = fs.tellg();  // remember the current read position
+
+    fs.seekg(0, fstream::end);
+    const uint64_t eofPos = (uint64_t)fs.tellg();
+    fs.clear(); // Otherwise, the next seek may fail.
+    fs.seekg(0, fstream::beg);
+    const uint64_t begPos = (uint64_t)fs.tellg();
+    fs.seekg(pos);  // put the read position back where it was on entry
+
+    return eofPos - begPos;
+}
+
 int read_file_to_memory(std::string filename, std::function<void* (uint64_t)> callback_func)
 {
     fstream fs(filename, ios::in | ios::binary);
     if (fs.fail())
         return 1;
 
-    fs.seekg(0, fstream::end);
-    uint64_t eofPos = (uint64_t)fs.tellg();
-    fs.clear(); // Otherwise the next seek may fail.
-    fs.seekg(0, fstream::beg);
-    uint64_t begPos = (uint64_t)fs.tellg();
-    uint64_t file_size = eofPos - begPos;
+    const uint64_t file_size = get_file_size(fs);
     //std::cout << "filename = " << filename << " , file_size = " << file_size << endl;
 
     // I know the file size, so call callback_func to get a buffer for this,
@@ -687,66 +734,3 @@ int write_memory_to_file(std::string filename, void* ptr, uint64_t size)
     fs.close();
     return 0;
 }
-
-// ----------------------------
-//     mkdir wrapper
-// ----------------------------
-
-// Specify relative to the current folder. Returns 0 on success, non-zero on failure.
-// Create a folder. Japanese is not used.
-// In case of gcc under msys2 environment, folder creation fails with _wmkdir(). Cause unknown.
-// Use _mkdir() because there is no help for it.
-
-#if defined(_WIN32)
-// for Windows
-
-#if defined(_MSC_VER)
-#include <codecvt> // I need this because I want wstring to mkdir
-#include <locale> // This is required for wstring_convert.
-
-namespace Dependency {
-    int mkdir(std::string dir_name)
-    {
-        std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> cv;
-        return _wmkdir(cv.from_bytes(dir_name).c_str());
-        //	::CreateDirectory(cv.from_bytes(dir_name).c_str(),NULL);
-    }
-}
-
-#elif defined(__GNUC__) 
-
-#include <direct.h>
-namespace Dependency {
-    int mkdir(std::string dir_name)
-    {
-        return _mkdir(dir_name.c_str());
-    }
-}
-
-#endif
-#elif defined(__linux__)
-
-// In the linux environment, this symbol _LINUX is defined in the makefile.
-
-// mkdir implementation for Linux.
-#include "sys/stat.h"
-
-namespace Dependency {
-    int mkdir(std::string dir_name)
-    {
-        return ::mkdir(dir_name.c_str(), 0777);
-    }
-}
-#else
-
-// In order to judge whether it is a Linux environment, we have to divide the makefile..
-// The function to dig a folder on linux is good for the time being... Only used to save the evaluation function file...
-
-namespace Dependency {
-    int mkdir(std::string dir_name)
-    {
-        return 0;
-    }
-}
-
-#endif
diff --git a/src/misc.h b/src/misc.h
index 19bb008c..c7cf3265 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -19,6 +19,7 @@
 #ifndef MISC_H_INCLUDED
 #define MISC_H_INCLUDED
 
+#include <algorithm>
 #include <cassert>
 #include <chrono>
 #include <functional>
@@ -27,6 +28,12 @@
 #include <string>
 #include <vector>
 
+#include <cstdint>
+#include <cmath>
+#include <cctype>
+#include <sstream>
+#include <deque>
+
 #include "types.h"
 
 const std::string engine_info(bool to_uci = false);
@@ -35,8 +42,8 @@ void prefetch(void* addr);
 void start_logger(const std::string& fname);
 void* std_aligned_alloc(size_t alignment, size_t size);
 void std_aligned_free(void* ptr);
-void* aligned_ttmem_alloc(size_t size, void*& mem);
-void aligned_ttmem_free(void* mem); // nop if mem == nullptr
+void* aligned_large_pages_alloc(size_t size); // memory aligned by page size, min alignment: 4096 bytes
+void aligned_large_pages_free(void* mem); // nop if mem == nullptr
 
 void dbg_hit_on(bool b);
 void dbg_hit_on(bool c, bool b);
@@ -44,9 +51,7 @@ void dbg_mean_of(int v);
 void dbg_print();
 
 typedef std::chrono::milliseconds::rep TimePoint; // A value in milliseconds
-
 static_assert(sizeof(TimePoint) == sizeof(int64_t), "TimePoint should be 64 bits");
-
 inline TimePoint now() {
   return std::chrono::duration_cast<std::chrono::milliseconds>
         (std::chrono::steady_clock::now().time_since_epoch()).count();
@@ -67,6 +72,232 @@ std::ostream& operator<<(std::ostream&, SyncCout);
 #define sync_cout std::cout << IO_LOCK
 #define sync_endl std::endl << IO_UNLOCK
 
+// `ptr` must point to an array of size at least
+// `sizeof(T) * N + alignment` bytes, where `N` is the
+// number of elements in the array.
+template <uintptr_t Alignment, typename T>
+T* align_ptr_up(T* ptr)  // returns the first address >= ptr that is a multiple of Alignment
+{
+  static_assert(alignof(T) < Alignment);  // rejects an Alignment not larger than T's own alignment
+
+  const uintptr_t ptrint = reinterpret_cast<uintptr_t>(reinterpret_cast<char*>(ptr));
+  return reinterpret_cast<T*>(reinterpret_cast<char*>((ptrint + (Alignment - 1)) / Alignment * Alignment));  // round up: integer divide, then multiply back
+}
+
+// This logger allows printing many parts in a region atomically
+// but doesn't block the threads trying to append to other regions.
+// Instead, if a region tries to print while another region holds
+// the lock, its messages are queued and printed as soon as the
+// current region releases the lock.
+struct SynchronizedRegionLogger
+{
+  using RegionId = std::uint64_t;
+
+  struct Region
+  {
+    friend struct SynchronizedRegionLogger;
+
+    Region() :
+      logger(nullptr), region_id(0), is_held(false)
+    {
+    }
+
+    Region(const Region&) = delete;
+    Region& operator=(const Region&) = delete;
+
+    Region(Region&& other) :
+      logger(other.logger), region_id(other.region_id), is_held(other.is_held)
+    {
+      other.logger = nullptr;
+      other.is_held = false;
+    }
+
+    // Move-assign: release our own region (if held), then take over
+    // `other`'s region and detach `other`, mirroring the move constructor.
+    Region& operator=(Region&& other) {
+      if (this != &other) {  // a self-move must not release the region
+        unlock();
+        logger = other.logger;
+        region_id = other.region_id;
+        is_held = other.is_held;
+        other.logger = nullptr;
+        other.is_held = false;
+      }
+
+      return *this;
+    }
+
+    ~Region() { unlock(); }
+
+    void unlock() {
+      if (is_held) {
+        is_held = false;
+
+        if (logger != nullptr)
+          logger->release_region(region_id);
+      }
+    }
+
+    Region& operator << (std::ostream&(*pManip)(std::ostream&)) {
+      if (logger != nullptr)
+        logger->write(region_id, pManip);
+
+      return *this;
+    }
+
+    template <typename T>
+    Region& operator << (const T& value) {
+      if (logger != nullptr)
+        logger->write(region_id, value);
+
+      return *this;
+    }
+
+  private:
+    SynchronizedRegionLogger* logger;
+    RegionId region_id;
+    bool is_held;
+
+    Region(SynchronizedRegionLogger& log, RegionId id) :
+      logger(&log), region_id(id), is_held(true)
+    {
+    }
+  };
+
+private:
+  struct RegionBookkeeping
+  {
+    RegionBookkeeping(RegionId rid) : id(rid), is_held(true) {}
+
+    std::vector<std::string> pending_parts;
+    RegionId id;
+    bool is_held;
+  };
+
+  RegionId init_next_region()
+  {
+    static RegionId next_id = 0;
+
+    std::lock_guard lock(mutex);
+
+    const auto id = next_id++;
+    regions.emplace_back(id);
+
+    return id;
+  }
+
+  void write(RegionId id, std::ostream&(*pManip)(std::ostream&)) {
+    std::lock_guard lock(mutex);
+
+    if (regions.empty())
+      return;
+
+    if (id == regions.front().id) {
+      // We can just directly print to the output because
+      // we are at the front of the region queue.
+      out << *pManip;
+    } else {
+      // We have to schedule the print until previous regions are
+      // processed
+      auto* region = find_region_nolock(id);
+      if (region == nullptr)
+        return;
+
+      std::stringstream ss;
+      ss << *pManip;
+      region->pending_parts.emplace_back(std::move(ss).str());
+    }
+  }
+
+  template <typename T>
+  void write(RegionId id, const T& value) {
+    std::lock_guard lock(mutex);
+
+    if (regions.empty())
+      return;
+
+    if (id == regions.front().id) {
+      // We can just directly print to the output because
+      // we are at the front of the region queue.
+      out << value;
+    } else {
+      // We have to schedule the print until previous regions are
+      // processed
+      auto* region = find_region_nolock(id);
+      if (region == nullptr)
+        return;
+
+      std::stringstream ss;
+      ss << value;
+      region->pending_parts.emplace_back(std::move(ss).str());
+    }
+  }
+
+  std::ostream& out;
+
+  std::deque<RegionBookkeeping> regions;
+
+  std::mutex mutex;
+
+  RegionBookkeeping* find_region_nolock(RegionId id) {
+    // Linear search because the amount of concurrent regions should be small.
+    auto it = std::find_if(
+      regions.begin(),
+      regions.end(),
+      [id](const RegionBookkeeping& r) { return r.id == id; });
+
+    if (it == regions.end())
+      return nullptr;
+    else
+      return &*it;
+  }
+
+  void release_region(RegionId id) {
+    std::lock_guard lock(mutex);
+
+    auto* region = find_region_nolock(id);
+    if (region == nullptr)
+      return;
+
+    region->is_held = false;
+
+    process_backlog_nolock();
+  }
+
+  void process_backlog_nolock()  // called with `mutex` already held
+  {
+    while (!regions.empty()) {
+      auto& region = regions.front();
+      // Flush the queued parts, then drop them: a front region that is still
+      // held is revisited on the next release and must not print them twice.
+      for (const auto& part : region.pending_parts)
+        out << part;
+      region.pending_parts.clear();
+
+      // A still-held front region blocks the regions queued behind it.
+      if (region.is_held)
+        break;
+
+      regions.pop_front();
+    }
+  }
+
+public:
+
+  SynchronizedRegionLogger(std::ostream& s) :
+    out(s)
+  {
+  }
+
+  [[nodiscard]] Region new_region() {
+    const auto id = init_next_region();
+    return Region(*this, id);
+  }
+
+};
+
+extern SynchronizedRegionLogger sync_region_cout;
+
 
 /// xorshift64star Pseudo-Random Number Generator
 /// This class is based on original code written and dedicated
@@ -83,6 +314,19 @@ std::ostream& operator<<(std::ostream&, SyncCout);
 /// For further analysis see
 ///   <http://vigna.di.unimi.it/ftp/papers/xorshift.pdf>
 
+// Hashes `str` to 64 bits (xor, multiply and shift-xor per character).
+// `inline` rather than `static`: one shared definition, no unused copies.
+inline uint64_t string_hash(const std::string& str)
+{
+  uint64_t h = 525201411107845655ull;
+  for (auto c : str) {
+    h ^= static_cast<uint64_t>(c);
+    h *= 0x5bd1e9955bd1e995ull;
+    h ^= h >> 47;
+  }
+  return h;
+}
+
 class PRNG {
 
   uint64_t s;
@@ -94,7 +338,9 @@ class PRNG {
   }
 
 public:
+  PRNG() { set_seed_from_time(); }
   PRNG(uint64_t seed) : s(seed) { assert(seed); }
+  PRNG(const std::string& seed) { set_seed(seed); }
 
   template<typename T> T rand() { return T(rand64()); }
 
@@ -107,6 +353,40 @@ public:
 
   // Return the random seed used internally.
   uint64_t get_seed() const { return s; }
+
+  void set_seed(uint64_t seed) { s = seed; }
+
+  // Builds a 64-bit seed one bit at a time from this generator's output.
+  uint64_t next_random_seed()
+  {
+    uint64_t seed = 0;
+    for (int i = 0; i < 64; ++i) {
+      const auto off = rand64() % 64;  // which bit of the next draw to use
+      // Shift before inserting so that all 64 sampled bits survive.
+      seed = (seed << 1) | ((rand64() >> off) & 1);
+    }
+    return seed;
+  }
+
+  void set_seed_from_time()
+  {
+      set_seed(std::chrono::system_clock::now().time_since_epoch().count());
+  }
+
+  // Seeds from text: empty -> current time, all digits -> parsed as a
+  // decimal number, anything else -> string_hash() of the text.
+  void set_seed(const std::string& str)
+  {
+    // std::isdigit() is undefined for negative char values, so convert first.
+    const auto is_digit = [](unsigned char c) { return std::isdigit(c) != 0; };
+
+    if (str.empty())
+      set_seed_from_time();
+    else if (std::all_of(str.begin(), str.end(), is_digit))
+      set_seed(std::stoull(str));
+    else
+      set_seed(string_hash(str));
+  }
 };
 
 // Display a random seed. (For debugging)
@@ -130,6 +410,74 @@ inline uint64_t mul_hi64(uint64_t a, uint64_t b) {
 #endif
 }
 
+// This bitset can be accessed concurrently, provided
+// the concurrent accesses are performed on distinct
+// instances of the underlying type. That means concurrent
+// accesses need to be spaced by at least
+// bits_per_bucket bits.
+// A spacing of at least best_concurrent_access_stride bits
+// is recommended to prevent false sharing.
+template <uint64_t N>
+struct LargeBitset
+{
+private:
+    constexpr static uint64_t cache_line_size = 64;
+
+public:
+    using UnderlyingType = uint64_t;
+
+    constexpr static uint64_t num_bits = N;
+    constexpr static uint64_t bits_per_bucket = 8 * sizeof(uint64_t);
+    constexpr static uint64_t num_buckets = (num_bits + bits_per_bucket - 1) / bits_per_bucket;
+    constexpr static uint64_t best_concurrent_access_stride = 8 * cache_line_size;
+
+    LargeBitset()
+    {
+        std::fill(std::begin(bits), std::end(bits), 0);
+    }
+
+    void set(uint64_t idx)
+    {
+        const uint64_t bucket = idx / bits_per_bucket;
+        const uint64_t bit = uint64_t(1) << (idx % bits_per_bucket);
+        bits[bucket] |= bit;
+    }
+
+    bool test(uint64_t idx) const
+    {
+        const uint64_t bucket = idx / bits_per_bucket;
+        const uint64_t bit = uint64_t(1) << (idx % bits_per_bucket);
+        return bits[bucket] & bit;
+    }
+
+    // Returns the number of set bits.
+    uint64_t count() const
+    {
+        uint64_t c = 0;
+        uint64_t i = 0;
+
+        // Four buckets per iteration. `i + 3 < num_buckets` rather than
+        // `i < num_buckets - 3`: the subtraction wraps around for fewer
+        // than three buckets and the loop would then read out of bounds.
+        for (; i + 3 < num_buckets; i += 4)
+        {
+            c += popcount(bits[i+0]) + popcount(bits[i+1])
+               + popcount(bits[i+2]) + popcount(bits[i+3]);
+        }
+
+        // Remaining (at most three) buckets.
+        for (; i < num_buckets; ++i)
+        {
+            c += popcount(bits[i]);
+        }
+
+        return c;
+    }
+
+private:
+    alignas(cache_line_size) UnderlyingType bits[num_buckets];
+};
+
 /// Under Windows it is not possible for a process to run on more than one
 /// logical processor group. This usually means to be limited to use max 64
 /// cores. To overcome this, some special platform specific API should be
@@ -155,6 +503,7 @@ std::string now_string();
 // Also, if the buffer cannot be allocated in the callback function or if the file size is different from the expected file size,
 // Return nullptr. At this time, read_file_to_memory() interrupts reading and returns with an error.
 
+std::uint64_t get_file_size(std::fstream& fs);
 int read_file_to_memory(std::string filename, std::function<void* (uint64_t)> callback_func);
 int write_memory_to_file(std::string filename, void* ptr, uint64_t size);
 
@@ -165,7 +514,9 @@ int write_memory_to_file(std::string filename, void* ptr, uint64_t size);
 // async version of PRNG
 struct AsyncPRNG
 {
+  AsyncPRNG() : prng() { }
   AsyncPRNG(uint64_t seed) : prng(seed) { assert(seed); }
+  AsyncPRNG(const std::string& seed) : prng(seed) { }
   // [ASYNC] Extract one random number.
   template<typename T> T rand() {
     std::unique_lock<std::mutex> lk(mutex);
@@ -199,20 +550,51 @@ inline std::ostream& operator<<(std::ostream& os, AsyncPRNG& prng)
 
 // Mathematical function used for progress calculation and learning
 namespace Math {
-	// Sigmoid function
-	// = 1.0 / (1.0 + std::exp(-x))
-	double sigmoid(double x);
+    inline double sigmoid(double x)
+    {
+        return 1.0 / (1.0 + std::exp(-x));
+    }
 
-	// Differentiation of sigmoid function
-	// = sigmoid(x) * (1.0-sigmoid(x))
-	double dsigmoid(double x);
+    // First derivative of the sigmoid f(x) = 1 / (1 + exp(-x)),
+    // which can be written in terms of f itself:
+    //   f'(x) = f(x) * (1 - f(x))
+    inline double dsigmoid(double x)
+    {
+        const double fx = sigmoid(x);
+        const double complement = 1.0 - fx;
+
+        return fx * complement;
+    }
 
 	// Clip v so that it fits between [lo,hi].
 	// * In Stockfish, this function is written in bitboard.h.
 	template<class T> constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
 		return v < lo ? lo : v > hi ? hi : v;
 	}
+}
 
+namespace Algo {
+    // Fisher-Yates
+    template <typename Rng, typename T>
+    void shuffle(std::vector<T>& buf, Rng&& prng)
+    {
+        const auto size = buf.size();
+        for (uint64_t i = 0; i < size; ++i)
+            std::swap(buf[i], buf[prng.rand(size - i) + i]);
+    }
+
+    // Splits `input` at every `delimiter`. Empty fields between delimiters are
+    // kept; no empty field is produced after a trailing delimiter.
+    // An empty input yields an empty vector.
+    inline std::vector<std::string> split(const std::string& input, char delimiter) {
+        std::vector<std::string> parts;
+        std::istringstream reader(input);
+
+        for (std::string piece; std::getline(reader, piece, delimiter); )
+            parts.push_back(piece);
+
+        return parts;
+    }
 }
 
 // --------------------
@@ -225,7 +607,7 @@ struct Path
 {
 	// Combine the path name and file name and return it.
 	// If the folder name is not an empty string, append it if there is no'/' or'\\' at the end.
-	static std::string Combine(const std::string& folder, const std::string& filename)
+	static std::string combine(const std::string& folder, const std::string& filename)
 	{
 		if (folder.length() >= 1 && *folder.rbegin() != '/' && *folder.rbegin() != '\\')
 			return folder + "/" + filename;
@@ -234,7 +616,7 @@ struct Path
 	}
 
 	// Get the file name part (excluding the folder name) from the full path expression.
-	static std::string GetFileName(const std::string& path)
+	static std::string get_file_name(const std::string& path)
 	{
 		// I don't know which "\" or "/" is used.
 		auto path_index1 = path.find_last_of("\\") + 1;
@@ -259,7 +641,24 @@ public:
   template <typename U> AlignedAllocator(const AlignedAllocator<U>&) {}
 
   T* allocate(std::size_t n) { return (T*)std_aligned_alloc(alignof(T), n * sizeof(T)); }
-  void deallocate(T* p, std::size_t n) { std_aligned_free(p); }
+  void deallocate(T* p, std::size_t ) { std_aligned_free(p); }
+};
+
+template <typename T>
+class CacheLineAlignedAllocator {
+public:
+    using value_type = T;
+
+    constexpr static uint64_t cache_line_size = 64;
+
+    CacheLineAlignedAllocator() {}
+    CacheLineAlignedAllocator(const CacheLineAlignedAllocator&) {}
+    CacheLineAlignedAllocator(CacheLineAlignedAllocator&&) {}
+
+    template <typename U> CacheLineAlignedAllocator(const CacheLineAlignedAllocator<U>&) {}
+
+    T* allocate(std::size_t n) { return (T*)std_aligned_alloc(cache_line_size, n * sizeof(T)); }
+    void deallocate(T* p, std::size_t) { std_aligned_free(p); }
 };
 
 // --------------------
@@ -273,11 +672,13 @@ namespace Dependency
   // So when calling getline() on fstream,
   // just write getline() instead of std::getline() and use this function.
   extern bool getline(std::ifstream& fs, std::string& s);
+}
 
-  // Create a folder.
-  // Specify relative to the current folder. Japanese is not used for dir_name.
-  // Returns 0 on success, non-zero on failure.
-  extern int mkdir(std::string dir_name);
+namespace CommandLine {
+  void init(int argc, char* argv[]);
+
+  extern std::string binaryDirectory;  // path of the executable directory
+  extern std::string workingDirectory; // path of the working directory
 }
 
 #endif // #ifndef MISC_H_INCLUDED
diff --git a/src/movepick.cpp b/src/movepick.cpp
index 153d323e..f5e02385 100644
--- a/src/movepick.cpp
+++ b/src/movepick.cpp
@@ -73,8 +73,9 @@ MovePicker::MovePicker(const Position& p, Move ttm, Depth d, const ButterflyHist
   assert(d <= 0);
 
   stage = (pos.checkers() ? EVASION_TT : QSEARCH_TT) +
-           !(ttm && (depth > DEPTH_QS_RECAPTURES || to_sq(ttm) == recaptureSquare)
-                 && pos.pseudo_legal(ttm));
+          !(   ttm
+            && (pos.checkers() || depth > DEPTH_QS_RECAPTURES || to_sq(ttm) == recaptureSquare)
+            && pos.pseudo_legal(ttm));
 }
 
 /// MovePicker constructor for ProbCut: we generate captures with SEE greater
diff --git a/src/nnue/architectures/halfka_256x2-32-32.h b/src/nnue/architectures/halfka_256x2-32-32.h
new file mode 100644
index 00000000..c108ef5d
--- /dev/null
+++ b/src/nnue/architectures/halfka_256x2-32-32.h
@@ -0,0 +1,54 @@
+/*
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// Definition of input features and network structure used in NNUE evaluation function
+
+#ifndef NNUE_HALFKA_256X2_32_32_H_INCLUDED
+#define NNUE_HALFKA_256X2_32_32_H_INCLUDED
+
+#include "nnue/features/feature_set.h"
+#include "nnue/features/half_ka.h"
+
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"
+
+namespace Eval::NNUE {
+
+    // Input features used in evaluation function
+    using RawFeatures = Features::FeatureSet<
+        Features::HalfKA<Features::Side::kFriend>>;
+
+    // Number of input feature dimensions after conversion
+    constexpr IndexType kTransformedFeatureDimensions = 256;
+
+    namespace Layers {
+
+        // Define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+
+    }  // namespace Layers
+
+    using Network = Layers::OutputLayer;
+
+}  // namespace Eval::NNUE
+
+#endif // #ifndef NNUE_HALFKA_256X2_32_32_H_INCLUDED
diff --git a/src/nnue/architectures/halfkp-cr-ep_256x2-32-32.h b/src/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
index 37b155d5..6327b78a 100644
--- a/src/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
+++ b/src/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
@@ -1,42 +1,57 @@
+/*
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
 // Definition of input features and network structure used in NNUE evaluation function
 
-#ifndef HALFKP_CR_EP_256X2_32_32_H
-#define HALFKP_CR_EP_256X2_32_32_H
+#ifndef NNUE_HALFKP_CR_EP_256X2_32_32_H_INCLUDED
+#define NNUE_HALFKP_CR_EP_256X2_32_32_H_INCLUDED
 
-#include "../features/feature_set.h"
-#include "../features/half_kp.h"
-#include "../features/castling_right.h"
-#include "../features/enpassant.h"
+#include "nnue/features/feature_set.h"
+#include "nnue/features/half_kp.h"
+#include "nnue/features/castling_right.h"
+#include "nnue/features/enpassant.h"
 
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"
 
-namespace Eval {
-
-  namespace NNUE {
+namespace Eval::NNUE {
 
     // Input features used in evaluation function
     using RawFeatures = Features::FeatureSet<
-      Features::HalfKP<Features::Side::kFriend>, Features::CastlingRight,
-      Features::EnPassant>;
+        Features::HalfKP<Features::Side::kFriend>, Features::CastlingRight,
+        Features::EnPassant>;
 
     // Number of input feature dimensions after conversion
     constexpr IndexType kTransformedFeatureDimensions = 256;
 
     namespace Layers {
 
-      // define network structure
-      using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-      using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-      using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-      using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+        // Define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
 
     }  // namespace Layers
 
     using Network = Layers::OutputLayer;
 
-  }  // namespace NNUE
+}  // namespace Eval::NNUE
 
-}  // namespace Eval
-#endif // HALFKP_CR_EP_256X2_32_32_H
+#endif // #ifndef NNUE_HALFKP_CR_EP_256X2_32_32_H_INCLUDED
diff --git a/src/nnue/architectures/halfkp-cr_256x2-32-32.h b/src/nnue/architectures/halfkp-cr_256x2-32-32.h
new file mode 100644
index 00000000..dd587d1d
--- /dev/null
+++ b/src/nnue/architectures/halfkp-cr_256x2-32-32.h
@@ -0,0 +1,37 @@
+// NNUE architecture definition: HalfKP + CastlingRight input features feeding a 256x2-32-32 network
+
+#ifndef NNUE_HALFKP_CR_256X2_32_32_H_INCLUDED
+#define NNUE_HALFKP_CR_256X2_32_32_H_INCLUDED
+
+#include "nnue/features/feature_set.h"
+#include "nnue/features/half_kp.h"
+#include "nnue/features/castling_right.h"
+
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"
+
+namespace Eval::NNUE {
+
+    // Input feature set for this architecture: HalfKP<Side::kFriend> combined with CastlingRight
+    using RawFeatures = Features::FeatureSet<
+        Features::HalfKP<Features::Side::kFriend>, Features::CastlingRight>;
+
+    // Number of input feature dimensions after conversion (the input layer below slices twice this)
+    constexpr IndexType kTransformedFeatureDimensions = 256;
+
+    namespace Layers {
+
+        // Layer stack: InputSlice<256 * 2> -> affine 32 + ClippedReLU -> affine 32 + ClippedReLU -> affine 1
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+
+    }  // namespace Layers
+
+    using Network = Layers::OutputLayer;  // whole network, named after its final layer
+
+}  // namespace Eval::NNUE
+
+#endif // #ifndef NNUE_HALFKP_CR_256X2_32_32_H_INCLUDED
diff --git a/src/nnue/architectures/halfkp_256x2-32-32.h b/src/nnue/architectures/halfkp_256x2-32-32.h
index 9216bd41..333feb83 100644
--- a/src/nnue/architectures/halfkp_256x2-32-32.h
+++ b/src/nnue/architectures/halfkp_256x2-32-32.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // Definition of input features and network structure used in NNUE evaluation function
@@ -21,33 +21,33 @@
 #ifndef NNUE_HALFKP_256X2_32_32_H_INCLUDED
 #define NNUE_HALFKP_256X2_32_32_H_INCLUDED
 
-#include "../features/feature_set.h"
-#include "../features/half_kp.h"
+#include "nnue/features/feature_set.h"
+#include "nnue/features/half_kp.h"
 
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"
 
 namespace Eval::NNUE {
 
-// Input features used in evaluation function
-using RawFeatures = Features::FeatureSet<
-    Features::HalfKP<Features::Side::kFriend>>;
+    // Input features used in evaluation function
+    using RawFeatures = Features::FeatureSet<
+        Features::HalfKP<Features::Side::kFriend>>;
 
-// Number of input feature dimensions after conversion
-constexpr IndexType kTransformedFeatureDimensions = 256;
+    // Number of input feature dimensions after conversion
+    constexpr IndexType kTransformedFeatureDimensions = 256;
 
-namespace Layers {
+    namespace Layers {
 
-// Define network structure
-using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+        // Define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
 
-}  // namespace Layers
+    }  // namespace Layers
 
-using Network = Layers::OutputLayer;
+    using Network = Layers::OutputLayer;
 
 }  // namespace Eval::NNUE
 
diff --git a/src/nnue/architectures/halfkp_384x2-32-32.h b/src/nnue/architectures/halfkp_384x2-32-32.h
index 3d28139a..96913295 100644
--- a/src/nnue/architectures/halfkp_384x2-32-32.h
+++ b/src/nnue/architectures/halfkp_384x2-32-32.h
@@ -3,37 +3,33 @@
 #ifndef HALFKP_384X2_32_32_H
 #define HALFKP_384X2_32_32_H
 
-#include "../features/feature_set.h"
-#include "../features/half_kp.h"
+#include "nnue/features/feature_set.h"
+#include "nnue/features/half_kp.h"
 
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"
 
-namespace Eval {
+namespace Eval::NNUE {
 
-namespace NNUE {
+    // Input features used in evaluation function
+    using RawFeatures = Features::FeatureSet<
+        Features::HalfKP<Features::Side::kFriend>>;
 
-// Input features used in evaluation function
-using RawFeatures = Features::FeatureSet<
-    Features::HalfKP<Features::Side::kFriend>>;
+    // Number of input feature dimensions after conversion
+    constexpr IndexType kTransformedFeatureDimensions = 384;
 
-// Number of input feature dimensions after conversion
-constexpr IndexType kTransformedFeatureDimensions = 384;
+    namespace Layers {
 
-namespace Layers {
+        // Define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
 
-// define network structure
-using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+    }  // namespace Layers
 
-}  // namespace Layers
+    using Network = Layers::OutputLayer;
 
-using Network = Layers::OutputLayer;
-
-}  // namespace NNUE
-
-}  // namespace Eval
+}  // namespace Eval::NNUE
 #endif // HALFKP_384X2_32_32_H
diff --git a/src/nnue/architectures/k-p-cr-ep_256x2-32-32.h b/src/nnue/architectures/k-p-cr-ep_256x2-32-32.h
deleted file mode 100644
index e178b57b..00000000
--- a/src/nnue/architectures/k-p-cr-ep_256x2-32-32.h
+++ /dev/null
@@ -1,42 +0,0 @@
-// Definition of input features and network structure used in NNUE evaluation function
-
-#ifndef K_P_CR_EP_256X2_32_32_H
-#define K_P_CR_EP_256X2_32_32_H
-
-#include "../features/feature_set.h"
-#include "../features/k.h"
-#include "../features/p.h"
-#include "../features/castling_right.h"
-#include "../features/enpassant.h"
-
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
-
-namespace Eval {
-
-  namespace NNUE {
-
-    // Input features used in evaluation function
-    using RawFeatures = Features::FeatureSet<Features::K, Features::P,
-      Features::CastlingRight, Features::EnPassant>;
-
-    // Number of input feature dimensions after conversion
-    constexpr IndexType kTransformedFeatureDimensions = 256;
-
-    namespace Layers {
-
-      // define network structure
-      using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-      using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-      using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-      using OutputLayer = AffineTransform<HiddenLayer2, 1>;
-
-    }  // namespace Layers
-
-    using Network = Layers::OutputLayer;
-
-  }  // namespace NNUE
-
-}  // namespace Eval
-#endif // K_P_CR_EP_256X2_32_32_H
diff --git a/src/nnue/architectures/k-p-cr_256x2-32-32.h b/src/nnue/architectures/k-p-cr_256x2-32-32.h
deleted file mode 100644
index d3c187c0..00000000
--- a/src/nnue/architectures/k-p-cr_256x2-32-32.h
+++ /dev/null
@@ -1,41 +0,0 @@
-// Definition of input features and network structure used in NNUE evaluation function
-
-#ifndef K_P_CR_256X2_32_32_H
-#define K_P_CR_256X2_32_32_H
-
-#include "../features/feature_set.h"
-#include "../features/k.h"
-#include "../features/p.h"
-#include "../features/castling_right.h"
-
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
-
-namespace Eval {
-
-  namespace NNUE {
-
-    // Input features used in evaluation function
-    using RawFeatures = Features::FeatureSet<Features::K, Features::P,
-      Features::CastlingRight>;
-
-    // Number of input feature dimensions after conversion
-    constexpr IndexType kTransformedFeatureDimensions = 256;
-
-    namespace Layers {
-
-      // define network structure
-      using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-      using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-      using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-      using OutputLayer = AffineTransform<HiddenLayer2, 1>;
-
-    }  // namespace Layers
-
-    using Network = Layers::OutputLayer;
-
-  }  // namespace NNUE
-
-}  // namespace Eval
-#endif // K_P_CR_256X2_32_32_H
diff --git a/src/nnue/architectures/k-p_256x2-32-32.h b/src/nnue/architectures/k-p_256x2-32-32.h
deleted file mode 100644
index 00b14d47..00000000
--- a/src/nnue/architectures/k-p_256x2-32-32.h
+++ /dev/null
@@ -1,38 +0,0 @@
-﻿// Definition of input features and network structure used in NNUE evaluation function
-#ifndef K_P_256X2_32_32_H
-#define K_P_256X2_32_32_H
-
-#include "../features/feature_set.h"
-#include "../features/k.h"
-#include "../features/p.h"
-
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
-
-namespace Eval {
-
-namespace NNUE {
-
-// Input features used in evaluation function
-using RawFeatures = Features::FeatureSet<Features::K, Features::P>;
-
-// Number of input feature dimensions after conversion
-constexpr IndexType kTransformedFeatureDimensions = 256;
-
-namespace Layers {
-
-// define network structure
-using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-using OutputLayer = AffineTransform<HiddenLayer2, 1>;
-
-}  // namespace Layers
-
-using Network = Layers::OutputLayer;
-
-}  // namespace NNUE
-
-}  // namespace Eval
-#endif // K_P_256X2_32_32_H
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index a2845c96..c7bd681f 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -18,20 +18,29 @@
 
 // Code for calculating NNUE evaluation function
 
-#include <fstream>
+#include "evaluate_nnue.h"
+
+#include "position.h"
+#include "misc.h"
+#include "uci.h"
+#include "types.h"
+
 #include <iostream>
+#include <string>
+#include <fstream>
 #include <set>
 
 #include "../evaluate.h"
 #include "../position.h"
 #include "../misc.h"
 #include "../uci.h"
+#include "../types.h"
 
 #include "evaluate_nnue.h"
 
 namespace Eval::NNUE {
 
-  uint32_t kpp_board_index[PIECE_NB][COLOR_NB] = {
+  const uint32_t kpp_board_index[PIECE_NB][COLOR_NB] = {
    // convention: W - us, B - them
    // viewed from other side, W and B are reversed
       { PS_NONE,     PS_NONE     },
@@ -53,7 +62,7 @@ namespace Eval::NNUE {
   };
 
   // Input feature converter
-  AlignedPtr<FeatureTransformer> feature_transformer;
+  LargePagePtr<FeatureTransformer> feature_transformer;
 
   // Evaluation function
   AlignedPtr<Network> network;
@@ -65,50 +74,77 @@ namespace Eval::NNUE {
   std::string savedfileName = "nn.bin";
 
   // Get a string that represents the structure of the evaluation function
-  std::string GetArchitectureString() {
-    return "Features=" + FeatureTransformer::GetStructureString() +
-      ",Network=" + Network::GetStructureString();
+  std::string get_architecture_string() {
+    return "Features=" + FeatureTransformer::get_structure_string() +
+        ",Network=" + Network::get_structure_string();
   }
 
+  std::string get_layers_info() {
+    return  // feature transformer's layer info, a newline, then the network's layer info
+        FeatureTransformer::get_layers_info()
+        + '\n' + Network::get_layers_info();
+  }
+
+  UseNNUEMode useNNUE;
+  std::string eval_file_loaded = "None";
+
   namespace Detail {
 
   // Initialize the evaluation function parameters
   template <typename T>
-  void Initialize(AlignedPtr<T>& pointer) {
+  void initialize(AlignedPtr<T>& pointer) {
 
     pointer.reset(reinterpret_cast<T*>(std_aligned_alloc(alignof(T), sizeof(T))));
     std::memset(pointer.get(), 0, sizeof(T));
   }
 
+  template <typename T>
+  void initialize(LargePagePtr<T>& pointer) {
+    // LargePagePtr overload: storage comes from aligned_large_pages_alloc(sizeof(T)) and is then zero-filled.
+    static_assert(alignof(T) <= 4096, "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
+    pointer.reset(reinterpret_cast<T*>(aligned_large_pages_alloc(sizeof(T))));
+    std::memset(pointer.get(), 0, sizeof(T));  // NOTE(review): allocation result is not checked first - confirm it cannot be null
+  }
+
   // Read evaluation function parameters
   template <typename T>
-  bool ReadParameters(std::istream& stream, const AlignedPtr<T>& pointer) {
+  bool ReadParameters(std::istream& stream, T& reference) {
 
     std::uint32_t header;
     header = read_little_endian<std::uint32_t>(stream);
     if (!stream || header != T::GetHashValue()) return false;
-    return pointer->ReadParameters(stream);
+    return reference.ReadParameters(stream);
   }
 
   // write evaluation function parameters
   template <typename T>
   bool WriteParameters(std::ostream& stream, const AlignedPtr<T>& pointer) {
     constexpr std::uint32_t header = T::GetHashValue();
+
     stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
+
     return pointer->WriteParameters(stream);
   }
 
+  template <typename T>
+  bool WriteParameters(std::ostream& stream, const LargePagePtr<T>& pointer) {
+    constexpr std::uint32_t header = T::GetHashValue();
+    // LargePagePtr overload: emit T's hash value as a std::uint32_t header, then let T write its own parameters.
+    stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
+    // NOTE(review): the header bytes go out in host order, while ReadParameters uses read_little_endian - confirm for big-endian hosts.
+    return pointer->WriteParameters(stream);
+  }
   }  // namespace Detail
 
   // Initialize the evaluation function parameters
-  void Initialize() {
+  void initialize() {
 
-    Detail::Initialize(feature_transformer);
-    Detail::Initialize(network);
+    Detail::initialize(feature_transformer);
+    Detail::initialize(network);
   }
 
   // Read network header
-  bool ReadHeader(std::istream& stream, std::uint32_t* hash_value, std::string* architecture)
+  bool read_header(std::istream& stream, std::uint32_t* hash_value, std::string* architecture)
   {
     std::uint32_t version, size;
 
@@ -122,13 +158,17 @@ namespace Eval::NNUE {
   }
 
   // write the header
-  bool WriteHeader(std::ostream& stream,
+  bool write_header(std::ostream& stream,
     std::uint32_t hash_value, const std::string& architecture) {
+
     stream.write(reinterpret_cast<const char*>(&kVersion), sizeof(kVersion));
     stream.write(reinterpret_cast<const char*>(&hash_value), sizeof(hash_value));
+
     const std::uint32_t size = static_cast<std::uint32_t>(architecture.size());
+
     stream.write(reinterpret_cast<const char*>(&size), sizeof(size));
     stream.write(architecture.data(), size);
+
     return !stream.fail();
   }
 
@@ -137,81 +177,176 @@ namespace Eval::NNUE {
 
     std::uint32_t hash_value;
     std::string architecture;
-    if (!ReadHeader(stream, &hash_value, &architecture)) return false;
+    if (!read_header(stream, &hash_value, &architecture)) return false;
     if (hash_value != kHashValue) return false;
-    if (!Detail::ReadParameters(stream, feature_transformer)) return false;
-    if (!Detail::ReadParameters(stream, network)) return false;
+    if (!Detail::ReadParameters(stream, *feature_transformer)) return false;
+    if (!Detail::ReadParameters(stream, *network)) return false;
     return stream && stream.peek() == std::ios::traits_type::eof();
   }
 
   // write evaluation function parameters
   bool WriteParameters(std::ostream& stream) {
-    if (!WriteHeader(stream, kHashValue, GetArchitectureString())) return false;
-    if (!Detail::WriteParameters(stream, feature_transformer)) return false;
-    if (!Detail::WriteParameters(stream, network)) return false;
+
+    if (!write_header(stream, kHashValue, get_architecture_string()))
+        return false;
+
+    if (!Detail::WriteParameters(stream, feature_transformer))
+        return false;
+
+    if (!Detail::WriteParameters(stream, network))
+        return false;
+
     return !stream.fail();
-  }
-
-  // Proceed with the difference calculation if possible
-  static void UpdateAccumulatorIfPossible(const Position& pos) {
-
-    feature_transformer->UpdateAccumulatorIfPossible(pos);
-  }
-
-  // Calculate the evaluation value
-  static Value ComputeScore(const Position& pos, bool refresh) {
-
-    auto& accumulator = pos.state()->accumulator;
-    if (!refresh && accumulator.computed_score) {
-      return accumulator.score;
-    }
-
-    alignas(kCacheLineSize) TransformedFeatureType
-        transformed_features[FeatureTransformer::kBufferSize];
-    feature_transformer->Transform(pos, transformed_features, refresh);
-    alignas(kCacheLineSize) char buffer[Network::kBufferSize];
-    const auto output = network->Propagate(transformed_features, buffer);
-
-    auto score = static_cast<Value>(output[0] / FV_SCALE);
-
-    accumulator.score = score;
-    accumulator.computed_score = true;
-    return accumulator.score;
-  }
-
-  // Load the evaluation function file
-  bool load_eval_file(const std::string& evalFile) {
-
-    Initialize();
-
-    if (Options["SkipLoadingEval"])
-    {
-      std::cout << "info string SkipLoadingEval set to true, Net not loaded!" << std::endl;
-      return true;
-    }
-
-    fileName = evalFile;
-
-    std::ifstream stream(evalFile, std::ios::binary);
-
-    const bool result = ReadParameters(stream);
-
-    return result;
-  }
+}
 
   // Evaluation function. Perform differential calculation.
   Value evaluate(const Position& pos) {
-    return ComputeScore(pos, false);
+
+    // We manually align the arrays on the stack because with gcc < 9.3
+    // overaligning stack variables with alignas() doesn't work correctly.
+
+    constexpr uint64_t alignment = kCacheLineSize;
+
+#if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN)
+    TransformedFeatureType transformed_features_unaligned[
+      FeatureTransformer::kBufferSize + alignment / sizeof(TransformedFeatureType)];
+    char buffer_unaligned[Network::kBufferSize + alignment];
+
+    auto* transformed_features = align_ptr_up<alignment>(&transformed_features_unaligned[0]);
+    auto* buffer = align_ptr_up<alignment>(&buffer_unaligned[0]);
+#else
+    alignas(alignment)
+      TransformedFeatureType transformed_features[FeatureTransformer::kBufferSize];
+    alignas(alignment) char buffer[Network::kBufferSize];
+#endif
+
+    ASSERT_ALIGNED(transformed_features, alignment);
+    ASSERT_ALIGNED(buffer, alignment);
+
+    feature_transformer->Transform(pos, transformed_features);
+    const auto output = network->Propagate(transformed_features, buffer);
+
+    return static_cast<Value>(output[0] / FV_SCALE);
   }
 
-  // Evaluation function. Perform full calculation.
-  Value compute_eval(const Position& pos) {
-    return ComputeScore(pos, true);
+  // Load eval from any std::istream (file stream or memory stream); returns the result of ReadParameters(stream).
+  bool load_eval(std::string name, std::istream& stream) {
+    // initialize() runs before parsing, so previously loaded parameters are discarded even if the read then fails.
+    initialize();
+    fileName = name;
+    return ReadParameters(stream);
+}
+
+// Translate the text of the "Use NNUE" option into a UseNNUEMode value.
+static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
+{
+  if (mode == "true")
+    return UseNNUEMode::True;
+  if (mode == "pure")
+    return UseNNUEMode::Pure;
+
+  // "false" and every unrecognised value both end up here.
+  return UseNNUEMode::False;
+}
+
+void init() {
+  // Refresh useNNUE from "Use NNUE"; unless loading is skipped or NNUE is off, try to load the EvalFile net.
+  useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
+
+  if (Options["SkipLoadingEval"] || useNNUE == UseNNUEMode::False)
+  {
+    eval_file_loaded.clear();
+    return;
   }
 
-  // Proceed with the difference calculation if possible
-  void update_eval(const Position& pos) {
-    UpdateAccumulatorIfPossible(pos);
+  std::string eval_file = std::string(Options["EvalFile"]);
+  // Candidate directories, tried in order: "" (path as given), the binary's directory, optional compile-time default.
+#if defined(DEFAULT_NNUE_DIRECTORY)
+#define stringify2(x) #x
+#define stringify(x) stringify2(x)
+  std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory , stringify(DEFAULT_NNUE_DIRECTORY) };
+#else
+  std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory };
+#endif
+  // After a successful load eval_file_loaded == eval_file, so later directories are skipped; each failed try logs an ERROR line.
+  for (const std::string& directory : dirs)
+  {
+    if (eval_file_loaded != eval_file)
+    {
+      std::ifstream stream(directory + eval_file, std::ios::binary);
+      if (load_eval(eval_file, stream))
+      {
+        sync_cout << "info string Loaded eval file " << directory + eval_file << sync_endl;
+        eval_file_loaded = eval_file;
+      }
+      else
+      {
+        sync_cout << "info string ERROR: failed to load eval file " << directory + eval_file << sync_endl;
+        eval_file_loaded.clear();
+      }
+    }
   }
 
+#undef stringify2
+#undef stringify
+}
+
+/// verify_eval_file_loaded() checks that, when NNUE is enabled, the net named by the EvalFile option is the one loaded; otherwise it prints diagnostics and exits.
+void verify_eval_file_loaded() {
+  // NOTE(review): unlike verify_any_net_loaded(), this check does not consult SkipLoadingEval - confirm that is intended.
+  std::string eval_file = std::string(Options["EvalFile"]);
+
+  if (useNNUE != UseNNUEMode::False && eval_file_loaded != eval_file)
+  {
+    UCI::OptionsMap defaults;
+    UCI::init(defaults);
+
+    std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
+    std::string msg2 = "The option is set to true, but the network file " + eval_file + " was not loaded successfully.";
+    std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
+    std::string msg4 = "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/" + std::string(defaults["EvalFile"]);
+    std::string msg5 = "The engine will be terminated now.";
+
+    sync_cout << "info string ERROR: " << msg1 << sync_endl;
+    sync_cout << "info string ERROR: " << msg2 << sync_endl;
+    sync_cout << "info string ERROR: " << msg3 << sync_endl;
+    sync_cout << "info string ERROR: " << msg4 << sync_endl;
+    sync_cout << "info string ERROR: " << msg5 << sync_endl;
+
+    std::exit(EXIT_FAILURE);
+  }
+
+  if (useNNUE != UseNNUEMode::False)
+    sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
+  else
+    sync_cout << "info string classical evaluation enabled" << sync_endl;
+}
+
+/// verify_any_net_loaded() only requires that some net was loaded (eval_file_loaded is non-empty), without comparing it to EvalFile. In training we override eval file so this is useful.
+void verify_any_net_loaded() {
+
+  if (!Options["SkipLoadingEval"] && useNNUE != UseNNUEMode::False && eval_file_loaded.empty())
+  {
+    UCI::OptionsMap defaults;
+    UCI::init(defaults);
+    // NOTE(review): `defaults` is populated above but never read in this function (there is no msg4 here).
+    std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
+    std::string msg2 = "The option is set to true, but the network file was not loaded successfully.";
+    std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
+    std::string msg5 = "The engine will be terminated now.";
+
+    sync_cout << "info string ERROR: " << msg1 << sync_endl;
+    sync_cout << "info string ERROR: " << msg2 << sync_endl;
+    sync_cout << "info string ERROR: " << msg3 << sync_endl;
+    sync_cout << "info string ERROR: " << msg5 << sync_endl;
+
+    std::exit(EXIT_FAILURE);
+  }
+
+  if (useNNUE != UseNNUEMode::False)
+    sync_cout << "info string NNUE evaluation using " << eval_file_loaded << " enabled" << sync_endl;
+  else
+    sync_cout << "info string classical evaluation enabled" << sync_endl;
+}
+
 } // namespace Eval::NNUE
diff --git a/src/nnue/evaluate_nnue.h b/src/nnue/evaluate_nnue.h
index 75700d03..a1051abe 100644
--- a/src/nnue/evaluate_nnue.h
+++ b/src/nnue/evaluate_nnue.h
@@ -23,10 +23,19 @@
 
 #include "nnue_feature_transformer.h"
 
+#include "misc.h"
+
 #include <memory>
 
 namespace Eval::NNUE {
 
+  enum struct UseNNUEMode  // parsed value of the "Use NNUE" UCI option
+  {
+    False,  // option text "false"; also what any unrecognised text maps to
+    True,   // option text "true"
+    Pure    // option text "pure"
+  };
+
   // Hash value of evaluation function structure
   constexpr std::uint32_t kHashValue =
       FeatureTransformer::GetHashValue() ^ Network::GetHashValue();
@@ -40,11 +49,22 @@ namespace Eval::NNUE {
     }
   };
 
+  template <typename T>
+  struct LargePageDeleter {  // deleter used by LargePagePtr<T>
+    void operator()(T* ptr) const {
+      ptr->~T();                      // run T's destructor explicitly ...
+      aligned_large_pages_free(ptr);  // ... then return the storage through aligned_large_pages_free()
+    }
+  };
+
   template <typename T>
   using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;
 
+  template <typename T>
+  using LargePagePtr = std::unique_ptr<T, LargePageDeleter<T>>;
+
   // Input feature converter
-  extern AlignedPtr<FeatureTransformer> feature_transformer;
+  extern LargePagePtr<FeatureTransformer> feature_transformer;
 
   // Evaluation function
   extern AlignedPtr<Network> network;
@@ -55,16 +75,22 @@ namespace Eval::NNUE {
   // Saved evaluation function file name
   extern std::string savedfileName;
 
+  extern UseNNUEMode useNNUE;
+
+  extern std::string eval_file_loaded;
+
   // Get a string that represents the structure of the evaluation function
-  std::string GetArchitectureString();
+  std::string get_architecture_string();
+
+  std::string get_layers_info();
 
   // read the header
-  bool ReadHeader(std::istream& stream,
-    std::uint32_t* hash_value, std::string* architecture);
+  bool read_header(std::istream& stream,
+      std::uint32_t* hash_value, std::string* architecture);
 
   // write the header
-  bool WriteHeader(std::ostream& stream,
-    std::uint32_t hash_value, const std::string& architecture);
+  bool write_header(std::ostream& stream,
+      std::uint32_t hash_value, const std::string& architecture);
 
   // read evaluation function parameters
   bool ReadParameters(std::istream& stream);
@@ -72,6 +98,13 @@ namespace Eval::NNUE {
   // write evaluation function parameters
   bool WriteParameters(std::ostream& stream);
 
+  Value evaluate(const Position& pos);
+  bool load_eval(std::string name, std::istream& stream);
+  void init();
+
+  void verify_eval_file_loaded();
+  void verify_any_net_loaded();
+
 }  // namespace Eval::NNUE
 
 #endif // #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 650f443e..3061a4f4 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -1,231 +1,342 @@
-﻿// Code for learning NNUE evaluation function
-
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
-
-#include <random>
+﻿#include <random>
 #include <fstream>
-
-#include "../learn/learn.h"
-#include "../learn/learning_tools.h"
-
-#include "../position.h"
-#include "../uci.h"
-#include "../misc.h"
-#include "../thread_win32_osx.h"
-
-#include "../eval/evaluate_common.h"
+#include <filesystem>
 
 #include "evaluate_nnue.h"
 #include "evaluate_nnue_learner.h"
-#include "trainer/features/factorizer_feature_set.h"
-#include "trainer/features/factorizer_half_kp.h"
+
+#include "trainer/features/all_factorizers.h"
+
 #include "trainer/trainer_feature_transformer.h"
 #include "trainer/trainer_input_slice.h"
 #include "trainer/trainer_affine_transform.h"
 #include "trainer/trainer_clipped_relu.h"
 #include "trainer/trainer_sum.h"
 
-namespace Eval {
+#include "position.h"
+#include "uci.h"
+#include "misc.h"
+#include "thread_win32_osx.h"
+#include "thread.h"
 
-namespace NNUE {
+// Code for learning NNUE evaluation function
+namespace Eval::NNUE {
 
-namespace {
+    namespace {
 
-// learning data
-std::vector<Example> examples;
+        // learning data
+        std::vector<Example> examples;
 
-// Mutex for exclusive control of examples
-std::mutex examples_mutex;
+        // Mutex for exclusive control of examples
+        std::mutex examples_mutex;
 
-// number of samples in mini-batch
-uint64_t batch_size;
+        // number of samples in mini-batch
+        uint64_t batch_size;
 
-// random number generator
-std::mt19937 rng;
+        // random number generator
+        std::mt19937 rng;
 
-// learner
-std::shared_ptr<Trainer<Network>> trainer;
+        // learner
+        std::shared_ptr<Trainer<Network>> trainer;
 
-// Learning rate scale
-double global_learning_rate_scale;
+        // Tell the learner options such as hyperparameters
+        void send_messages(std::vector<Message> messages) {
+            for (auto& message : messages) {
+                trainer->send_message(&message);
+                assert(message.num_receivers > 0);
+            }
+        }
 
-// Get the learning rate scale
-double GetGlobalLearningRateScale() {
-  return global_learning_rate_scale;
-}
+    }  // namespace
 
-// Tell the learner options such as hyperparameters
-void SendMessages(std::vector<Message> messages) {
-  for (auto& message : messages) {
-    trainer->SendMessage(&message);
-    assert(message.num_receivers > 0);
-  }
-}
+    // Initialize learning
+    void initialize_training(
+        const std::string& seed,
+        SynchronizedRegionLogger::Region& out) {
 
-}  // namespace
+#if defined (OPENBLAS_VERSION)
+        openblas_set_num_threads(1);
+#elif defined (INTEL_MKL_VERSION)
+        mkl_set_num_threads(1);
+#endif
 
-// Initialize learning
-void InitializeTraining(double eta1, uint64_t eta1_epoch,
-                        double eta2, uint64_t eta2_epoch, double eta3) {
-  std::cout << "Initializing NN training for "
-            << GetArchitectureString() << std::endl;
+        out << "INFO (initialize_training): Initializing NN training for "
+            << get_architecture_string() << std::endl;
 
-  assert(feature_transformer);
-  assert(network);
-  trainer = Trainer<Network>::Create(network.get(), feature_transformer.get());
+        out << std::endl;
 
-  if (Options["SkipLoadingEval"]) {
-    trainer->Initialize(rng);
-  }
+        out << "Layers:\n"
+            << get_layers_info() << std::endl;
 
-  global_learning_rate_scale = 1.0;
-  EvalLearningTools::Weight::init_eta(eta1, eta2, eta3, eta1_epoch, eta2_epoch);
-}
+        out << std::endl;
 
-// set the number of samples in the mini-batch
-void SetBatchSize(uint64_t size) {
-  assert(size > 0);
-  batch_size = size;
-}
+        out << "Factorizers:\n"
+            << Features::Factorizer<RawFeatures>::get_factorizers_string() << std::endl;
 
-// set the learning rate scale
-void SetGlobalLearningRateScale(double scale) {
-  global_learning_rate_scale = scale;
-}
+        out << std::endl;
 
-// Set options such as hyperparameters
-void SetOptions(const std::string& options) {
-  std::vector<Message> messages;
-  for (const auto& option : Split(options, ',')) {
-    const auto fields = Split(option, '=');
-    assert(fields.size() == 1 || fields.size() == 2);
-    if (fields.size() == 1) {
-      messages.emplace_back(fields[0]);
-    } else {
-      messages.emplace_back(fields[0], fields[1]);
-    }
-  }
-  SendMessages(std::move(messages));
-}
+        assert(feature_transformer);
+        assert(network);
 
-// Reread the evaluation function parameters for learning from the file
-void RestoreParameters(const std::string& dir_name) {
-  const std::string file_name = Path::Combine(dir_name, NNUE::savedfileName);
-  std::ifstream stream(file_name, std::ios::binary);
-  bool result = ReadParameters(stream);
-  assert(result);
+        trainer = Trainer<Network>::create(network.get(), feature_transformer.get());
+        rng.seed(PRNG(seed).rand<uint64_t>());
 
-  SendMessages({{"reset"}});
-}
-
-// Add 1 sample of learning data
-void AddExample(Position& pos, Color rootColor,
-                const Learner::PackedSfenValue& psv, double weight) {
-  Example example;
-  if (rootColor == pos.side_to_move()) {
-    example.sign = 1;
-  } else {
-    example.sign = -1;
-  }
-  example.psv = psv;
-  example.weight = weight;
-
-  Features::IndexList active_indices[2];
-  for (const auto trigger : kRefreshTriggers) {
-    RawFeatures::AppendActiveIndices(pos, trigger, active_indices);
-  }
-  if (pos.side_to_move() != WHITE) {
-    active_indices[0].swap(active_indices[1]);
-  }
-  for (const auto color : Colors) {
-    std::vector<TrainingFeature> training_features;
-    for (const auto base_index : active_indices[color]) {
-      static_assert(Features::Factorizer<RawFeatures>::GetDimensions() <
-                    (1 << TrainingFeature::kIndexBits), "");
-      Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
-          base_index, &training_features);
-    }
-    std::sort(training_features.begin(), training_features.end());
-
-    auto& unique_features = example.training_features[color];
-    for (const auto& feature : training_features) {
-      if (!unique_features.empty() &&
-          feature.GetIndex() == unique_features.back().GetIndex()) {
-        unique_features.back() += feature;
-      } else {
-        unique_features.push_back(feature);
-      }
-    }
-  }
-
-  std::lock_guard<std::mutex> lock(examples_mutex);
-  examples.push_back(std::move(example));
-}
-
-// update the evaluation function parameters
-void UpdateParameters(uint64_t epoch) {
-  assert(batch_size > 0);
-
-  EvalLearningTools::Weight::calc_eta(epoch);
-  const auto learning_rate = static_cast<LearnFloatType>(
-      get_eta() / batch_size);
-
-  std::lock_guard<std::mutex> lock(examples_mutex);
-  std::shuffle(examples.begin(), examples.end(), rng);
-  while (examples.size() >= batch_size) {
-    std::vector<Example> batch(examples.end() - batch_size, examples.end());
-    examples.resize(examples.size() - batch_size);
-
-    const auto network_output = trainer->Propagate(batch);
-
-    std::vector<LearnFloatType> gradients(batch.size());
-    for (std::size_t b = 0; b < batch.size(); ++b) {
-      const auto shallow = static_cast<Value>(Round<std::int32_t>(
-          batch[b].sign * network_output[b] * kPonanzaConstant));
-      const auto& psv = batch[b].psv;
-      const double gradient = batch[b].sign * Learner::calc_grad(shallow, psv);
-      gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
+        if (Options["SkipLoadingEval"]) {
+            out << "INFO (initialize_training): Performing random net initialization.\n";
+            trainer->initialize(rng);
+        }
     }
 
-    trainer->Backpropagate(gradients.data(), learning_rate);
-  }
-  SendMessages({{"quantize_parameters"}});
-}
+    // set the number of samples in the mini-batch
+    void set_batch_size(uint64_t size) {
+        assert(size > 0);
+        batch_size = size;
+    }
 
-// Check if there are any problems with learning
-void CheckHealth() {
-  SendMessages({{"check_health"}});
-}
+    // Set options such as hyperparameters
+    void set_options(const std::string& options) {
+        std::vector<Message> messages;
+        for (const auto& option : Algo::split(options, ',')) {
+          const auto fields = Algo::split(option, '=');
+          assert(fields.size() == 1 || fields.size() == 2);
 
-}  // namespace NNUE
+          if (fields.size() == 1) {
+              messages.emplace_back(fields[0]);
+          } else {
+              messages.emplace_back(fields[0], fields[1]);
+          }
+        }
 
-// save merit function parameters to a file
-void save_eval(std::string dir_name) {
-  auto eval_dir = Path::Combine(Options["EvalSaveDir"], dir_name);
-  std::cout << "save_eval() start. folder = " << eval_dir << std::endl;
+        send_messages(std::move(messages));
+    }
 
-  // mkdir() will fail if this folder already exists, but
-  // Apart from that. If not, I just want you to make it.
-  // Also, assume that the folders up to EvalSaveDir have been dug.
-  Dependency::mkdir(eval_dir);
+    // Reread the evaluation function parameters for learning from the file
+    void restore_parameters(const std::string& dir_name) {
+        const std::string file_name = Path::combine(dir_name, NNUE::savedfileName);
+        std::ifstream stream(file_name, std::ios::binary);
+#ifndef NDEBUG
+        bool result =
+#endif
+        ReadParameters(stream);
+#ifndef NDEBUG
+        assert(result);
+#endif
 
-  if (Options["SkipLoadingEval"] && NNUE::trainer) {
-    NNUE::SendMessages({{"clear_unobserved_feature_weights"}});
-  }
+        send_messages({{"reset"}});
+    }
 
-  const std::string file_name = Path::Combine(eval_dir, NNUE::savedfileName);
-  std::ofstream stream(file_name, std::ios::binary);
-  const bool result = NNUE::WriteParameters(stream);
-  assert(result);
+    void finalize_net() {
+        send_messages({{"clear_unobserved_feature_weights"}});
+    }
 
-  std::cout << "save_eval() finished. folder = " << eval_dir << std::endl;
-}
+    // Add 1 sample of learning data
+    void add_example(
+        Position& pos,
+        Color rootColor,
+        Value discrete_nn_eval,
+        const Learner::PackedSfenValue& psv,
+        double weight) {
 
-// get the current eta
-double get_eta() {
-  return NNUE::GetGlobalLearningRateScale() * EvalLearningTools::Weight::eta;
-}
+        Example example;
+        if (rootColor == pos.side_to_move()) {
+            example.sign = 1;
+        } else {
+            example.sign = -1;
+        }
 
-}  // namespace Eval
+        example.discrete_nn_eval = discrete_nn_eval;
+        example.psv = psv;
+        example.weight = weight;
 
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+        Features::IndexList active_indices[2];
+        for (const auto trigger : kRefreshTriggers) {
+            RawFeatures::append_active_indices(pos, trigger, active_indices);
+        }
+
+        if (pos.side_to_move() != WHITE) {
+            active_indices[0].swap(active_indices[1]);
+        }
+
+        static thread_local std::vector<TrainingFeature> s_training_features;
+        auto& training_features = s_training_features;
+
+        for (const auto color : Colors) {
+            training_features.clear();
+
+            for (const auto base_index : active_indices[color]) {
+                static_assert(Features::Factorizer<RawFeatures>::get_dimensions() <
+                              (1 << TrainingFeature::kIndexBits), "");
+                Features::Factorizer<RawFeatures>::append_training_features(
+                    base_index, &training_features);
+            }
+
+            std::sort(training_features.begin(), training_features.end());
+
+            auto& unique_features = example.training_features[color];
+            unique_features.reserve(training_features.size());
+            for (const auto& feature : training_features) {
+                if (!unique_features.empty() &&
+                    feature.get_index() == unique_features.back().get_index()) {
+
+                    unique_features.back() += feature;
+                } else {
+                    unique_features.push_back(feature);
+                }
+            }
+        }
+
+        std::lock_guard<std::mutex> lock(examples_mutex);
+        examples.push_back(std::move(example));
+    }
+
+    // update the evaluation function parameters
+    Learner::Loss update_parameters(
+        ThreadPool& thread_pool,
+        uint64_t epoch,
+        bool verbose,
+        double learning_rate,
+        double max_grad,
+        Learner::CalcLossFunc calc_loss)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        assert(batch_size > 0);
+
+        learning_rate /= batch_size;
+
+        std::lock_guard<std::mutex> lock(examples_mutex);
+
+        double abs_eval_diff_sum = 0.0;
+        double abs_discrete_eval_sum = 0.0;
+        double gradient_norm = 0.0;
+
+        bool collect_stats = verbose;
+
+        Learner::Loss loss_sum{};
+
+        std::vector<double> abs_eval_diff_sum_local(thread_pool.size(), 0.0);
+        std::vector<double> abs_discrete_eval_sum_local(thread_pool.size(), 0.0);
+        std::vector<double> gradient_norm_local(thread_pool.size(), 0.0);
+        std::vector<Learner::Loss> loss_sum_local(thread_pool.size());
+
+        auto prev_batch_begin = examples.end();
+        while ((long)(prev_batch_begin - examples.begin()) >= (long)batch_size) {
+            auto batch_begin = prev_batch_begin - batch_size;
+            auto batch_end = prev_batch_begin;
+            auto size = batch_end - batch_begin;
+            const auto network_output = trainer->step_start(thread_pool, batch_begin, batch_end);
+            std::vector<LearnFloatType> gradients(size);
+
+            thread_pool.for_each_index_chunk_with_workers(
+                std::size_t(0), size,
+                [&](Thread& th, std::size_t offset, std::size_t count) {
+                    const auto thread_id = th.thread_idx();
+
+                    trainer->propagate(th, offset, count);
+
+                    for (std::size_t b = offset; b < offset + count; ++b) {
+                        const auto& e = *(batch_begin + b);
+                        const auto shallow = static_cast<Value>(round<std::int32_t>(
+                            e.sign * network_output[b] * kPonanzaConstant));
+                        const auto discrete = e.sign * e.discrete_nn_eval;
+                        const auto& psv = e.psv;
+                        auto loss = calc_loss(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
+                        loss.grad = std::clamp(
+                            loss.grad * e.sign * kPonanzaConstant * e.weight, -max_grad, max_grad);
+                        gradients[b] = static_cast<LearnFloatType>(loss.grad);
+                        loss_sum_local[thread_id] += loss;
+
+                        // The discrete eval will only be valid before the first backpropagation,
+                        // that is, only for the first batch.
+                        // Similarly, we only want gradients from one batch.
+                        if (collect_stats)
+                        {
+                            abs_eval_diff_sum_local[thread_id] += std::abs(discrete - shallow);
+                            abs_discrete_eval_sum_local[thread_id] += std::abs(discrete);
+                            gradient_norm_local[thread_id] += std::abs(loss.grad);
+                        }
+                    }
+
+                    trainer->backpropagate(th, gradients.data(), offset, count);
+                }
+            );
+
+            // We can asynchronously erase the examples that we used in the previous
+            // step. This can be done safely because we're no longer using these
+            // examples and erasing the tail won't invalidate iterators before it.
+            examples.erase(prev_batch_begin, examples.end());
+            prev_batch_begin = batch_begin;
+
+            thread_pool.wait_for_workers_finished();
+
+            trainer->step_end(thread_pool, learning_rate);
+
+            collect_stats = false;
+        }
+        examples.erase(prev_batch_begin, examples.end());
+
+        if (verbose)
+        {
+            abs_eval_diff_sum = std::accumulate(abs_eval_diff_sum_local.begin(), abs_eval_diff_sum_local.end(), 0.0);
+            abs_discrete_eval_sum = std::accumulate(abs_discrete_eval_sum_local.begin(), abs_discrete_eval_sum_local.end(), 0.0);
+            gradient_norm = std::accumulate(gradient_norm_local.begin(), gradient_norm_local.end(), 0.0);
+
+            const double avg_abs_eval_diff = abs_eval_diff_sum / batch_size;
+            const double avg_abs_discrete_eval = abs_discrete_eval_sum / batch_size;
+
+            auto out = sync_region_cout.new_region();
+
+            out << "INFO (update_parameters):"
+                << " epoch = " << epoch
+                << " , avg_abs(trainer_eval-nnue_eval) = " << avg_abs_eval_diff
+                << " , avg_abs(nnue_eval) = " << avg_abs_discrete_eval
+                << " , avg_relative_error = " << avg_abs_eval_diff / avg_abs_discrete_eval
+                << " , batch_size = " << batch_size
+                << " , grad_norm = " << gradient_norm
+                << std::endl;
+        } else {
+            // Display some progress but don't synchronize as
+            // we can't really decide when to release the output lock here
+            std::cout << '.';
+        }
+
+        send_messages({{"quantize_parameters"}});
+
+        for(auto& loss : loss_sum_local)
+        {
+            loss_sum += loss;
+        }
+
+        return loss_sum;
+    }
+
+    // Check if there are any problems with learning
+    void check_health() {
+        send_messages({{"check_health"}});
+    }
+
+    // save merit function parameters to a file
+    void save_eval(std::string dir_name) {
+        auto eval_dir = Path::combine(Options["EvalSaveDir"], dir_name);
+
+        auto out = sync_region_cout.new_region();
+
+        out << "INFO (save_eval): Saving current evaluation file in " << eval_dir << std::endl;
+
+        // create_directories() also creates any missing parent folders and
+        // is a no-op (not an error) when the folder already exists,
+        // so it is safe to call on every save.
+        std::filesystem::create_directories(eval_dir);
+
+        const std::string file_name = Path::combine(eval_dir, NNUE::savedfileName);
+        std::ofstream stream(file_name, std::ios::binary);
+#ifndef NDEBUG
+        bool result =
+#endif
+        WriteParameters(stream);
+#ifndef NDEBUG
+        assert(result);
+#endif
+        out << "INFO (save_eval): Finished saving evaluation file in " << eval_dir << std::endl;
+    }
+}  // namespace Eval::NNUE
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 1e4a463e..3d9f5b31 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -1,46 +1,52 @@
-﻿// Interface used for learning NNUE evaluation function
-
-#ifndef _EVALUATE_NNUE_LEARNER_H_
+﻿#ifndef _EVALUATE_NNUE_LEARNER_H_
 #define _EVALUATE_NNUE_LEARNER_H_
 
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#include "learn/learn.h"
 
-#include "../learn/learn.h"
+#include "misc.h"
 
-namespace Eval {
+struct ThreadPool;
 
-namespace NNUE {
+// Interface used for learning NNUE evaluation function
+namespace Eval::NNUE {
 
-// Initialize learning
-void InitializeTraining(double eta1, uint64_t eta1_epoch,
-                        double eta2, uint64_t eta2_epoch, double eta3);
+    // Initialize learning
+    void initialize_training(
+        const std::string& seed,
+        SynchronizedRegionLogger::Region& out);
 
-// set the number of samples in the mini-batch
-void SetBatchSize(uint64_t size);
+    // set the number of samples in the mini-batch
+    void set_batch_size(uint64_t size);
 
-// set the learning rate scale
-void SetGlobalLearningRateScale(double scale);
+    // Set options such as hyperparameters
+    void set_options(const std::string& options);
 
-// Set options such as hyperparameters
-void SetOptions(const std::string& options);
+    // Reread the evaluation function parameters for learning from the file
+    void restore_parameters(const std::string& dir_name);
 
-// Reread the evaluation function parameters for learning from the file
-void RestoreParameters(const std::string& dir_name);
+    // Add 1 sample of learning data
+    void add_example(
+        Position& pos,
+        Color rootColor,
+        Value discrete_nn_eval,
+    	const Learner::PackedSfenValue& psv,
+        double weight);
 
-// Add 1 sample of learning data
-void AddExample(Position& pos, Color rootColor,
-                const Learner::PackedSfenValue& psv, double weight);
+    // update the evaluation function parameters
+    Learner::Loss update_parameters(
+        ThreadPool& thread_pool,
+        uint64_t epoch,
+        bool verbose,
+        double learning_rate,
+        double max_grad,
+        Learner::CalcLossFunc calc_loss);
 
-// update the evaluation function parameters
-void UpdateParameters(uint64_t epoch);
+    // Check if there are any problems with learning
+    void check_health();
 
-// Check if there are any problems with learning
-void CheckHealth();
+    void finalize_net();
 
-}  // namespace NNUE
-
-}  // namespace Eval
-
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+    void save_eval(std::string suffix);
+}  // namespace Eval::NNUE
 
 #endif
diff --git a/src/nnue/features/a.cpp b/src/nnue/features/a.cpp
new file mode 100644
index 00000000..1bfb583f
--- /dev/null
+++ b/src/nnue/features/a.cpp
@@ -0,0 +1,54 @@
+﻿#include "a.h"
+#include "index_list.h"
+
+// Definition of input feature A of NNUE evaluation function
+namespace Eval::NNUE::Features {
+
+    // Orient a square according to perspective (rotate the board 180° for black)
+    // Important note for "halfka": this arch was designed with "flip" in mind 
+    // although it still is untested which approach is better.
+    // this has to stay until we find a better arch that works with "flip".
+    // allows us to use current master net for gensfen (primarily needed for higher quality data)
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * 63));
+    }
+
+    // Find the index of the feature quantity from the perspective, square and piece
+    inline IndexType A::make_index(
+        Color perspective, Square s, Piece pc) {
+        return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
+    }
+
+    // Get a list of indices with a value of 1 among the features
+    void A::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
+
+        Bitboard bb = pos.pieces();
+        while (bb) {
+            Square s = pop_lsb(&bb);
+            active->push_back(make_index(perspective, s, pos.piece_on(s)));
+        }
+    }
+
+    // Get a list of indices whose values have changed from the previous one in the feature quantity
+    void A::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
+
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];
+
+            if (dp.from[i] != SQ_NONE)
+              removed->push_back(make_index(perspective, dp.from[i], pc));
+
+            if (dp.to[i] != SQ_NONE)
+              added->push_back(make_index(perspective, dp.to[i], pc));
+        }
+    }
+
+}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/a.h b/src/nnue/features/a.h
new file mode 100644
index 00000000..50a0d8be
--- /dev/null
+++ b/src/nnue/features/a.h
@@ -0,0 +1,54 @@
+﻿#ifndef _NNUE_FEATURES_A_H_
+#define _NNUE_FEATURES_A_H_
+
+#include "features_common.h"
+
+#include "evaluate.h"
+
+// Definition of input feature A of NNUE evaluation function
+// A is a union of P features and K features, so technically the
+// same effect can be achieved by including both P and K features
+// but it would result in slower index appending because
+// P would conditionally exclude K features and vice versa,
+// where A doesn't have any conditionals.
+namespace Eval::NNUE::Features {
+
+    // Feature A: PieceSquare of all pieces, kings included
+    class A {
+    public:
+        // feature quantity name
+        static constexpr const char* kName = "A";
+
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue = 0x7A4C414Cu;
+
+        // number of feature dimensions
+        static constexpr IndexType kDimensions = PS_END2;
+
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions = 32;
+
+        // Timing of full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+
+        // Get a list of indices with a value of 1 among the features
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
+
+        // Get a list of indices whose values have changed from the previous one in the feature quantity
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+
+    private:
+        // Index of a feature for a given piece on some square
+        static IndexType make_index(Color perspective, Square s, Piece pc);
+    };
+
+}  // namespace Eval::NNUE::Features
+
+#endif // #ifndef _NNUE_FEATURES_A_H_
diff --git a/src/nnue/features/castling_right.cpp b/src/nnue/features/castling_right.cpp
index ee7b6576..cbac0851 100644
--- a/src/nnue/features/castling_right.cpp
+++ b/src/nnue/features/castling_right.cpp
@@ -1,73 +1,65 @@
-//Definition of input feature quantity K of NNUE evaluation function
-
-#if defined(EVAL_NNUE)
-
 #include "castling_right.h"
 #include "index_list.h"
 
-namespace Eval {
+//Definition of input feature quantity CastlingRight of NNUE evaluation function
+namespace Eval::NNUE::Features {
 
-  namespace NNUE {
+    // Get a list of indices with a value of 1 among the features
+    void CastlingRight::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
 
-    namespace Features {
-
-      // Get a list of indices with a value of 1 among the features
-      void CastlingRight::AppendActiveIndices(
-        const Position& pos, Color perspective, IndexList* active) {
         // do nothing if array size is small to avoid compiler warning
         if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
 
         int castling_rights = pos.state()->castlingRights;
         int relative_castling_rights;
         if (perspective == WHITE) {
-          relative_castling_rights = castling_rights;
+            relative_castling_rights = castling_rights;
         }
         else {
-          // Invert the perspective.
-          relative_castling_rights = ((castling_rights & 3) << 2)
-            & ((castling_rights >> 2) & 3);
+            // Invert the perspective by swapping the two low bits with the two bits above them.
+            relative_castling_rights = ((castling_rights & 3) << 2)
+                | ((castling_rights >> 2) & 3);
         }
 
-        for (int i = 0; i <kDimensions; ++i) {
-          if (relative_castling_rights & (i << 1)) {
-            active->push_back(i);
-          }
+        for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
+            if (relative_castling_rights & (1 << i)) {
+                active->push_back(i);
+            }
         }
-      }
+    }
 
-      // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-      void CastlingRight::AppendChangedIndices(
-        const Position& pos, Color perspective,
-        IndexList* removed, IndexList* added) {
+    // Get a list of indices whose values have changed from the previous one in the feature quantity
+    void CastlingRight::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* /* added */) {
 
         int previous_castling_rights = pos.state()->previous->castlingRights;
         int current_castling_rights = pos.state()->castlingRights;
         int relative_previous_castling_rights;
         int relative_current_castling_rights;
         if (perspective == WHITE) {
-          relative_previous_castling_rights = previous_castling_rights;
-          relative_current_castling_rights = current_castling_rights;
+            relative_previous_castling_rights = previous_castling_rights;
+            relative_current_castling_rights = current_castling_rights;
         }
         else {
-          // Invert the perspective.
-          relative_previous_castling_rights = ((previous_castling_rights & 3) << 2)
-            & ((previous_castling_rights >> 2) & 3);
-          relative_current_castling_rights = ((current_castling_rights & 3) << 2)
-            & ((current_castling_rights >> 2) & 3);
+            // Invert the perspective by swapping the two low bits with the two bits above them.
+            relative_previous_castling_rights = ((previous_castling_rights & 3) << 2)
+                | ((previous_castling_rights >> 2) & 3);
+            relative_current_castling_rights = ((current_castling_rights & 3) << 2)
+                | ((current_castling_rights >> 2) & 3);
         }
 
-        for (int i = 0; i < kDimensions; ++i) {
-          if ((relative_previous_castling_rights & (i << 1)) &&
-            (relative_current_castling_rights & (i << 1)) == 0) {
-            removed->push_back(i);
-          }
+        for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
+            if ((relative_previous_castling_rights & (1 << i)) &&
+                (relative_current_castling_rights & (1 << i)) == 0) {
+                removed->push_back(i);
+            }
         }
-      }
+    }
 
-    }  // namespace Features
-
-  }  // namespace NNUE
-
-}  // namespace Eval
-
-#endif  // defined(EVAL_NNUE)
+}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/castling_right.h b/src/nnue/features/castling_right.h
index 3af5b074..cada24b6 100644
--- a/src/nnue/features/castling_right.h
+++ b/src/nnue/features/castling_right.h
@@ -1,48 +1,44 @@
-//Definition of input feature quantity K of NNUE evaluation function
-
 #ifndef _NNUE_FEATURES_CASTLING_RIGHT_H_
 #define _NNUE_FEATURES_CASTLING_RIGHT_H_
 
-#if defined(EVAL_NNUE)
-
-#include "../../evaluate.h"
 #include "features_common.h"
 
-namespace Eval {
+#include "evaluate.h"
 
-  namespace NNUE {
+//Definition of input feature quantity CastlingRight of NNUE evaluation function
+namespace Eval::NNUE::Features {
 
-    namespace Features {
-
-      // Feature K: Ball position
-      class CastlingRight {
-      public:
+    class CastlingRight {
+    public:
         // feature quantity name
         static constexpr const char* kName = "CastlingRight";
+
         // Hash value embedded in the evaluation function file
         static constexpr std::uint32_t kHashValue = 0x913968AAu;
+
         // number of feature dimensions
         static constexpr IndexType kDimensions = 4;
+
         // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
         static constexpr IndexType kMaxActiveDimensions = 4;
+
         // Timing of full calculation instead of difference calculation
         static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
         // Get a list of indices with a value of 1 among the features
-        static void AppendActiveIndices(const Position& pos, Color perspective,
-          IndexList* active);
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
 
-        // Get a list of indices whose values ??have changed from the previous one in the feature quantity
-        static void AppendChangedIndices(const Position& pos, Color perspective,
-          IndexList* removed, IndexList* added);
-      };
+        // Get a list of indices whose values have changed from the previous one in the feature quantity
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+    };
 
-    }  // namespace Features
-
-  }  // namespace NNUE
-
-}  // namespace Eval
-
-#endif  // defined(EVAL_NNUE)
+}  // namespace Eval::NNUE::Features
 
 #endif
diff --git a/src/nnue/features/enpassant.cpp b/src/nnue/features/enpassant.cpp
index ea70529a..06ba2d49 100644
--- a/src/nnue/features/enpassant.cpp
+++ b/src/nnue/features/enpassant.cpp
@@ -1,47 +1,49 @@
-//Definition of input feature quantity K of NNUE evaluation function
-
-#if defined(EVAL_NNUE)
-
 #include "enpassant.h"
 #include "index_list.h"
 
-namespace Eval {
+//Definition of input feature quantity EnPassant of NNUE evaluation function
+namespace Eval::NNUE::Features {
 
-  namespace NNUE {
+    // Get a list of indices with a value of 1 among the features
+    void EnPassant::append_active_indices(
+        const Position& pos,
+        Color /* perspective */,
+        IndexList* active) {
 
-    namespace Features {
-
-      // Get a list of indices with a value of 1 among the features
-      void EnPassant::AppendActiveIndices(
-        const Position& pos, Color perspective, IndexList* active) {
         // do nothing if array size is small to avoid compiler warning
-        if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
+        if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions)
+            return;
 
         auto epSquare = pos.state()->epSquare;
-        if (epSquare == SQ_NONE) {
-          return;
-        }
-
-        if (perspective == BLACK) {
-          epSquare = rotate180(epSquare);
-        }
+        if (epSquare == SQ_NONE)
+            return;
 
         auto file = file_of(epSquare);
         active->push_back(file);
-      }
+    }
 
-      // Get a list of indices whose values ??have changed from the previous one in the feature quantity
-      void EnPassant::AppendChangedIndices(
-        const Position& pos, Color perspective,
-        IndexList* removed, IndexList* added) {
-        // Not implemented.
-        assert(false);
-      }
+    // Get a list of indices whose values have changed from the previous one in the feature quantity
+    void EnPassant::append_changed_indices(
+        const Position& pos,
+        Color /* perspective */,
+        IndexList* removed,
+        IndexList* added) {
 
-    }  // namespace Features
+        auto previous_epSquare = pos.state()->previous->epSquare;
+        auto epSquare = pos.state()->epSquare;
 
-  }  // namespace NNUE
+        if (previous_epSquare != SQ_NONE) {
+            if (epSquare != SQ_NONE && file_of(epSquare) == file_of(previous_epSquare))
+                return;
 
-}  // namespace Eval
+            auto file = file_of(previous_epSquare);
+            removed->push_back(file);
+        }
 
-#endif  // defined(EVAL_NNUE)
+        if (epSquare != SQ_NONE) {
+            auto file = file_of(epSquare);
+            added->push_back(file);
+        }
+    }
+
+}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/enpassant.h b/src/nnue/features/enpassant.h
index f77f9c4f..6ccb6046 100644
--- a/src/nnue/features/enpassant.h
+++ b/src/nnue/features/enpassant.h
@@ -1,22 +1,15 @@
-//Definition of input feature quantity K of NNUE evaluation function
-
 #ifndef _NNUE_FEATURES_ENPASSANT_H_
 #define _NNUE_FEATURES_ENPASSANT_H_
 
-#if defined(EVAL_NNUE)
-
-#include "../../evaluate.h"
 #include "features_common.h"
 
-namespace Eval {
+#include "evaluate.h"
 
-  namespace NNUE {
+//Definition of input feature quantity EnPassant of NNUE evaluation function
+namespace Eval::NNUE::Features {
 
-    namespace Features {
-
-      // Feature K: Ball position
-      class EnPassant {
-      public:
+    class EnPassant {
+    public:
         // feature quantity name
         static constexpr const char* kName = "EnPassant";
         // Hash value embedded in the evaluation function file
@@ -26,23 +19,22 @@ namespace Eval {
         // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
         static constexpr IndexType kMaxActiveDimensions = 1;
         // Timing of full calculation instead of difference calculation
-        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kAnyPieceMoved;
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
         // Get a list of indices with a value of 1 among the features
-        static void AppendActiveIndices(const Position& pos, Color perspective,
-          IndexList* active);
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
 
-        // Get a list of indices whose values ??have changed from the previous one in the feature quantity
-        static void AppendChangedIndices(const Position& pos, Color perspective,
-          IndexList* removed, IndexList* added);
-      };
+        // Get a list of indices whose values have changed from the previous one in the feature quantity
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+    };
 
-    }  // namespace Features
-
-  }  // namespace NNUE
-
-}  // namespace Eval
-
-#endif  // defined(EVAL_NNUE)
+}  // namespace Eval::NNUE::Features
 
 #endif
diff --git a/src/nnue/features/feature_set.h b/src/nnue/features/feature_set.h
index b933d2d9..6602eae7 100644
--- a/src/nnue/features/feature_set.h
+++ b/src/nnue/features/feature_set.h
@@ -26,222 +26,276 @@
 
 namespace Eval::NNUE::Features {
 
-  // Class template that represents a list of values
-  template <typename T, T... Values>
-  struct CompileTimeList;
+    // Class template that represents a list of values
+    template <typename T, T... Values>
+    struct CompileTimeList;
 
-  template <typename T, T First, T... Remaining>
-  struct CompileTimeList<T, First, Remaining...> {
-    static constexpr bool Contains(T value) {
-      return value == First || CompileTimeList<T, Remaining...>::Contains(value);
-    }
-    static constexpr std::array<T, sizeof...(Remaining) + 1>
-        kValues = {{First, Remaining...}};
-  };
-
-  template <typename T, T First, T... Remaining>
-  constexpr std::array<T, sizeof...(Remaining) + 1>
-    CompileTimeList<T, First, Remaining...>::kValues;
-  template <typename T>
-  struct CompileTimeList<T> {
-    static constexpr bool Contains(T /*value*/) {
-      return false;
-    }
-    static constexpr std::array<T, 0> kValues = { {} };
-  };
-
-  // Class template that adds to the beginning of the list
-  template <typename T, typename ListType, T Value>
-  struct AppendToList;
-  template <typename T, T... Values, T AnotherValue>
-  struct AppendToList<T, CompileTimeList<T, Values...>, AnotherValue> {
-    using Result = CompileTimeList<T, AnotherValue, Values...>;
-  };
-
-  // Class template for adding to a sorted, unique list
-  template <typename T, typename ListType, T Value>
-  struct InsertToSet;
-  template <typename T, T First, T... Remaining, T AnotherValue>
-  struct InsertToSet<T, CompileTimeList<T, First, Remaining...>, AnotherValue> {
-    using Result = std::conditional_t<
-      CompileTimeList<T, First, Remaining...>::Contains(AnotherValue),
-      CompileTimeList<T, First, Remaining...>,
-      std::conditional_t<(AnotherValue < First),
-      CompileTimeList<T, AnotherValue, First, Remaining...>,
-      typename AppendToList<T, typename InsertToSet<
-      T, CompileTimeList<T, Remaining...>, AnotherValue>::Result,
-      First>::Result>>;
-  };
-  template <typename T, T Value>
-  struct InsertToSet<T, CompileTimeList<T>, Value> {
-    using Result = CompileTimeList<T, Value>;
-  };
-
-  // Base class of feature set
-  template <typename Derived>
-  class FeatureSetBase {
-
-   public:
-    // Get a list of indices for active features
-    template <typename IndexListType>
-    static void AppendActiveIndices(
-        const Position& pos, TriggerEvent trigger, IndexListType active[2]) {
-
-      for (Color perspective : { WHITE, BLACK }) {
-        Derived::CollectActiveIndices(
-            pos, trigger, perspective, &active[perspective]);
-      }
-    }
-
-    // Get a list of indices for recently changed features
-    template <typename PositionType, typename IndexListType>
-    static void AppendChangedIndices(
-        const PositionType& pos, TriggerEvent trigger,
-        IndexListType removed[2], IndexListType added[2], bool reset[2]) {
-
-      const auto& dp = pos.state()->dirtyPiece;
-      if (dp.dirty_num == 0) return;
-
-      for (Color perspective : { WHITE, BLACK }) {
-        reset[perspective] = false;
-        switch (trigger) {
-          case TriggerEvent::kFriendKingMoved:
-            reset[perspective] = dp.piece[0] == make_piece(perspective, KING);
-            break;
-          default:
-            assert(false);
-            break;
+    template <typename T, T First, T... Remaining>
+    struct CompileTimeList<T, First, Remaining...> {
+        static constexpr bool contains(T value) {
+            return value == First || CompileTimeList<T, Remaining...>::contains(value);
         }
-        if (reset[perspective]) {
-          Derived::CollectActiveIndices(
-              pos, trigger, perspective, &added[perspective]);
-        } else {
-          Derived::CollectChangedIndices(
-              pos, trigger, perspective,
-              &removed[perspective], &added[perspective]);
+
+        static constexpr std::array<T, sizeof...(Remaining) + 1>
+            kValues = {{First, Remaining...}};
+    };
+
+    template <typename T, T First, T... Remaining>
+    constexpr std::array<T, sizeof...(Remaining) + 1>
+        CompileTimeList<T, First, Remaining...>::kValues;
+
+    template <typename T>
+    struct CompileTimeList<T> {
+        static constexpr bool contains(T /*value*/) {
+            return false;
         }
-      }
-    }
-  };
+        static constexpr std::array<T, 0> kValues = { {} };
+    };
 
-  // Class template that represents the feature set
-  // do internal processing in reverse order of template arguments in order to linearize the amount of calculation at runtime
-  template <typename FirstFeatureType, typename... RemainingFeatureTypes>
-  class FeatureSet<FirstFeatureType, RemainingFeatureTypes...> :
-    public FeatureSetBase<
-    FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
-  private:
-    using Head = FirstFeatureType;
-    using Tail = FeatureSet<RemainingFeatureTypes...>;
+    // Class template that adds to the beginning of the list
+    template <typename T, typename ListType, T Value>
+    struct AppendToList;
 
-  public:
-    // Hash value embedded in the evaluation function file
-    static constexpr std::uint32_t kHashValue =
-      Head::kHashValue ^ (Tail::kHashValue << 1) ^ (Tail::kHashValue >> 31);
-    // number of feature dimensions
-    static constexpr IndexType kDimensions =
-      Head::kDimensions + Tail::kDimensions;
-    // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-    static constexpr IndexType kMaxActiveDimensions =
-      Head::kMaxActiveDimensions + Tail::kMaxActiveDimensions;
-    // List of timings to perform all calculations instead of difference calculation
-    using SortedTriggerSet = typename InsertToSet<TriggerEvent,
-      typename Tail::SortedTriggerSet, Head::kRefreshTrigger>::Result;
-    static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+    template <typename T, T... Values, T AnotherValue>
+    struct AppendToList<T, CompileTimeList<T, Values...>, AnotherValue> {
+        using Result = CompileTimeList<T, AnotherValue, Values...>;
+    };
 
-    // Get the feature quantity name
-    static std::string GetName() {
-      return std::string(Head::kName) + "+" + Tail::GetName();
-    }
+    // Class template for adding to a sorted, unique list
+    template <typename T, typename ListType, T Value>
+    struct InsertToSet;
 
-  private:
-    // Get a list of indices with a value of 1 among the features
-    template <typename IndexListType>
-    static void CollectActiveIndices(
-      const Position& pos, const TriggerEvent trigger, const Color perspective,
-      IndexListType* const active) {
-      Tail::CollectActiveIndices(pos, trigger, perspective, active);
-      if (Head::kRefreshTrigger == trigger) {
-        const auto start = active->size();
-        Head::AppendActiveIndices(pos, perspective, active);
-        for (auto i = start; i < active->size(); ++i) {
-          (*active)[i] += Tail::kDimensions;
+    template <typename T, T First, T... Remaining, T AnotherValue>
+    struct InsertToSet<T, CompileTimeList<T, First, Remaining...>, AnotherValue> {
+        using Result =
+            std::conditional_t<
+                CompileTimeList<T, First, Remaining...>::contains(AnotherValue),
+                CompileTimeList<T, First, Remaining...>,
+                std::conditional_t<
+                    (AnotherValue < First),
+                    CompileTimeList<T, AnotherValue, First, Remaining...>,
+                    typename AppendToList<T, typename InsertToSet<
+                        T, CompileTimeList<T, Remaining...>, AnotherValue>::Result,
+                        First
+                    >::Result
+                >
+            >;
+    };
+
+    template <typename T, T Value>
+    struct InsertToSet<T, CompileTimeList<T>, Value> {
+        using Result = CompileTimeList<T, Value>;
+    };
+
+    // Base class of feature set
+    template <typename Derived>
+    class FeatureSetBase {
+
+       public:
+        // Get a list of indices for active features
+        template <typename IndexListType>
+        static void append_active_indices(
+            const Position& pos, TriggerEvent trigger, IndexListType active[2]) {
+
+            for (Color perspective : { WHITE, BLACK }) {
+                Derived::collect_active_indices(
+                    pos, trigger, perspective, &active[perspective]);
+            }
         }
-      }
-    }
 
-    // Get a list of indices whose values have changed from the previous one in the feature quantity
-    template <typename IndexListType>
-    static void CollectChangedIndices(
-      const Position& pos, const TriggerEvent trigger, const Color perspective,
-      IndexListType* const removed, IndexListType* const added) {
-      Tail::CollectChangedIndices(pos, trigger, perspective, removed, added);
-      if (Head::kRefreshTrigger == trigger) {
-        const auto start_removed = removed->size();
-        const auto start_added = added->size();
-        Head::AppendChangedIndices(pos, perspective, removed, added);
-        for (auto i = start_removed; i < removed->size(); ++i) {
-          (*removed)[i] += Tail::kDimensions;
+        // Get a list of indices for recently changed features; reset[perspective] is set when a full refresh is required
+        template <typename PositionType, typename IndexListType>
+        static void append_changed_indices(
+            const PositionType& pos,
+            TriggerEvent trigger,
+            IndexListType removed[2],
+            IndexListType added[2],
+            bool reset[2]) {
+
+            const auto& dp = pos.state()->dirtyPiece;
+            for (Color perspective : { WHITE, BLACK }) {
+                reset[perspective] = false; // always define the out-flag: kNone and the early-continue paths never write it
+                switch (trigger) {
+                    case TriggerEvent::kNone:
+                        break;
+                    case TriggerEvent::kFriendKingMoved:
+                        if (dp.dirty_num == 0) continue;
+                        reset[perspective] = dp.piece[0] == make_piece(perspective, KING);
+                        break;
+                    case TriggerEvent::kEnemyKingMoved:
+                        if (dp.dirty_num == 0) continue;
+                        reset[perspective] = dp.piece[0] == make_piece(~perspective, KING);
+                        break;
+                    case TriggerEvent::kAnyKingMoved:
+                        if (dp.dirty_num == 0) continue;
+                        reset[perspective] = type_of(dp.piece[0]) == KING;
+                        break;
+                    case TriggerEvent::kAnyPieceMoved:
+                        reset[perspective] = true;
+                        break;
+                    default:
+                        assert(false);
+                        break;
+                }
+
+                if (reset[perspective]) {
+                    Derived::collect_active_indices(
+                        pos, trigger, perspective, &added[perspective]);
+                } else {
+                    Derived::collect_changed_indices(
+                        pos, trigger, perspective,
+                        &removed[perspective], &added[perspective]);
+                }
+            }
+        }
-        for (auto i = start_added; i < added->size(); ++i) {
-          (*added)[i] += Tail::kDimensions;
+    };
+
+    // Class template that represents the feature set.
+    // Internal processing is done in reverse order of the template arguments to keep the runtime cost linear.
+    template <typename FirstFeatureType, typename... RemainingFeatureTypes>
+    class FeatureSet<FirstFeatureType, RemainingFeatureTypes...> :
+      public FeatureSetBase<
+          FeatureSet<FirstFeatureType, RemainingFeatureTypes...>
+      > {
+
+    private:
+        using Head = FirstFeatureType;
+        using Tail = FeatureSet<RemainingFeatureTypes...>;
+
+    public:
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue =
+            Head::kHashValue ^ (Tail::kHashValue << 1) ^ (Tail::kHashValue >> 31);
+
+        // number of feature dimensions
+        static constexpr IndexType kDimensions =
+            Head::kDimensions + Tail::kDimensions;
+
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions =
+            Head::kMaxActiveDimensions + Tail::kMaxActiveDimensions;
+
+        // List of timings to perform all calculations instead of difference calculation
+        using SortedTriggerSet = typename InsertToSet<TriggerEvent,
+            typename Tail::SortedTriggerSet, Head::kRefreshTrigger>::Result;
+
+        static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+
+        // Get the feature quantity name
+        static std::string get_name() {
+            return std::string(Head::kName) + "+" + Tail::get_name();
         }
-      }
-    }
 
-    // Make the base class and the class template that recursively uses itself a friend
-    friend class FeatureSetBase<FeatureSet>;
-    template <typename... FeatureTypes>
-    friend class FeatureSet;
-  };
+    private:
+        // Get a list of indices with a value of 1 among the features
+        template <typename IndexListType>
+        static void collect_active_indices(
+            const Position& pos,
+            const TriggerEvent trigger,
+            const Color perspective,
+            IndexListType* const active) {
 
-  // Class template that represents the feature set
-  template <typename FeatureType>
-  class FeatureSet<FeatureType> : public FeatureSetBase<FeatureSet<FeatureType>> {
+            Tail::collect_active_indices(pos, trigger, perspective, active);
+            if (Head::kRefreshTrigger == trigger) {
+                const auto start = active->size();
+                Head::append_active_indices(pos, perspective, active);
 
-   public:
-    // Hash value embedded in the evaluation file
-    static constexpr std::uint32_t kHashValue = FeatureType::kHashValue;
-    // Number of feature dimensions
-    static constexpr IndexType kDimensions = FeatureType::kDimensions;
-    // Maximum number of simultaneously active features
-    static constexpr IndexType kMaxActiveDimensions =
-        FeatureType::kMaxActiveDimensions;
-    // Trigger for full calculation instead of difference calculation
-    using SortedTriggerSet =
-        CompileTimeList<TriggerEvent, FeatureType::kRefreshTrigger>;
-    static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+                for (auto i = start; i < active->size(); ++i) {
+                    (*active)[i] += Tail::kDimensions;
+                }
+            }
+        }
 
-    // Get the feature quantity name
-    static std::string GetName() {
-      return FeatureType::kName;
-    }
+        // Get a list of indices whose values have changed from the previous one in the feature quantity
+        template <typename IndexListType>
+        static void collect_changed_indices(
+            const Position& pos,
+            const TriggerEvent trigger,
+            const Color perspective,
+            IndexListType* const removed,
+            IndexListType* const added) {
 
-   private:
-    // Get a list of indices for active features
-    static void CollectActiveIndices(
-        const Position& pos, const TriggerEvent trigger, const Color perspective,
-        IndexList* const active) {
-      if (FeatureType::kRefreshTrigger == trigger) {
-        FeatureType::AppendActiveIndices(pos, perspective, active);
-      }
-    }
+            Tail::collect_changed_indices(pos, trigger, perspective, removed, added);
+            if (Head::kRefreshTrigger == trigger) {
+                const auto start_removed = removed->size();
+                const auto start_added = added->size();
+                Head::append_changed_indices(pos, perspective, removed, added);
 
-    // Get a list of indices for recently changed features
-    static void CollectChangedIndices(
-        const Position& pos, const TriggerEvent trigger, const Color perspective,
-        IndexList* const removed, IndexList* const added) {
+                for (auto i = start_removed; i < removed->size(); ++i) {
+                    (*removed)[i] += Tail::kDimensions;
+                }
 
-      if (FeatureType::kRefreshTrigger == trigger) {
-        FeatureType::AppendChangedIndices(pos, perspective, removed, added);
-      }
-    }
+                for (auto i = start_added; i < added->size(); ++i) {
+                    (*added)[i] += Tail::kDimensions;
+                }
+            }
+        }
 
-    // Make the base class and the class template that recursively uses itself a friend
-    friend class FeatureSetBase<FeatureSet>;
-    template <typename... FeatureTypes>
-    friend class FeatureSet;
-  };
+        // Make the base class and the class template that recursively uses itself a friend
+        friend class FeatureSetBase<FeatureSet>;
+
+        template <typename... FeatureTypes>
+        friend class FeatureSet;
+    };
+
+    // Class template that represents the feature set
+    template <typename FeatureType>
+    class FeatureSet<FeatureType> : public FeatureSetBase<FeatureSet<FeatureType>> {
+
+    public:
+        // Hash value embedded in the evaluation file
+        static constexpr std::uint32_t kHashValue = FeatureType::kHashValue;
+
+        // Number of feature dimensions
+        static constexpr IndexType kDimensions = FeatureType::kDimensions;
+
+        // Maximum number of simultaneously active features
+        static constexpr IndexType kMaxActiveDimensions =
+            FeatureType::kMaxActiveDimensions;
+
+        // Trigger for full calculation instead of difference calculation
+        using SortedTriggerSet =
+            CompileTimeList<TriggerEvent, FeatureType::kRefreshTrigger>;
+
+        static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+
+        // Get the feature quantity name
+        static std::string get_name() {
+            return FeatureType::kName;
+        }
+
+    private:
+        // Get a list of indices for active features
+        static void collect_active_indices(
+            const Position& pos,
+            const TriggerEvent trigger,
+            const Color perspective,
+            IndexList* const active) {
+
+            if (FeatureType::kRefreshTrigger == trigger) {
+              FeatureType::append_active_indices(pos, perspective, active);
+            }
+        }
+
+        // Get a list of indices for recently changed features
+        static void collect_changed_indices(
+            const Position& pos,
+            const TriggerEvent trigger,
+            const Color perspective,
+            IndexList* const removed,
+            IndexList* const added) {
+
+            if (FeatureType::kRefreshTrigger == trigger) {
+              FeatureType::append_changed_indices(pos, perspective, removed, added);
+            }
+        }
+
+        // Make the base class and the class template that recursively uses itself a friend
+        friend class FeatureSetBase<FeatureSet>;
+
+        template <typename... FeatureTypes>
+        friend class FeatureSet;
+    };
 
 }  // namespace Eval::NNUE::Features
 
diff --git a/src/nnue/features/features_common.h b/src/nnue/features/features_common.h
index 3377cd8f..656502a3 100644
--- a/src/nnue/features/features_common.h
+++ b/src/nnue/features/features_common.h
@@ -34,10 +34,10 @@ namespace Eval::NNUE::Features {
   // Trigger to perform full calculations instead of difference only
   enum class TriggerEvent {
     kNone, // Calculate the difference whenever possible
-    kFriendKingMoved, // calculate all when own ball moves
-    kEnemyKingMoved, // do all calculations when enemy balls move
-    kAnyKingMoved, // do all calculations if either ball moves
-    kAnyPieceMoved, // always do all calculations
+    kFriendKingMoved, // calculate full evaluation when own king moves
+    kEnemyKingMoved, // calculate full evaluation when opponent king moves
+    kAnyKingMoved, // calculate full evaluation when any king moves
+    kAnyPieceMoved, // always calculate full evaluation
   };
 
   enum class Side {
diff --git a/src/nnue/features/half_ka.cpp b/src/nnue/features/half_ka.cpp
new file mode 100644
index 00000000..08124b96
--- /dev/null
+++ b/src/nnue/features/half_ka.cpp
@@ -0,0 +1,93 @@
+/*
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+//Definition of input features HalfKA of NNUE evaluation function
+
+#include "half_ka.h"
+#include "index_list.h"
+
+namespace Eval::NNUE::Features {
+
+    // Orient a square according to perspective (rotate the board 180° for black).
+    // Important note for "halfka": this architecture was designed with "flip" in mind,
+    // although it is still untested which approach is better.
+    // This rotation has to stay until we find a better architecture that works with "flip";
+    // it allows us to use the current master net for gensfen (primarily needed for higher quality data).
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * 63));
+    }
+
+    // Find the index of the feature quantity from the king position and PieceSquare
+    template <Side AssociatedKing>
+    inline IndexType HalfKA<AssociatedKing>::make_index(
+        Color perspective,
+        Square s,
+        Piece pc,
+        Square ksq) {
+
+        return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective] + PS_END2 * ksq);
+    }
+
+    // Get a list of indices for active features
+    template <Side AssociatedKing>
+    void HalfKA<AssociatedKing>::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
+
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));
+
+        Bitboard bb = pos.pieces();
+        while (bb) {
+            Square s = pop_lsb(&bb);
+            active->push_back(make_index(perspective, s, pos.piece_on(s), ksq));
+        }
+    }
+
+    // Get a list of indices for recently changed features
+    template <Side AssociatedKing>
+    void HalfKA<AssociatedKing>::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
+
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));
+
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];
+
+            if (dp.from[i] != SQ_NONE)
+                removed->push_back(make_index(perspective, dp.from[i], pc, ksq));
+
+            if (dp.to[i] != SQ_NONE)
+                added->push_back(make_index(perspective, dp.to[i], pc, ksq));
+        }
+    }
+
+    template class HalfKA<Side::kFriend>;
+    template class HalfKA<Side::kEnemy>;
+
+}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/half_ka.h b/src/nnue/features/half_ka.h
new file mode 100644
index 00000000..2839357e
--- /dev/null
+++ b/src/nnue/features/half_ka.h
@@ -0,0 +1,75 @@
+/*
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef NNUE_FEATURES_HALF_KA_H_INCLUDED
+#define NNUE_FEATURES_HALF_KA_H_INCLUDED
+
+#include "features_common.h"
+
+#include "evaluate.h"
+
+// Definition of input features HalfKA of NNUE evaluation function
+namespace Eval::NNUE::Features {
+
+    // Feature HalfKA: Combination of the position of the associated king
+    // and the position of every piece on the board (kings included)
+    template <Side AssociatedKing>
+    class HalfKA {
+
+    public:
+        // Feature name
+        static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
+            "HalfKA(Friend)" : "HalfKA(Enemy)";
+
+        // Hash value embedded in the evaluation file
+        static constexpr std::uint32_t kHashValue =
+            0x5F134CB9u ^ (AssociatedKing == Side::kFriend);
+
+        // Number of feature dimensions
+        static constexpr IndexType kDimensions =
+            static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_END2);
+
+        // Maximum number of simultaneously active features
+        static constexpr IndexType kMaxActiveDimensions = 32;
+
+        // Trigger for full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger =
+            (AssociatedKing == Side::kFriend) ?
+            TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+
+        // Get a list of indices for active features
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
+
+        // Get a list of indices for recently changed features
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+
+    private:
+        // Index of a feature for a given king position and another piece on some square
+        static IndexType make_index(Color perspective, Square s, Piece pc, Square sq_k);
+    };
+
+}  // namespace Eval::NNUE::Features
+
+#endif // #ifndef NNUE_FEATURES_HALF_KA_H_INCLUDED
diff --git a/src/nnue/features/half_kp.cpp b/src/nnue/features/half_kp.cpp
index 88e384a3..743a6378 100644
--- a/src/nnue/features/half_kp.cpp
+++ b/src/nnue/features/half_kp.cpp
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 //Definition of input features HalfKP of NNUE evaluation function
@@ -23,50 +23,72 @@
 
 namespace Eval::NNUE::Features {
 
-  // Orient a square according to perspective (rotates by 180 for black)
-  inline Square orient(Color perspective, Square s) {
-    return Square(int(s) ^ (bool(perspective) * 63));
-  }
-
-  // Find the index of the feature quantity from the king position and PieceSquare
-  template <Side AssociatedKing>
-  inline IndexType HalfKP<AssociatedKing>::MakeIndex(
-      Color perspective, Square s, Piece pc, Square ksq) {
-
-    return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective] + PS_END * ksq);
-  }
-
-  // Get a list of indices for active features
-  template <Side AssociatedKing>
-  void HalfKP<AssociatedKing>::AppendActiveIndices(
-      const Position& pos, Color perspective, IndexList* active) {
-
-    Square ksq = orient(perspective, pos.square<KING>(perspective));
-    Bitboard bb = pos.pieces() & ~pos.pieces(KING);
-    while (bb) {
-      Square s = pop_lsb(&bb);
-      active->push_back(MakeIndex(perspective, s, pos.piece_on(s), ksq));
+    // Orient a square according to perspective (rotate the board 180° for black).
+    // The rotation has to stay until we find a better arch that works with "flip";
+    // it lets us use the current master net for gensfen (primarily needed for higher quality data).
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * 63));
     }
-  }
 
-  // Get a list of indices for recently changed features
-  template <Side AssociatedKing>
-  void HalfKP<AssociatedKing>::AppendChangedIndices(
-      const Position& pos, Color perspective,
-      IndexList* removed, IndexList* added) {
+    // Find the index of the feature quantity from the king position and PieceSquare
+    template <Side AssociatedKing>
+    inline IndexType HalfKP<AssociatedKing>::make_index(
+        Color perspective,
+        Square s,
+        Piece pc,
+        Square ksq) {
 
-    Square ksq = orient(perspective, pos.square<KING>(perspective));
-    const auto& dp = pos.state()->dirtyPiece;
-    for (int i = 0; i < dp.dirty_num; ++i) {
-      Piece pc = dp.piece[i];
-      if (type_of(pc) == KING) continue;
-      if (dp.from[i] != SQ_NONE)
-        removed->push_back(MakeIndex(perspective, dp.from[i], pc, ksq));
-      if (dp.to[i] != SQ_NONE)
-        added->push_back(MakeIndex(perspective, dp.to[i], pc, ksq));
+        return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective] + PS_END * ksq);
     }
-  }
 
-  template class HalfKP<Side::kFriend>;
+    // Append to `active` one feature index per non-king piece on the board, relative to the associated king
+    template <Side AssociatedKing>
+    void HalfKP<AssociatedKing>::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
+
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));  // Friend: perspective's own king; Enemy: the other king
+
+        Bitboard bb = pos.pieces() & ~pos.pieces(KING);  // every piece except the kings
+        while (bb) {
+            Square s = pop_lsb(&bb);  // take the next occupied square out of the bitboard
+            active->push_back(make_index(perspective, s, pos.piece_on(s), ksq));
+        }
+    }
+
+    // Append to `removed` / `added` the feature indices affected by the pieces listed in the state's dirtyPiece record
+    template <Side AssociatedKing>
+    void HalfKP<AssociatedKing>::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
+
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));  // Friend: perspective's own king; Enemy: the other king
+
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];
+
+            if (type_of(pc) == KING)
+                continue;  // kings are not HalfKP features, so a king entry changes no index here
+
+            if (dp.from[i] != SQ_NONE)  // the piece has an origin square: its old feature goes away
+                removed->push_back(make_index(perspective, dp.from[i], pc, ksq));
+
+            if (dp.to[i] != SQ_NONE)  // the piece has a destination square: its new feature appears
+                added->push_back(make_index(perspective, dp.to[i], pc, ksq));
+        }
+    }
+
+    template class HalfKP<Side::kFriend>;  // explicit instantiations: the member definitions live in this .cpp
+    template class HalfKP<Side::kEnemy>;
 
 }  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/half_kp.h b/src/nnue/features/half_kp.h
index ee6a8df3..4a4329e8 100644
--- a/src/nnue/features/half_kp.h
+++ b/src/nnue/features/half_kp.h
@@ -1,62 +1,74 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
-//Definition of input features HalfKP of NNUE evaluation function
-
 #ifndef NNUE_FEATURES_HALF_KP_H_INCLUDED
 #define NNUE_FEATURES_HALF_KP_H_INCLUDED
 
-#include "../../evaluate.h"
 #include "features_common.h"
 
+#include "evaluate.h"
+
+//Definition of input features HalfKP of NNUE evaluation function
 namespace Eval::NNUE::Features {
 
-  // Feature HalfKP: Combination of the position of own king
-  // and the position of pieces other than kings
-  template <Side AssociatedKing>
-  class HalfKP {
+    // Feature HalfKP: Combination of the position of the associated king (own or enemy)
+    // and the position of pieces other than kings
+    template <Side AssociatedKing>
+    class HalfKP {
 
-   public:
-    // Feature name
-    static constexpr const char* kName = "HalfKP(Friend)";
-    // Hash value embedded in the evaluation file
-    static constexpr std::uint32_t kHashValue =
-        0x5D69D5B9u ^ (AssociatedKing == Side::kFriend);
-    // Number of feature dimensions
-    static constexpr IndexType kDimensions =
-        static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_END);
-    // Maximum number of simultaneously active features
-    static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
-    // Trigger for full calculation instead of difference calculation
-    static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kFriendKingMoved;
+    public:
+        // Feature name
+        static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
+            "HalfKP(Friend)" : "HalfKP(Enemy)";
 
-    // Get a list of indices for active features
-    static void AppendActiveIndices(const Position& pos, Color perspective,
-                                    IndexList* active);
+        // Hash value embedded in the evaluation file
+        static constexpr std::uint32_t kHashValue =
+            0x5D69D5B9u ^ (AssociatedKing == Side::kFriend);
 
-    // Get a list of indices for recently changed features
-    static void AppendChangedIndices(const Position& pos, Color perspective,
-                                     IndexList* removed, IndexList* added);
+        // Number of feature dimensions
+        static constexpr IndexType kDimensions =
+            static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_END);
 
-   private:
-    // Index of a feature for a given king position and another piece on some square
-    static IndexType MakeIndex(Color perspective, Square s, Piece pc, Square sq_k);
-  };
+        // Maximum number of simultaneously active features
+        static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
+
+        // Trigger for full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger =
+            (AssociatedKing == Side::kFriend) ?
+            TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+
+        // Get a list of indices for active features
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
+
+        // Get a list of indices for recently changed features
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+
+    private:
+        // Index of a feature for a given king position and another piece on some square
+        static IndexType make_index(Color perspective, Square s, Piece pc, Square sq_k);
+    };
 
 }  // namespace Eval::NNUE::Features
 
diff --git a/src/nnue/features/half_relative_ka.cpp b/src/nnue/features/half_relative_ka.cpp
new file mode 100644
index 00000000..d2ad31e6
--- /dev/null
+++ b/src/nnue/features/half_relative_ka.cpp
@@ -0,0 +1,90 @@
+﻿#include "half_relative_ka.h"
+#include "index_list.h"
+
+//Definition of input features HalfRelativeKA of NNUE evaluation function
+namespace Eval::NNUE::Features {
+
+    // Orient a square according to perspective (rotate the board 180° for black).
+    // Important note for "halfka": this arch was designed with "flip" in mind,
+    // although it is still untested which approach is better.
+    // The rotation has to stay until we find a better arch that works with "flip";
+    // it lets us use the current master net for gensfen (primarily needed for higher quality data).
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * 63));  // XOR with 63 mirrors rank and file; XOR with 0 leaves s unchanged
+    }
+
+    // Find the feature index from the king square (sq_k) and a piece (pc) on square (s), seen from `perspective`
+    template <Side AssociatedKing>
+    inline IndexType HalfRelativeKA<AssociatedKing>::make_index(
+        Color perspective,
+        Square s,
+        Piece pc,
+        Square sq_k) {
+
+        const IndexType p = IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);  // PieceSquare index of the piece
+        return make_index(sq_k, p);
+    }
+
+    // Find the feature index from the king square (sq_k) and a PieceSquare index (p)
+    template <Side AssociatedKing>
+    inline IndexType HalfRelativeKA<AssociatedKing>::make_index(
+        Square sq_k,
+        IndexType p) {
+
+        constexpr IndexType W = kBoardWidth;
+        constexpr IndexType H = kBoardHeight;
+        const IndexType piece_index = (p - PS_W_PAWN) / SQUARE_NB;  // which SQUARE_NB-sized block of PieceSquare p falls in
+        const Square sq_p = static_cast<Square>((p - PS_W_PAWN) % SQUARE_NB);  // square encoded in p
+        const IndexType relative_file = file_of(sq_p) - file_of(sq_k) + (W / 2);  // shifted by W / 2 so the king sits in the middle file
+        const IndexType relative_rank = rank_of(sq_p) - rank_of(sq_k) + (H / 2);  // shifted by H / 2 so the king sits in the middle rank
+        return H * W * piece_index + H * relative_file + relative_rank;  // layout: [piece_index][relative_file][relative_rank]
+    }
+
+    // Append to `active` one feature index per piece on the board (kings included), relative to the associated king
+    template <Side AssociatedKing>
+    void HalfRelativeKA<AssociatedKing>::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
+
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));  // Friend: perspective's own king; Enemy: the other king
+
+        Bitboard bb = pos.pieces();  // all pieces; kings are not masked out here
+        while (bb) {
+            Square s = pop_lsb(&bb);  // take the next occupied square out of the bitboard
+            active->push_back(make_index(perspective, s, pos.piece_on(s), ksq));
+        }
+    }
+
+    // Append to `removed` / `added` the feature indices affected by the pieces listed in the state's dirtyPiece record
+    template <Side AssociatedKing>
+    void HalfRelativeKA<AssociatedKing>::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
+
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));  // Friend: perspective's own king; Enemy: the other king
+
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];  // no king filter: every dirty piece, kings included, is processed
+
+            if (dp.from[i] != SQ_NONE)  // the piece has an origin square: its old feature goes away
+                removed->push_back(make_index(perspective, dp.from[i], pc, ksq));
+
+            if (dp.to[i] != SQ_NONE)  // the piece has a destination square: its new feature appears
+                added->push_back(make_index(perspective, dp.to[i], pc, ksq));
+        }
+    }
+
+    template class HalfRelativeKA<Side::kFriend>;  // explicit instantiations: the member definitions live in this .cpp
+    template class HalfRelativeKA<Side::kEnemy>;
+
+}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/half_relative_ka.h b/src/nnue/features/half_relative_ka.h
new file mode 100644
index 00000000..f42661e9
--- /dev/null
+++ b/src/nnue/features/half_relative_ka.h
@@ -0,0 +1,68 @@
+﻿#ifndef _NNUE_FEATURES_HALF_RELATIVE_KA_H_
+#define _NNUE_FEATURES_HALF_RELATIVE_KA_H_
+
+#include "features_common.h"
+
+#include "evaluate.h"
+
+// Definition of input features HalfRelativeKA of NNUE evaluation function
+// K - King
+// A - Any piece
+// KA - product of K and A
+namespace Eval::NNUE::Features {
+
+    // Feature HalfRelativeKA: Position of each piece (any piece, see "A" above) relative to the own king or the enemy king
+    template <Side AssociatedKing>
+    class HalfRelativeKA {
+    public:
+        // Feature name; tells which king (friend or enemy) the feature is associated with
+        static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
+            "HalfRelativeKA(Friend)" : "HalfRelativeKA(Enemy)";
+
+        // Hash value embedded in the evaluation function file; the lowest bit is flipped for the Friend variant
+        static constexpr std::uint32_t kHashValue =
+            0xA123051Fu ^ (AssociatedKing == Side::kFriend);
+
+        static constexpr IndexType kNumPieceKinds = 6 * 2;  // presumably 6 piece types x 2 colors, kings included
+
+        // Width of the virtual board with the king in the center
+        static constexpr IndexType kBoardWidth = FILE_NB * 2 - 1;
+
+        // Height of the virtual board with the king in the center
+        static constexpr IndexType kBoardHeight = RANK_NB * 2 - 1;
+
+        // Number of feature dimensions: one slot per piece kind and per square of the virtual board
+        static constexpr IndexType kDimensions =
+            kNumPieceKinds * kBoardHeight * kBoardWidth;
+
+        // Maximum number of feature indices that can be active (value 1) at the same time
+        static constexpr IndexType kMaxActiveDimensions = 32;
+
+        // Trigger for full calculation instead of difference calculation: a move of the associated king
+        static constexpr TriggerEvent kRefreshTrigger =
+            (AssociatedKing == Side::kFriend) ?
+            TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+
+        // Append to `active` the indices of the features whose value is 1 in `pos`, seen from `perspective`
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
+
+        // Append to `removed` / `added` the indices of the features whose value changed since the previous position
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+
+        // Find the feature index from the king square (first argument) and a PieceSquare index (p)
+        static IndexType make_index(Square s, IndexType p);
+
+        // Find the feature index from the king square (sq_k) and a piece (pc) on square (s)
+        static IndexType make_index(Color perspective, Square s, Piece pc, Square sq_k);
+    };
+
+}  // namespace Eval::NNUE::Features
+
+#endif // #ifndef _NNUE_FEATURES_HALF_RELATIVE_KA_H_
diff --git a/src/nnue/features/half_relative_kp.cpp b/src/nnue/features/half_relative_kp.cpp
index 015ecb73..2ebccd59 100644
--- a/src/nnue/features/half_relative_kp.cpp
+++ b/src/nnue/features/half_relative_kp.cpp
@@ -1,78 +1,91 @@
-﻿//Definition of input features HalfRelativeKP of NNUE evaluation function
-
-#if defined(EVAL_NNUE)
-
-#include "half_relative_kp.h"
+﻿#include "half_relative_kp.h"
 #include "index_list.h"
 
-namespace Eval {
+//Definition of input features HalfRelativeKP of NNUE evaluation function
+namespace Eval::NNUE::Features {
 
-namespace NNUE {
+    // Orient a square according to perspective (rotate the board 180° for black).
+    // The rotation has to stay until we find a better arch that works with "flip";
+    // it lets us use the current master net for gensfen (primarily needed for higher quality data).
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * 63));
+    }
 
-namespace Features {
+    // Find the index of the feature quantity from the king position and PieceSquare
+    template <Side AssociatedKing>
+    inline IndexType HalfRelativeKP<AssociatedKing>::make_index(
+        Color perspective,
+        Square s,
+        Piece pc,
+        Square sq_k) {
 
-// Orient a square according to perspective (rotates by 180 for black)
-inline Square orient(Color perspective, Square s) {
-  return Square(int(s) ^ (bool(perspective) * 63));
-}
+        const IndexType p = IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
+        return make_index(sq_k, p);
+    }
 
-// Find the index of the feature quantity from the ball position and PieceSquare
-template <Side AssociatedKing>
-inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
-  Color perspective, Square s, Piece pc, Square sq_k) {
-  const IndexType p = IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
-  return MakeIndex(sq_k, p);
-}
+    // Find the index of the feature quantity from the king position and PieceSquare
+    template <Side AssociatedKing>
+    inline IndexType HalfRelativeKP<AssociatedKing>::make_index(
+        Square sq_k,
+        IndexType p) {
 
-// Find the index of the feature quantity from the ball position and PieceSquare
-template <Side AssociatedKing>
-inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
-    Square sq_k, IndexType p) {
-  constexpr IndexType W = kBoardWidth;
-  constexpr IndexType H = kBoardHeight;
-  const IndexType piece_index = (p - PS_W_PAWN) / SQUARE_NB;
-  const Square sq_p = static_cast<Square>((p - PS_W_PAWN) % SQUARE_NB);
-  const IndexType relative_file = file_of(sq_p) - file_of(sq_k) + (W / 2);
-  const IndexType relative_rank = rank_of(sq_p) - rank_of(sq_k) + (H / 2);
-  return H * W * piece_index + H * relative_file + relative_rank;
-}
+        constexpr IndexType W = kBoardWidth;
+        constexpr IndexType H = kBoardHeight;
+        const IndexType piece_index = (p - PS_W_PAWN) / SQUARE_NB;
+        const Square sq_p = static_cast<Square>((p - PS_W_PAWN) % SQUARE_NB);
+        const IndexType relative_file = file_of(sq_p) - file_of(sq_k) + (W / 2);
+        const IndexType relative_rank = rank_of(sq_p) - rank_of(sq_k) + (H / 2);
+        return H * W * piece_index + H * relative_file + relative_rank;
+    }
 
-// Get a list of indices with a value of 1 among the features
-template <Side AssociatedKing>
-void HalfRelativeKP<AssociatedKing>::AppendActiveIndices(
-    const Position& pos, Color perspective, IndexList* active) {
-  Square ksq = orient(perspective, pos.square<KING>(perspective));
-  Bitboard bb = pos.pieces() & ~pos.pieces(KING);
-  while (bb) {
-    Square s = pop_lsb(&bb);
-    active->push_back(MakeIndex(perspective, s, pos.piece_on(s), ksq));
-  }
-}
+    // Get a list of indices with a value of 1 among the features
+    template <Side AssociatedKing>
+    void HalfRelativeKP<AssociatedKing>::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
 
-// Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-template <Side AssociatedKing>
-void HalfRelativeKP<AssociatedKing>::AppendChangedIndices(
-    const Position& pos, Color perspective,
-    IndexList* removed, IndexList* added) {
-  Square ksq = orient(perspective, pos.square<KING>(perspective));
-  const auto& dp = pos.state()->dirtyPiece;
-  for (int i = 0; i < dp.dirty_num; ++i) {
-    Piece pc = dp.piece[i];
-    if (type_of(pc) == KING) continue;
-    if (dp.from[i] != SQ_NONE)
-      removed->push_back(MakeIndex(perspective, dp.from[i], pc, ksq));
-    if (dp.to[i] != SQ_NONE)
-      added->push_back(MakeIndex(perspective, dp.to[i], pc, ksq));
-  }
-}
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));
 
-template class HalfRelativeKP<Side::kFriend>;
-template class HalfRelativeKP<Side::kEnemy>;
+        Bitboard bb = pos.pieces() & ~pos.pieces(KING);
+        while (bb) {
+            Square s = pop_lsb(&bb);
+            active->push_back(make_index(perspective, s, pos.piece_on(s), ksq));
+        }
+    }
 
-}  // namespace Features
+    // Get a list of indices whose values have changed from the previous one in the feature quantity
+    template <Side AssociatedKing>
+    void HalfRelativeKP<AssociatedKing>::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
 
-}  // namespace NNUE
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));
 
-}  // namespace Eval
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];
 
-#endif  // defined(EVAL_NNUE)
+            if (type_of(pc) == KING)
+                continue;
+
+            if (dp.from[i] != SQ_NONE)
+                removed->push_back(make_index(perspective, dp.from[i], pc, ksq));
+
+            if (dp.to[i] != SQ_NONE)
+                added->push_back(make_index(perspective, dp.to[i], pc, ksq));
+        }
+    }
+
+    template class HalfRelativeKP<Side::kFriend>;
+    template class HalfRelativeKP<Side::kEnemy>;
+
+}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/half_relative_kp.h b/src/nnue/features/half_relative_kp.h
index 2d4182e4..590a01a3 100644
--- a/src/nnue/features/half_relative_kp.h
+++ b/src/nnue/features/half_relative_kp.h
@@ -1,65 +1,66 @@
-﻿//Definition of input features HalfRelativeKP of NNUE evaluation function
-
-#ifndef _NNUE_FEATURES_HALF_RELATIVE_KP_H_
+﻿#ifndef _NNUE_FEATURES_HALF_RELATIVE_KP_H_
 #define _NNUE_FEATURES_HALF_RELATIVE_KP_H_
 
-#if defined(EVAL_NNUE)
-
-#include "../../evaluate.h"
 #include "features_common.h"
 
-namespace Eval {
+#include "evaluate.h"
 
-namespace NNUE {
+//Definition of input features HalfRelativeKP of NNUE evaluation function
+namespace Eval::NNUE::Features {
 
-namespace Features {
+    // Feature HalfRelativeKP: Relative position of each piece other than the kings, based on the own king or the enemy king
+    template <Side AssociatedKing>
+    class HalfRelativeKP {
+    public:
+        // feature quantity name
+        static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
+            "HalfRelativeKP(Friend)" : "HalfRelativeKP(Enemy)";
 
-// Feature HalfRelativeKP: Relative position of each piece other than the ball based on own ball or enemy ball
-template <Side AssociatedKing>
-class HalfRelativeKP {
- public:
-  // feature quantity name
-  static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
-      "HalfRelativeKP(Friend)" : "HalfRelativeKP(Enemy)";
-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t kHashValue =
-      0xF9180919u ^ (AssociatedKing == Side::kFriend);
-  // Piece type excluding balls
-  static constexpr IndexType kNumPieceKinds = 5 * 2;
-  // width of the virtual board with the ball in the center
-  static constexpr IndexType kBoardWidth = FILE_NB * 2 - 1;
-  // height of a virtual board with balls in the center
-  static constexpr IndexType kBoardHeight = RANK_NB * 2 - 1;
-  // number of feature dimensions
-  static constexpr IndexType kDimensions =
-      kNumPieceKinds * kBoardHeight * kBoardWidth;
-  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-  static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
-  // Timing of full calculation instead of difference calculation
-  static constexpr TriggerEvent kRefreshTrigger =
-      (AssociatedKing == Side::kFriend) ?
-      TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue =
+            0xF9180919u ^ (AssociatedKing == Side::kFriend);
 
-  // Get a list of indices with a value of 1 among the features
-  static void AppendActiveIndices(const Position& pos, Color perspective,
-                                  IndexList* active);
+        // Number of piece kinds, excluding kings
+        static constexpr IndexType kNumPieceKinds = 5 * 2;
 
-  // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-  static void AppendChangedIndices(const Position& pos, Color perspective,
-                                   IndexList* removed, IndexList* added);
+        // width of the virtual board with the king in the center
+        static constexpr IndexType kBoardWidth = FILE_NB * 2 - 1;
 
-  // Find the index of the feature quantity from the ball position and PieceSquare
-  static IndexType MakeIndex(Square s, IndexType p);
-  // Find the index of the feature quantity from the ball position and PieceSquare
-  static IndexType MakeIndex(Color perspective, Square s, Piece pc, Square sq_k);
-};
+        // height of the virtual board with the king in the center
+        static constexpr IndexType kBoardHeight = RANK_NB * 2 - 1;
 
-}  // namespace Features
+        // number of feature dimensions
+        static constexpr IndexType kDimensions =
+            kNumPieceKinds * kBoardHeight * kBoardWidth;
 
-}  // namespace NNUE
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
 
-}  // namespace Eval
+        // Timing of full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger =
+            (AssociatedKing == Side::kFriend) ?
+            TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
 
-#endif  // defined(EVAL_NNUE)
+        // Get a list of indices with a value of 1 among the features
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
+
+        // Get a list of indices whose values have changed from the previous one in the feature quantity
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+
+        // Find the index of the feature quantity from the king position and PieceSquare
+        static IndexType make_index(Square s, IndexType p);
+
+        // Find the index of the feature quantity from the king position and PieceSquare
+        static IndexType make_index(Color perspective, Square s, Piece pc, Square sq_k);
+    };
+
+}  // namespace Eval::NNUE::Features
 
 #endif
diff --git a/src/nnue/features/k.cpp b/src/nnue/features/k.cpp
index 314b1338..7b62a75a 100644
--- a/src/nnue/features/k.cpp
+++ b/src/nnue/features/k.cpp
@@ -1,58 +1,45 @@
-﻿//Definition of input feature quantity K of NNUE evaluation function
-
-#if defined(EVAL_NNUE)
-
-#include "k.h"
+﻿#include "k.h"
 #include "index_list.h"
 
-namespace Eval {
+//Definition of input feature quantity K of NNUE evaluation function
+namespace Eval::NNUE::Features {
 
-namespace NNUE {
+    // Orient a square according to perspective (rotate the board 180° for black).
+    // The rotation has to stay until we find a better arch that works with "flip";
+    // it lets us use the current master net for gensfen (primarily needed for higher quality data).
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * 63));
+    }
 
-namespace Features {
+    // Index of a feature for a given king position.
+    IndexType K::make_index(Color perspective, Square s, Color king_color) {
+        return IndexType(orient(perspective, s) + bool(perspective ^ king_color) * 64);
+    }
 
-// Orient a square according to perspective (rotates by 180 for black)
-inline Square orient(Color perspective, Square s) {
-  return Square(int(s) ^ (bool(perspective) * 63));
-}
+    // Get a list of indices with a value of 1 among the features
+    void K::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
 
-// Index of a feature for a given king position.
-IndexType K::MakeIndex(Color perspective, Square s, Color king_color) {
-  return IndexType(orient(perspective, s) + bool(perspective ^ king_color) * 64);
-}
+        for (auto color : Colors) {
+          active->push_back(make_index(perspective, pos.square<KING>(color), color));
+        }
+    }
 
-// Get a list of indices with a value of 1 among the features
-void K::AppendActiveIndices(
-    const Position& pos, Color perspective, IndexList* active) {
-  for (auto color : Colors) {
-    active->push_back(MakeIndex(perspective, pos.square<KING>(color), color));
-  }
-}
+    // Get a list of indices whose values have changed from the previous one in the feature quantity
+    void K::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
 
-// Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-void K::AppendChangedIndices(
-    const Position& pos, Color perspective,
-    IndexList* removed, IndexList* added) {
-  const auto& dp = pos.state()->dirtyPiece;
-  Color king_color;
-  if (dp.piece[0] == Piece::W_KING) {
-    king_color = WHITE;
-  }
-  else if (dp.piece[0] == Piece::B_KING) {
-    king_color = BLACK;
-  }
-  else {
-    return;
-  }
+        const auto& dp = pos.state()->dirtyPiece;
+        if (type_of(dp.piece[0]) == KING)
+        {
+            removed->push_back(make_index(perspective, dp.from[0], color_of(dp.piece[0])));
+            added->push_back(make_index(perspective, dp.to[0], color_of(dp.piece[0])));
+        }
+    }
 
-  removed->push_back(MakeIndex(perspective, dp.from[0], king_color));
-  added->push_back(MakeIndex(perspective, dp.to[0], king_color));
-}
-
-}  // namespace Features
-
-}  // namespace NNUE
-
-}  // namespace Eval
-
-#endif  // defined(EVAL_NNUE)
+}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/k.h b/src/nnue/features/k.h
index 0c394f4e..928d77de 100644
--- a/src/nnue/features/k.h
+++ b/src/nnue/features/k.h
@@ -1,52 +1,49 @@
-﻿//Definition of input feature quantity K of NNUE evaluation function
-
-#ifndef _NNUE_FEATURES_K_H_
+﻿#ifndef _NNUE_FEATURES_K_H_
 #define _NNUE_FEATURES_K_H_
 
-#if defined(EVAL_NNUE)
-
-#include "../../evaluate.h"
 #include "features_common.h"
 
-namespace Eval {
+#include "evaluate.h"
 
-namespace NNUE {
+//Definition of input feature quantity K of NNUE evaluation function
+namespace Eval::NNUE::Features {
 
-namespace Features {
+    // Feature K: King position
+    class K {
+    public:
+        // feature quantity name
+        static constexpr const char* kName = "K";
 
-// Feature K: Ball position
-class K {
- public:
-  // feature quantity name
-  static constexpr const char* kName = "K";
-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t kHashValue = 0xD3CEE169u;
-  // number of feature dimensions
-  static constexpr IndexType kDimensions = SQUARE_NB * 2;
-  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-  static constexpr IndexType kMaxActiveDimensions = 2;
-  // Timing of full calculation instead of difference calculation
-  static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue = 0xD3CEE169u;
 
-  // Get a list of indices with a value of 1 among the features
-  static void AppendActiveIndices(const Position& pos, Color perspective,
-                                  IndexList* active);
+        // number of feature dimensions
+        static constexpr IndexType kDimensions = SQUARE_NB * 2;
 
-  // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-  static void AppendChangedIndices(const Position& pos, Color perspective,
-                                   IndexList* removed, IndexList* added);
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions = 2;
 
-private:
-  // Index of a feature for a given king position.
-  static IndexType MakeIndex(Color perspective, Square s, Color king_color);
-};
+        // Timing of full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
-}  // namespace Features
+        // Get a list of indices with a value of 1 among the features
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
 
-}  // namespace NNUE
+        // Get a list of indices whose values have changed from the previous one in the feature quantity
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
 
-}  // namespace Eval
+    private:
+        // Index of a feature for a given king position.
+        static IndexType make_index(Color perspective, Square s, Color king_color);
+    };
 
-#endif  // defined(EVAL_NNUE)
+}  // namespace Eval::NNUE::Features
 
 #endif
diff --git a/src/nnue/features/p.cpp b/src/nnue/features/p.cpp
index b4a6faf9..a17e304f 100644
--- a/src/nnue/features/p.cpp
+++ b/src/nnue/features/p.cpp
@@ -1,56 +1,55 @@
-﻿//Definition of input feature P of NNUE evaluation function
-
-#if defined(EVAL_NNUE)
-
-#include "p.h"
+﻿#include "p.h"
 #include "index_list.h"
 
-namespace Eval {
+//Definition of input feature P of NNUE evaluation function
+namespace Eval::NNUE::Features {
 
-namespace NNUE {
+    // Orient a square according to perspective (XOR with 63 rotates the board 180 degrees for black).
+    // Kept until a better arch that works with "flip" is found; lets gensfen use the current master net.
+    inline Square orient(Color perspective, Square s) {
+        const int rotation_mask = bool(perspective) ? 63 : 0;
+        return Square(int(s) ^ rotation_mask);
+    }
 
-namespace Features {
+    // Index of the feature for piece pc standing on square s, as seen from perspective
+    inline IndexType P::make_index(Color perspective, Square s, Piece pc) {
+        const auto piece_base = kpp_board_index[pc][perspective];
+        return IndexType(orient(perspective, s) + piece_base);
+    }
 
-// Orient a square according to perspective (rotates by 180 for black)
-inline Square orient(Color perspective, Square s) {
-  return Square(int(s) ^ (bool(perspective) * 63));
-}
+    // Get a list of indices with a value of 1 among the features
+    void P::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
 
-// Find the index of the feature quantity from the king position and PieceSquare
-inline IndexType P::MakeIndex(
-  Color perspective, Square s, Piece pc) {
-  return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
-}
+        Bitboard bb = pos.pieces() & ~pos.pieces(KING);
+        while (bb) {
+            Square s = pop_lsb(&bb);
+            active->push_back(make_index(perspective, s, pos.piece_on(s)));
+        }
+    }
 
-// Get a list of indices with a value of 1 among the features
-void P::AppendActiveIndices(
-    const Position& pos, Color perspective, IndexList* active) {
-  Bitboard bb = pos.pieces() & ~pos.pieces(KING);
-  while (bb) {
-    Square s = pop_lsb(&bb);
-    active->push_back(MakeIndex(perspective, s, pos.piece_on(s)));
-  }
-}
+    // Get a list of indices whose values have changed from the previous one in the feature quantity
+    void P::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
 
-// Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-void P::AppendChangedIndices(
-    const Position& pos, Color perspective,
-    IndexList* removed, IndexList* added) {
-  const auto& dp = pos.state()->dirtyPiece;
-  for (int i = 0; i < dp.dirty_num; ++i) {
-    Piece pc = dp.piece[i];
-    if (type_of(pc) == KING) continue;
-    if (dp.from[i] != SQ_NONE)
-      removed->push_back(MakeIndex(perspective, dp.from[i], pc));
-    if (dp.to[i] != SQ_NONE)
-      added->push_back(MakeIndex(perspective, dp.to[i], pc));
-  }
-}
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];
 
-}  // namespace Features
+            if (type_of(pc) == KING)
+              continue;
 
-}  // namespace NNUE
+            if (dp.from[i] != SQ_NONE)
+              removed->push_back(make_index(perspective, dp.from[i], pc));
 
-}  // namespace Eval
+            if (dp.to[i] != SQ_NONE)
+              added->push_back(make_index(perspective, dp.to[i], pc));
+        }
+    }
 
-#endif  // defined(EVAL_NNUE)
+}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/p.h b/src/nnue/features/p.h
index b3d4191e..d461086b 100644
--- a/src/nnue/features/p.h
+++ b/src/nnue/features/p.h
@@ -1,52 +1,49 @@
-﻿//Definition of input feature P of NNUE evaluation function
-
-#ifndef _NNUE_FEATURES_P_H_
+﻿#ifndef _NNUE_FEATURES_P_H_
 #define _NNUE_FEATURES_P_H_
 
-#if defined(EVAL_NNUE)
-
-#include "../../evaluate.h"
 #include "features_common.h"
 
-namespace Eval {
+#include "evaluate.h"
 
-namespace NNUE {
+//Definition of input feature P of NNUE evaluation function
+namespace Eval::NNUE::Features {
 
-namespace Features {
+    // Feature P: PieceSquare of pieces other than kings
+    class P {
+    public:
+        // feature quantity name
+        static constexpr const char* kName = "P";
 
-// Feature P: PieceSquare of pieces other than balls
-class P {
- public:
-  // feature quantity name
-  static constexpr const char* kName = "P";
-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t kHashValue = 0x764CFB4Bu;
-  // number of feature dimensions
-  static constexpr IndexType kDimensions = PS_END;
-  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-  static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
-  // Timing of full calculation instead of difference calculation
-  static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue = 0x764CFB4Bu;
 
-  // Get a list of indices with a value of 1 among the features
-  static void AppendActiveIndices(const Position& pos, Color perspective,
-                                  IndexList* active);
+        // number of feature dimensions
+        static constexpr IndexType kDimensions = PS_END;
 
-  // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-  static void AppendChangedIndices(const Position& pos, Color perspective,
-                                   IndexList* removed, IndexList* added);
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
 
- private:
-  // Index of a feature for a given piece on some square
-  static IndexType MakeIndex(Color perspective, Square s, Piece pc);
-};
+        // Timing of full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
-}  // namespace Features
+        // Get a list of indices with a value of 1 among the features
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
 
-}  // namespace NNUE
+        // Get a list of indices whose values have changed from the previous one in the feature quantity
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
 
-}  // namespace Eval
+    private:
+        // Index of a feature for a given piece on some square
+        static IndexType make_index(Color perspective, Square s, Piece pc);
+    };
 
-#endif  // defined(EVAL_NNUE)
+}  // namespace Eval::NNUE::Features
 
 #endif
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index f24578a8..1f2ff7c5 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -24,6 +24,10 @@
 #include <iostream>
 #include "../nnue_common.h"
 
+#include <string>
+#include <type_traits>
+#include <cstdint>
+
 namespace Eval::NNUE::Layers {
 
   // Affine transformation layer
@@ -50,6 +54,8 @@ namespace Eval::NNUE::Layers {
     static constexpr std::size_t kBufferSize =
         PreviousLayer::kBufferSize + kSelfBufferSize;
 
+    static constexpr int kLayerIndex = PreviousLayer::kLayerIndex + 1;
+
     // Hash value embedded in the evaluation file
     static constexpr std::uint32_t GetHashValue() {
       std::uint32_t hash_value = 0xCC03DAE4u;
@@ -59,14 +65,27 @@ namespace Eval::NNUE::Layers {
       return hash_value;
     }
 
-    // A string that represents the structure from the input layer to this layer
-    static std::string GetStructureString() {
-      return "AffineTransform[" +
-        std::to_string(kOutputDimensions) + "<-" +
-        std::to_string(kInputDimensions) + "](" +
-        PreviousLayer::GetStructureString() + ")";
+    // Name of this layer on its own: "AffineTransform[<outputs><-<inputs>]"
+    static std::string get_name() {
+        const std::string dims = std::to_string(kOutputDimensions) + "<-" + std::to_string(kInputDimensions);
+        return "AffineTransform[" + dims + "]";
     }
-    
+
+    // Describes the network structure from the input layer up to and including this layer
+    static std::string get_structure_string() {
+        const std::string inner = PreviousLayer::get_structure_string();
+        return get_name() + "(" + inner + ")";
+    }
+
+    // Listing of all layers so far: the previous layers' listing followed by
+    // one "\n  - <index> - <name>" entry for this layer.
+    static std::string get_layers_info() {
+        const std::string entry =
+            "\n  - " + std::to_string(kLayerIndex) + " - " + get_name();
+
+        return PreviousLayer::get_layers_info() + entry;
+    }
+
    // Read network parameters
     bool ReadParameters(std::istream& stream) {
       if (!previous_layer_.ReadParameters(stream)) return false;
@@ -79,13 +98,17 @@ namespace Eval::NNUE::Layers {
 
     // write parameters
     bool WriteParameters(std::ostream& stream) const {
-      if (!previous_layer_.WriteParameters(stream)) return false;
-      stream.write(reinterpret_cast<const char*>(biases_),
-        kOutputDimensions * sizeof(BiasType));
-      stream.write(reinterpret_cast<const char*>(weights_),
-        kOutputDimensions * kPaddedInputDimensions *
-        sizeof(WeightType));
-      return !stream.fail();
+        if (!previous_layer_.WriteParameters(stream))
+            return false;
+
+        stream.write(reinterpret_cast<const char*>(biases_),
+            kOutputDimensions * sizeof(BiasType));
+
+        stream.write(reinterpret_cast<const char*>(weights_),
+            kOutputDimensions * kPaddedInputDimensions *
+            sizeof(WeightType));
+
+        return !stream.fail();
     }
 
     // Forward propagation
@@ -93,113 +116,606 @@ namespace Eval::NNUE::Layers {
         const TransformedFeatureType* transformed_features, char* buffer) const {
       const auto input = previous_layer_.Propagate(
           transformed_features, buffer + kSelfBufferSize);
+
+#if defined (USE_AVX512)
+
+      [[maybe_unused]] const __m512i kOnes512 = _mm512_set1_epi16(1);
+
+      [[maybe_unused]] auto m512_hadd = [](__m512i sum, int bias) -> int {
+        return _mm512_reduce_add_epi32(sum) + bias;
+      };
+
+      // This function takes
+      //   sum0 = [xmm0a, xmm0b, xmm0c, xmm0d]
+      //   sum1 = [xmm1a, xmm1b, xmm1c, xmm1d]
+      //   sum2 = [xmm2a, xmm2b, xmm2c, xmm2d]
+      //   sum3 = [xmm3a, xmm3b, xmm3c, xmm3d]
+      // and returns
+      //   ret = [
+      //     reduce_add_epi32(xmm0a), reduce_add_epi32(xmm1a), reduce_add_epi32(xmm2a), reduce_add_epi32(xmm3a),
+      //     reduce_add_epi32(xmm0b), reduce_add_epi32(xmm1b), reduce_add_epi32(xmm2b), reduce_add_epi32(xmm3b),
+      //     reduce_add_epi32(xmm0c), reduce_add_epi32(xmm1c), reduce_add_epi32(xmm2c), reduce_add_epi32(xmm3c),
+      //     reduce_add_epi32(xmm0d), reduce_add_epi32(xmm1d), reduce_add_epi32(xmm2d), reduce_add_epi32(xmm3d)
+      //   ]
+      [[maybe_unused]] auto m512_hadd128x16_interleave = [](
+        __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3) -> __m512i {
+
+        __m512i sum01a = _mm512_unpacklo_epi32(sum0, sum1);
+        __m512i sum01b = _mm512_unpackhi_epi32(sum0, sum1);
+
+        __m512i sum23a = _mm512_unpacklo_epi32(sum2, sum3);
+        __m512i sum23b = _mm512_unpackhi_epi32(sum2, sum3);
+
+        __m512i sum01 = _mm512_add_epi32(sum01a, sum01b);
+        __m512i sum23 = _mm512_add_epi32(sum23a, sum23b);
+
+        __m512i sum0123a = _mm512_unpacklo_epi64(sum01, sum23);
+        __m512i sum0123b = _mm512_unpackhi_epi64(sum01, sum23);
+
+        return _mm512_add_epi32(sum0123a, sum0123b);
+      };
+
+      [[maybe_unused]] auto m512_haddx4 = [m512_hadd128x16_interleave](
+        __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, __m128i bias) -> __m128i {
+
+        __m512i sum = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3);
+
+        __m256i sum256lo = _mm512_castsi512_si256(sum);
+        __m256i sum256hi = _mm512_extracti64x4_epi64(sum, 1);
+
+        sum256lo = _mm256_add_epi32(sum256lo, sum256hi);
+
+        __m128i sum128lo = _mm256_castsi256_si128(sum256lo);
+        __m128i sum128hi = _mm256_extracti128_si256(sum256lo, 1);
+
+        return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias);
+      };
+
+      [[maybe_unused]] auto m512_haddx8 = [m512_hadd128x16_interleave](
+        __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3,
+        __m512i sum4, __m512i sum5, __m512i sum6, __m512i sum7, __m256i bias) -> __m256i {
+
+        __m512i suma = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3);
+        __m512i sumb = m512_hadd128x16_interleave(sum4, sum5, sum6, sum7);
+
+        __m512i indices0 = _mm512_setr_epi64(0, 1, 8, 9, 4, 5, 12, 13);
+        __m512i indices1 = _mm512_setr_epi64(2, 3, 10, 11, 6, 7, 14, 15);
+        __m512i x = _mm512_add_epi32(
+          _mm512_permutex2var_epi64(suma, indices0, sumb),
+          _mm512_permutex2var_epi64(suma, indices1, sumb));
+
+        __m256i sum256lo = _mm512_castsi512_si256(x);
+        __m256i sum256hi = _mm512_extracti64x4_epi64(x, 1);
+
+        return _mm256_add_epi32(_mm256_add_epi32(sum256lo, sum256hi), bias);
+      };
+
+      [[maybe_unused]] auto m512_hadd256x8 =[m512_hadd128x16_interleave](
+        __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, __m256i bias) -> __m256i {
+
+        __m512i sum = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3);
+
+        __m512i indices = _mm512_setr_epi32(
+          0, 4, 8, 12, 2, 6, 10, 14,
+          1, 5, 9, 13, 3, 7, 11, 15);
+        sum = _mm512_permutexvar_epi32(indices, sum);
+
+        __m256i sum256lo = _mm512_castsi512_si256(sum);
+        __m256i sum256hi = _mm512_extracti64x4_epi64(sum, 1);
+
+        return _mm256_add_epi32(_mm256_hadd_epi32(sum256lo, sum256hi), bias);
+      };
+
+      [[maybe_unused]] auto m512_hadd256x16 = [m512_hadd128x16_interleave](
+        __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3,
+        __m512i sum4, __m512i sum5, __m512i sum6, __m512i sum7, __m512i bias) -> __m512i {
+
+        __m512i suma = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3);
+        __m512i sumb = m512_hadd128x16_interleave(sum4, sum5, sum6, sum7);
+
+        __m512i indices0 = _mm512_setr_epi64(0, 1, 8, 9, 4, 5, 12, 13);
+        __m512i indices1 = _mm512_setr_epi64(2, 3, 10, 11, 6, 7, 14, 15);
+        __m512i x = _mm512_add_epi32(
+          _mm512_permutex2var_epi64(suma, indices0, sumb),
+          _mm512_permutex2var_epi64(suma, indices1, sumb));
+
+        __m512i indices = _mm512_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15);
+        return _mm512_add_epi32(_mm512_permutexvar_epi32(indices, x), bias);
+      };
+
+#if defined (USE_VNNI)
+      [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) {
+        acc = _mm512_dpbusd_epi32(acc, a, b);
+#else
+      [[maybe_unused]] auto m512_dpbusd_epi32 = [=](__m512i a, __m512i b) -> __m512i {
+        __m512i product0 = _mm512_maddubs_epi16(a, b);
+        return _mm512_madd_epi16(product0, kOnes512);
+#endif
+      };
+
+#endif
+#if defined (USE_AVX2)
+
+      [[maybe_unused]] const __m256i kOnes256 = _mm256_set1_epi16(1);
+
+      [[maybe_unused]] auto m256_hadd = [](__m256i sum, int bias) -> int {
+        __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
+        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
+        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
+        return _mm_cvtsi128_si32(sum128) + bias;
+      };
+
+      [[maybe_unused]] auto m256_haddx4 = [](__m256i sum0, __m256i sum1, __m256i sum2, __m256i sum3, __m128i bias) -> __m128i {
+        sum0 = _mm256_hadd_epi32(sum0, sum1);
+        sum2 = _mm256_hadd_epi32(sum2, sum3);
+
+        sum0 = _mm256_hadd_epi32(sum0, sum2);
+
+        __m128i sum128lo = _mm256_castsi256_si128(sum0);
+        __m128i sum128hi = _mm256_extracti128_si256(sum0, 1);
+
+        return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias);
+      };
+#if defined (USE_VNNI)
+      [[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) {
+        acc = _mm256_dpbusd_epi32(acc, a, b);
+#else
+      [[maybe_unused]] auto m256_dpbusd_epi32 = [=](__m256i a, __m256i b) -> __m256i {
+        __m256i product0 = _mm256_maddubs_epi16(a, b);
+        return _mm256_madd_epi16(product0, kOnes256);
+#endif
+      };
+
+#endif
+
+#if defined (USE_SSSE3)
+
+      [[maybe_unused]] const __m128i kOnes128 = _mm_set1_epi16(1);
+
+      [[maybe_unused]] auto m128_hadd = [](__m128i sum, int bias) -> int {
+        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
+        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
+        return _mm_cvtsi128_si32(sum) + bias;
+      };
+
+      [[maybe_unused]] auto m128_haddx4 = [](__m128i sum0, __m128i sum1, __m128i sum2, __m128i sum3, __m128i bias) -> __m128i {
+        sum0 = _mm_hadd_epi32(sum0, sum1);
+        sum2 = _mm_hadd_epi32(sum2, sum3);
+
+        sum0 = _mm_hadd_epi32(sum0, sum2);
+
+        return _mm_add_epi32(sum0, bias);
+      };
+
+      [[maybe_unused]] auto m128_dpbusd_epi32 = [=](__m128i a, __m128i b) -> __m128i {
+        __m128i product0 = _mm_maddubs_epi16(a, b);
+        return _mm_madd_epi16(product0, kOnes128);
+      };
+
+#endif
+
+#if defined (USE_AVX512)
+
+      constexpr IndexType kNumChunks512 = kPaddedInputDimensions / (kSimdWidth * 2);
+      constexpr IndexType kNumChunks256 = kPaddedInputDimensions / kSimdWidth;
+
       const auto output = reinterpret_cast<OutputType*>(buffer);
 
-  #if defined(USE_AVX512)
-      constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2);
-      const auto input_vector = reinterpret_cast<const __m512i*>(input);
-  #if !defined(USE_VNNI)
-      const __m512i kOnes = _mm512_set1_epi16(1);
-  #endif
+      // A zmm register holds 64 bytes, so AVX512 cannot be used
+      // for the smaller affine transforms.
+      // Instead we fall back to an AVX2 implementation if
+      // kInputDimensions isn't a multiple of 64.
+      // Note that this means that, for example, for
+      // kInputDimensions of 96 we fall back to AVX2 even though
+      // the first 64 elements could be processed with AVX512.
+      // Handling that case better would mean mixing __m256 and
+      // __m512 variables and handling more cases statically
+      // so as not to lose performance.
+      // This should be revisited if such input dimensions are to be considered.
+      [[maybe_unused]] const auto input_vector512 = reinterpret_cast<const __m512i*>(input);
+      [[maybe_unused]] const auto input_vector256 = reinterpret_cast<const __m256i*>(input);
+
+      // kOutputDimensions is either 1 or a multiple of kSimdWidth
+      // because then it is also an input dimension.
+      if constexpr (kOutputDimensions % 16 == 0 && kNumChunks256 == 1)
+      {
+        for (IndexType i = 0; i < kOutputDimensions; i += 16)
+        {
+          const IndexType offset01a = (i + 0) * kPaddedInputDimensions;
+          const IndexType offset23a = (i + 2) * kPaddedInputDimensions;
+          const IndexType offset45a = (i + 4) * kPaddedInputDimensions;
+          const IndexType offset67a = (i + 6) * kPaddedInputDimensions;
+          const IndexType offset01b = (i + 8) * kPaddedInputDimensions;
+          const IndexType offset23b = (i + 10) * kPaddedInputDimensions;
+          const IndexType offset45b = (i + 12) * kPaddedInputDimensions;
+          const IndexType offset67b = (i + 14) * kPaddedInputDimensions;
+
+          const __m512i bias = *reinterpret_cast<const __m512i*>(&biases_[i]);
+          __m512i* outptr = reinterpret_cast<__m512i*>(&output[i]);
+
+          const auto row01a = *reinterpret_cast<const __m512i*>(&weights_[offset01a]);
+          const auto row23a = *reinterpret_cast<const __m512i*>(&weights_[offset23a]);
+          const auto row45a = *reinterpret_cast<const __m512i*>(&weights_[offset45a]);
+          const auto row67a = *reinterpret_cast<const __m512i*>(&weights_[offset67a]);
+          const auto row01b = *reinterpret_cast<const __m512i*>(&weights_[offset01b]);
+          const auto row23b = *reinterpret_cast<const __m512i*>(&weights_[offset23b]);
+          const auto row45b = *reinterpret_cast<const __m512i*>(&weights_[offset45b]);
+          const auto row67b = *reinterpret_cast<const __m512i*>(&weights_[offset67b]);
+
+          const __m256i in256 = input_vector256[0];
+          const __m512i in = _mm512_inserti64x4(_mm512_castsi256_si512(in256), in256, 1);
+
+#if defined (USE_VNNI)
+          __m512i sum01a = _mm512_setzero_si512();
+          __m512i sum23a = _mm512_setzero_si512();
+          __m512i sum45a = _mm512_setzero_si512();
+          __m512i sum67a = _mm512_setzero_si512();
+          __m512i sum01b = _mm512_setzero_si512();
+          __m512i sum23b = _mm512_setzero_si512();
+          __m512i sum45b = _mm512_setzero_si512();
+          __m512i sum67b = _mm512_setzero_si512();
+
+          m512_add_dpbusd_epi32(sum01a, in, row01a);
+          m512_add_dpbusd_epi32(sum23a, in, row23a);
+          m512_add_dpbusd_epi32(sum45a, in, row45a);
+          m512_add_dpbusd_epi32(sum67a, in, row67a);
+          m512_add_dpbusd_epi32(sum01b, in, row01b);
+          m512_add_dpbusd_epi32(sum23b, in, row23b);
+          m512_add_dpbusd_epi32(sum45b, in, row45b);
+          m512_add_dpbusd_epi32(sum67b, in, row67b);
+#else
+          __m512i sum01a = m512_dpbusd_epi32(in, row01a);
+          __m512i sum23a = m512_dpbusd_epi32(in, row23a);
+          __m512i sum45a = m512_dpbusd_epi32(in, row45a);
+          __m512i sum67a = m512_dpbusd_epi32(in, row67a);
+          __m512i sum01b = m512_dpbusd_epi32(in, row01b);
+          __m512i sum23b = m512_dpbusd_epi32(in, row23b);
+          __m512i sum45b = m512_dpbusd_epi32(in, row45b);
+          __m512i sum67b = m512_dpbusd_epi32(in, row67b);
+#endif
+
+          *outptr = m512_hadd256x16(
+            sum01a, sum23a, sum45a, sum67a,
+            sum01b, sum23b, sum45b, sum67b, bias);
+        }
+      }
+      else if constexpr (kOutputDimensions % 4 == 0)
+      {
+        for (IndexType i = 0; i < kOutputDimensions; i += 4)
+        {
+          const IndexType offset0 = (i + 0) * kPaddedInputDimensions;
+          const IndexType offset1 = (i + 1) * kPaddedInputDimensions;
+          const IndexType offset2 = (i + 2) * kPaddedInputDimensions;
+          const IndexType offset3 = (i + 3) * kPaddedInputDimensions;
+
+          const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
+          __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
+
+          if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0)
+          {
+            const auto row0 = reinterpret_cast<const __m512i*>(&weights_[offset0]);
+            const auto row1 = reinterpret_cast<const __m512i*>(&weights_[offset1]);
+            const auto row2 = reinterpret_cast<const __m512i*>(&weights_[offset2]);
+            const auto row3 = reinterpret_cast<const __m512i*>(&weights_[offset3]);
+
+#if defined (USE_VNNI)
+            __m512i sum0 = _mm512_setzero_si512();
+            __m512i sum1 = _mm512_setzero_si512();
+            __m512i sum2 = _mm512_setzero_si512();
+            __m512i sum3 = _mm512_setzero_si512();
+            const IndexType kStart = 0;
+#else
+            __m512i sum0 = m512_dpbusd_epi32(input_vector512[0], row0[0]);
+            __m512i sum1 = m512_dpbusd_epi32(input_vector512[0], row1[0]);
+            __m512i sum2 = m512_dpbusd_epi32(input_vector512[0], row2[0]);
+            __m512i sum3 = m512_dpbusd_epi32(input_vector512[0], row3[0]);
+            const IndexType kStart = 1;
+#endif
+
+            for (IndexType j = kStart; j < kNumChunks512; ++j)
+            {
+              const __m512i in = input_vector512[j];
+
+#if defined (USE_VNNI)
+              m512_add_dpbusd_epi32(sum0, in, row0[j]);
+              m512_add_dpbusd_epi32(sum1, in, row1[j]);
+              m512_add_dpbusd_epi32(sum2, in, row2[j]);
+              m512_add_dpbusd_epi32(sum3, in, row3[j]);
+#else
+              sum0 = _mm512_add_epi32(sum0, m512_dpbusd_epi32(in, row0[j]));
+              sum1 = _mm512_add_epi32(sum1, m512_dpbusd_epi32(in, row1[j]));
+              sum2 = _mm512_add_epi32(sum2, m512_dpbusd_epi32(in, row2[j]));
+              sum3 = _mm512_add_epi32(sum3, m512_dpbusd_epi32(in, row3[j]));
+#endif
+            }
+
+            *outptr = m512_haddx4(sum0, sum1, sum2, sum3, bias);
+          }
+          else
+          {
+            const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]);
+            const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]);
+            const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]);
+            const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]);
+
+#if defined (USE_VNNI)
+            __m256i sum0 = _mm256_setzero_si256();
+            __m256i sum1 = _mm256_setzero_si256();
+            __m256i sum2 = _mm256_setzero_si256();
+            __m256i sum3 = _mm256_setzero_si256();
+            const IndexType kStart = 0;
+#else
+            __m256i sum0 = m256_dpbusd_epi32(input_vector256[0], row0[0]);
+            __m256i sum1 = m256_dpbusd_epi32(input_vector256[0], row1[0]);
+            __m256i sum2 = m256_dpbusd_epi32(input_vector256[0], row2[0]);
+            __m256i sum3 = m256_dpbusd_epi32(input_vector256[0], row3[0]);
+            const IndexType kStart = 1;
+#endif
+
+            for (IndexType j = kStart; j < kNumChunks256; ++j)
+            {
+              const __m256i in = input_vector256[j];
+
+#if defined (USE_VNNI)
+              m256_add_dpbusd_epi32(sum0, in, row0[j]);
+              m256_add_dpbusd_epi32(sum1, in, row1[j]);
+              m256_add_dpbusd_epi32(sum2, in, row2[j]);
+              m256_add_dpbusd_epi32(sum3, in, row3[j]);
+#else
+              sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
+              sum1 = _mm256_add_epi32(sum1, m256_dpbusd_epi32(in, row1[j]));
+              sum2 = _mm256_add_epi32(sum2, m256_dpbusd_epi32(in, row2[j]));
+              sum3 = _mm256_add_epi32(sum3, m256_dpbusd_epi32(in, row3[j]));
+#endif
+            }
+
+            *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias);
+          }
+        }
+      }
+      else if constexpr (kOutputDimensions == 1)
+      {
+        if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0)
+        {
+          const auto row0 = reinterpret_cast<const __m512i*>(&weights_[0]);
+
+#if defined (USE_VNNI)
+          __m512i sum0 = _mm512_setzero_si512();
+          const IndexType kStart = 0;
+#else
+          __m512i sum0 = m512_dpbusd_epi32(input_vector512[0], row0[0]);
+          const IndexType kStart = 1;
+#endif
+
+          for (IndexType j = kStart; j < kNumChunks512; ++j)
+          {
+            const __m512i in = input_vector512[j];
+
+#if defined (USE_VNNI)
+            m512_add_dpbusd_epi32(sum0, in, row0[j]);
+#else
+            sum0 = _mm512_add_epi32(sum0, m512_dpbusd_epi32(in, row0[j]));
+#endif
+          }
+
+          output[0] = m512_hadd(sum0, biases_[0]);
+        }
+        else
+        {
+          const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]);
+
+#if defined (USE_VNNI)
+          __m256i sum0 = _mm256_setzero_si256();
+          const IndexType kStart = 0;
+#else
+          __m256i sum0 = m256_dpbusd_epi32(input_vector256[0], row0[0]);
+          const IndexType kStart = 1;
+#endif
+
+          for (IndexType j = kStart; j < kNumChunks256; ++j)
+          {
+            const __m256i in = input_vector256[j];
+
+#if defined (USE_VNNI)
+            m256_add_dpbusd_epi32(sum0, in, row0[j]);
+#else
+            sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
+#endif
+          }
+
+          output[0] = m256_hadd(sum0, biases_[0]);
+        }
+      }
+      else
+      {
+        // This case can never happen because kOutputDimensions
+        // is always 1 or a multiple of kSimdWidth.
+        assert(false);
+      }
+
+#elif defined (USE_AVX2)
 
-  #elif defined(USE_AVX2)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+
+      const auto output = reinterpret_cast<OutputType*>(buffer);
       const auto input_vector = reinterpret_cast<const __m256i*>(input);
-  #if !defined(USE_VNNI)
-      const __m256i kOnes = _mm256_set1_epi16(1);
-  #endif
 
-  #elif defined(USE_SSE2)
+      // kOutputDimensions is either 1 or a multiple of kSimdWidth
+      // because then it is also an input dimension.
+      if constexpr (kOutputDimensions % 4 == 0)
+      {
+        for (IndexType i = 0; i < kOutputDimensions; i += 4)
+        {
+          const IndexType offset0 = (i + 0) * kPaddedInputDimensions;
+          const IndexType offset1 = (i + 1) * kPaddedInputDimensions;
+          const IndexType offset2 = (i + 2) * kPaddedInputDimensions;
+          const IndexType offset3 = (i + 3) * kPaddedInputDimensions;
+
+          const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
+          __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
+
+          const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]);
+          const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]);
+          const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]);
+          const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]);
+
+#if defined (USE_VNNI)
+          __m256i sum0 = _mm256_setzero_si256();
+          __m256i sum1 = _mm256_setzero_si256();
+          __m256i sum2 = _mm256_setzero_si256();
+          __m256i sum3 = _mm256_setzero_si256();
+          const IndexType kStart = 0;
+#else
+          __m256i sum0 = m256_dpbusd_epi32(input_vector[0], row0[0]);
+          __m256i sum1 = m256_dpbusd_epi32(input_vector[0], row1[0]);
+          __m256i sum2 = m256_dpbusd_epi32(input_vector[0], row2[0]);
+          __m256i sum3 = m256_dpbusd_epi32(input_vector[0], row3[0]);
+          const IndexType kStart = 1;
+#endif
+
+          for (IndexType j = kStart; j < kNumChunks; ++j)
+          {
+            const __m256i in = input_vector[j];
+
+#if defined (USE_VNNI)
+            m256_add_dpbusd_epi32(sum0, in, row0[j]);
+            m256_add_dpbusd_epi32(sum1, in, row1[j]);
+            m256_add_dpbusd_epi32(sum2, in, row2[j]);
+            m256_add_dpbusd_epi32(sum3, in, row3[j]);
+#else
+            sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
+            sum1 = _mm256_add_epi32(sum1, m256_dpbusd_epi32(in, row1[j]));
+            sum2 = _mm256_add_epi32(sum2, m256_dpbusd_epi32(in, row2[j]));
+            sum3 = _mm256_add_epi32(sum3, m256_dpbusd_epi32(in, row3[j]));
+#endif
+          }
+
+          *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias);
+        }
+      }
+      else if constexpr (kOutputDimensions == 1)
+      {
+        const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]);
+
+#if defined (USE_VNNI)
+        __m256i sum0 = _mm256_setzero_si256();
+        const IndexType kStart = 0;
+#else
+        __m256i sum0 = m256_dpbusd_epi32(input_vector[0], row0[0]);
+        const IndexType kStart = 1;
+#endif
+
+        for (IndexType j = kStart; j < kNumChunks; ++j)
+        {
+          const __m256i in = input_vector[j];
+
+#if defined (USE_VNNI)
+          m256_add_dpbusd_epi32(sum0, in, row0[j]);
+#else
+          sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
+#endif
+        }
+
+        output[0] = m256_hadd(sum0, biases_[0]);
+      }
+      else
+      {
+        // This case can never happen because kOutputDimensions
+        // is always 1 or a multiple of kSimdWidth.
+        assert(false);
+      }
+
+#elif defined (USE_SSSE3)
+
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
-  #ifndef USE_SSSE3
-      const __m128i kZeros = _mm_setzero_si128();
-  #else
-      const __m128i kOnes = _mm_set1_epi16(1);
-  #endif
+
+      auto output = reinterpret_cast<OutputType*>(buffer);
       const auto input_vector = reinterpret_cast<const __m128i*>(input);
 
-  #elif defined(USE_MMX)
+      // kOutputDimensions is either 1 or a multiple of kSimdWidth
+      // because then it is also an input dimension.
+      if constexpr (kOutputDimensions % 4 == 0)
+      {
+        for (IndexType i = 0; i < kOutputDimensions; i += 4)
+        {
+          const IndexType offset0 = (i + 0) * kPaddedInputDimensions;
+          const IndexType offset1 = (i + 1) * kPaddedInputDimensions;
+          const IndexType offset2 = (i + 2) * kPaddedInputDimensions;
+          const IndexType offset3 = (i + 3) * kPaddedInputDimensions;
+
+          const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
+          __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
+
+          const auto row0 = reinterpret_cast<const __m128i*>(&weights_[offset0]);
+          const auto row1 = reinterpret_cast<const __m128i*>(&weights_[offset1]);
+          const auto row2 = reinterpret_cast<const __m128i*>(&weights_[offset2]);
+          const auto row3 = reinterpret_cast<const __m128i*>(&weights_[offset3]);
+
+          __m128i sum0 = m128_dpbusd_epi32(input_vector[0], row0[0]);
+          __m128i sum1 = m128_dpbusd_epi32(input_vector[0], row1[0]);
+          __m128i sum2 = m128_dpbusd_epi32(input_vector[0], row2[0]);
+          __m128i sum3 = m128_dpbusd_epi32(input_vector[0], row3[0]);
+
+          for (int j = 1; j < (int)kNumChunks; ++j)
+          {
+            const __m128i in = input_vector[j];
+
+            sum0 = _mm_add_epi32(sum0, m128_dpbusd_epi32(in, row0[j]));
+            sum1 = _mm_add_epi32(sum1, m128_dpbusd_epi32(in, row1[j]));
+            sum2 = _mm_add_epi32(sum2, m128_dpbusd_epi32(in, row2[j]));
+            sum3 = _mm_add_epi32(sum3, m128_dpbusd_epi32(in, row3[j]));
+          }
+
+          *outptr = m128_haddx4(sum0, sum1, sum2, sum3, bias);
+        }
+      }
+      else if constexpr (kOutputDimensions == 1)
+      {
+        const auto row0 = reinterpret_cast<const __m128i*>(&weights_[0]);
+
+        __m128i sum0 = m128_dpbusd_epi32(input_vector[0], row0[0]);
+
+        for (int j = 1; j < (int)kNumChunks; ++j)
+          sum0 = _mm_add_epi32(sum0, m128_dpbusd_epi32(input_vector[j], row0[j]));
+
+        output[0] = m128_hadd(sum0, biases_[0]);
+      }
+      else
+      {
+        // This case can never happen because kOutputDimensions
+        // is always 1 or a multiple of kSimdWidth.
+        assert(false);
+      }
+
+#else
+
+// Use old implementation for the other architectures.
+
+      auto output = reinterpret_cast<OutputType*>(buffer);
+
+#if defined(USE_SSE2)
+      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+#ifndef USE_SSSE3
+      const __m128i kZeros = _mm_setzero_si128();
+#else
+      const __m128i kOnes = _mm_set1_epi16(1);
+#endif
+      const auto input_vector = reinterpret_cast<const __m128i*>(input);
+
+#elif defined(USE_MMX)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
       const __m64 kZeros = _mm_setzero_si64();
       const auto input_vector = reinterpret_cast<const __m64*>(input);
 
-  #elif defined(USE_NEON)
+#elif defined(USE_NEON)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
       const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
-  #endif
+#endif
 
       for (IndexType i = 0; i < kOutputDimensions; ++i) {
         const IndexType offset = i * kPaddedInputDimensions;
 
-  #if defined(USE_AVX512)
-        __m512i sum = _mm512_setzero_si512();
-        const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-  #if defined(USE_VNNI)
-            sum = _mm512_dpbusd_epi32(sum, _mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
-  #else
-            __m512i product = _mm512_maddubs_epi16(_mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
-            product = _mm512_madd_epi16(product, kOnes);
-            sum = _mm512_add_epi32(sum, product);
-  #endif
-        }
-
-        // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks.
-        // As a result kPaddedInputDimensions may not be an even multiple of 64(512bit)
-        // and we have to do one more 256bit chunk.
-        if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2)
-        {
-            const auto iv256  = reinterpret_cast<const __m256i*>(&input_vector[kNumChunks]);
-            const auto row256 = reinterpret_cast<const __m256i*>(&row[kNumChunks]);
-  #if defined(USE_VNNI)
-            __m256i product256 = _mm256_dpbusd_epi32(
-                _mm512_castsi512_si256(sum), _mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
-            sum = _mm512_inserti32x8(sum, product256, 0);
-  #else
-            __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
-            sum = _mm512_add_epi32(sum, _mm512_cvtepi16_epi32(product256));
-  #endif
-        }
-        output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
-
-  #elif defined(USE_AVX2)
-        __m256i sum = _mm256_setzero_si256();
-        const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-  #if defined(USE_VNNI)
-          sum = _mm256_dpbusd_epi32(sum, _mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
-  #else
-          __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
-          product = _mm256_madd_epi16(product, kOnes);
-          sum = _mm256_add_epi32(sum, product);
-  #endif
-        }
-        __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
-        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
-        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
-        output[i] = _mm_cvtsi128_si32(sum128) + biases_[i];
-
-  #elif defined(USE_SSSE3)
-        __m128i sum = _mm_setzero_si128();
-        const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
-        for (int j = 0; j < (int)kNumChunks - 1; j += 2) {
-          __m128i product0 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
-          product0 = _mm_madd_epi16(product0, kOnes);
-          sum = _mm_add_epi32(sum, product0);
-          __m128i product1 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j+1]), _mm_load_si128(&row[j+1]));
-          product1 = _mm_madd_epi16(product1, kOnes);
-          sum = _mm_add_epi32(sum, product1);
-        }
-        if (kNumChunks & 0x1) {
-          __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[kNumChunks-1]), _mm_load_si128(&row[kNumChunks-1]));
-          product = _mm_madd_epi16(product, kOnes);
-          sum = _mm_add_epi32(sum, product);
-        }
-        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
-        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
-        output[i] = _mm_cvtsi128_si32(sum) + biases_[i];
-
-  #elif defined(USE_SSE2)
+#if defined(USE_SSE2)
         __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]);
         __m128i sum_hi = kZeros;
         const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
@@ -223,7 +739,7 @@ namespace Eval::NNUE::Layers {
         sum = _mm_add_epi32(sum, sum_second_32);
         output[i] = _mm_cvtsi128_si32(sum);
 
-  #elif defined(USE_MMX)
+#elif defined(USE_MMX)
         __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]);
         __m64 sum_hi = kZeros;
         const auto row = reinterpret_cast<const __m64*>(&weights_[offset]);
@@ -244,7 +760,7 @@ namespace Eval::NNUE::Layers {
         sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
         output[i] = _mm_cvtsi64_si32(sum);
 
-  #elif defined(USE_NEON)
+#elif defined(USE_NEON)
         int32x4_t sum = {biases_[i]};
         const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
@@ -254,18 +770,21 @@ namespace Eval::NNUE::Layers {
         }
         output[i] = sum[0] + sum[1] + sum[2] + sum[3];
 
-  #else
+#else
         OutputType sum = biases_[i];
         for (IndexType j = 0; j < kInputDimensions; ++j) {
           sum += weights_[offset + j] * input[j];
         }
         output[i] = sum;
-  #endif
+#endif
 
       }
-  #if defined(USE_MMX)
+#if defined(USE_MMX)
       _mm_empty();
-  #endif
+#endif
+
+#endif
+
       return output;
     }
 
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index d923986e..3e9ce655 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -23,6 +23,10 @@
 
 #include "../nnue_common.h"
 
+#include <string>
+#include <cstdint>
+#include <type_traits>
+
 namespace Eval::NNUE::Layers {
 
   // Clipped ReLU
@@ -47,6 +51,8 @@ namespace Eval::NNUE::Layers {
     static constexpr std::size_t kBufferSize =
         PreviousLayer::kBufferSize + kSelfBufferSize;
 
+    static constexpr int kLayerIndex = PreviousLayer::kLayerIndex + 1;
+
     // Hash value embedded in the evaluation file
     static constexpr std::uint32_t GetHashValue() {
       std::uint32_t hash_value = 0x538D24C7u;
@@ -54,11 +60,24 @@ namespace Eval::NNUE::Layers {
       return hash_value;
     }
 
+    static std::string get_name() {
+        // Label of this layer alone, in the form "ClippedReLU[<output dims>]".
+        return "ClippedReLU[" + std::to_string(kOutputDimensions) + "]";
+    }
+
     // A string that represents the structure from the input layer to this layer
-    static std::string GetStructureString() {
-      return "ClippedReLU[" +
-        std::to_string(kOutputDimensions) + "](" +
-        PreviousLayer::GetStructureString() + ")";
+    static std::string get_structure_string() {
+        return get_name() + "(" +
+            PreviousLayer::get_structure_string() + ")";
+    }
+
+    static std::string get_layers_info() {
+        // Start from the listing of all preceding layers, then append this
+        // layer's own entry on a new line, formatted as
+        // "  - <layer index> - <layer name>".
+        std::string listing = PreviousLayer::get_layers_info();
+        listing += "\n  - " + std::to_string(kLayerIndex) + " - " + get_name();
+        return listing;
     }
 
     // Read network parameters
@@ -68,7 +87,7 @@ namespace Eval::NNUE::Layers {
 
     // write parameters
     bool WriteParameters(std::ostream& stream) const {
-      return previous_layer_.WriteParameters(stream);
+        return previous_layer_.WriteParameters(stream);
     }
 
     // Forward propagation
@@ -86,12 +105,12 @@ namespace Eval::NNUE::Layers {
       const auto out = reinterpret_cast<__m256i*>(output);
       for (IndexType i = 0; i < kNumChunks; ++i) {
         const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
-            _mm256_loadA_si256(&in[i * 4 + 0]),
-            _mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits);
+            _mm256_load_si256(&in[i * 4 + 0]),
+            _mm256_load_si256(&in[i * 4 + 1])), kWeightScaleBits);
         const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
-            _mm256_loadA_si256(&in[i * 4 + 2]),
-            _mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits);
-        _mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+            _mm256_load_si256(&in[i * 4 + 2]),
+            _mm256_load_si256(&in[i * 4 + 3])), kWeightScaleBits);
+        _mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
             _mm256_packs_epi16(words0, words1), kZero), kOffsets));
       }
       constexpr IndexType kStart = kNumChunks * kSimdWidth;
@@ -170,9 +189,9 @@ namespace Eval::NNUE::Layers {
     }
 
    private:
-     // Make the learning class a friend
-     friend class Trainer<ClippedReLU>;
-     
+    // Make the learning class a friend
+    friend class Trainer<ClippedReLU>;
+
     PreviousLayer previous_layer_;
   };
 
diff --git a/src/nnue/layers/input_slice.h b/src/nnue/layers/input_slice.h
index 78756a39..7a4ef045 100644
--- a/src/nnue/layers/input_slice.h
+++ b/src/nnue/layers/input_slice.h
@@ -41,6 +41,8 @@ class InputSlice {
   // Size of forward propagation buffer used from the input layer to this layer
   static constexpr std::size_t kBufferSize = 0;
 
+  static constexpr int kLayerIndex = 1;
+
   // Hash value embedded in the evaluation file
   static constexpr std::uint32_t GetHashValue() {
     std::uint32_t hash_value = 0xEC42E90Du;
@@ -48,12 +50,24 @@ class InputSlice {
     return hash_value;
   }
 
-  // A string that represents the structure from the input layer to this layer
-  static std::string GetStructureString() {
-    return "InputSlice[" + std::to_string(kOutputDimensions) + "(" +
-      std::to_string(Offset) + ":" +
-      std::to_string(Offset + kOutputDimensions) + ")]";
-  }
+    static std::string get_name() {
+        // Label format: "InputSlice[<dims>(<offset>:<offset + dims>)]".
+        const std::string range = std::to_string(Offset) + ":" + std::to_string(Offset + kOutputDimensions);
+        return "InputSlice[" + std::to_string(kOutputDimensions) + "(" + range + ")]";
+    }
+
+    // A string that represents the structure from the input layer to this layer
+    static std::string get_structure_string() {
+        return get_name();
+    }
+
+    static std::string get_layers_info() {
+        // Single entry "  - <layer index> - <layer name>"; unlike the other
+        // layers, no preceding listing is queried here, so there is no
+        // leading newline.
+        const std::string index = std::to_string(kLayerIndex);
+        return "  - " + index + " - " + get_name();
+    }
 
   // Read network parameters
   bool ReadParameters(std::istream& /*stream*/) {
@@ -62,7 +76,7 @@ class InputSlice {
 
   // write parameters
   bool WriteParameters(std::ostream& /*stream*/) const {
-    return true;
+      return true;
   }
 
   // Forward propagation
diff --git a/src/nnue/layers/sum.h b/src/nnue/layers/sum.h
index d8c7bf93..01ae251c 100644
--- a/src/nnue/layers/sum.h
+++ b/src/nnue/layers/sum.h
@@ -1,163 +1,196 @@
-﻿// Definition of layer Sum of NNUE evaluation function
-
-#ifndef _NNUE_LAYERS_SUM_H_
+﻿#ifndef _NNUE_LAYERS_SUM_H_
 #define _NNUE_LAYERS_SUM_H_
 
-#if defined(EVAL_NNUE)
+#include "nnue/nnue_common.h"
 
-#include "../nnue_common.h"
+// Definition of layer Sum of NNUE evaluation function
+namespace Eval::NNUE::Layers {
 
-namespace Eval {
+    // Layer that sums the output of multiple layers
+    template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
+    class Sum : public Sum<RemainingPreviousLayers...> {
+    private:
+        using Head = FirstPreviousLayer;
+        using Tail = Sum<RemainingPreviousLayers...>;
 
-namespace NNUE {
+     public:
+        // Input/output type
+        using InputType = typename Head::OutputType;
 
-namespace Layers {
+        using OutputType = InputType;
 
-// Layer that sums the output of multiple layers
-template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
-class Sum : public Sum<RemainingPreviousLayers...> {
- private:
-  using Head = FirstPreviousLayer;
-  using Tail = Sum<RemainingPreviousLayers...>;
+        static_assert(std::is_same<InputType, typename Tail::InputType>::value, "");
 
- public:
-  // Input/output type
-  using InputType = typename Head::OutputType;
-  using OutputType = InputType;
-  static_assert(std::is_same<InputType, typename Tail::InputType>::value, "");
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions = Head::kOutputDimensions;
 
-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions = Head::kOutputDimensions;
-  static constexpr IndexType kOutputDimensions = kInputDimensions;
-  static_assert(kInputDimensions == Tail::kInputDimensions ,"");
+        static constexpr IndexType kOutputDimensions = kInputDimensions;
 
-  // Size of forward propagation buffer used in this layer
-  static constexpr std::size_t kSelfBufferSize =
-      CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+        static_assert(kInputDimensions == Tail::kInputDimensions ,"");
 
-  // Size of the forward propagation buffer used from the input layer to this layer
-  static constexpr std::size_t kBufferSize =
-      std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize);
+        // Size of forward propagation buffer used in this layer
+        static constexpr std::size_t kSelfBufferSize =
+            CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
 
-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t GetHashValue() {
-    std::uint32_t hash_value = 0xBCE400B4u;
-    hash_value ^= Head::GetHashValue() >> 1;
-    hash_value ^= Head::GetHashValue() << 31;
-    hash_value ^= Tail::GetHashValue() >> 2;
-    hash_value ^= Tail::GetHashValue() << 30;
-    return hash_value;
-  }
+        // Size of the forward propagation buffer used from the input layer to this layer
+        static constexpr std::size_t kBufferSize =
+            std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize);
 
-  // A string that represents the structure from the input layer to this layer
-  static std::string GetStructureString() {
-    return "Sum[" +
-        std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
-  }
+        static constexpr int kLayerIndex = Tail::kLayerIndex + 1;
 
-  // read parameters
-  bool ReadParameters(std::istream& stream) {
-    if (!Tail::ReadParameters(stream)) return false;
-    return previous_layer_.ReadParameters(stream);
-  }
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t GetHashValue() {
+            std::uint32_t hash_value = 0xBCE400B4u;
+            hash_value ^= Head::GetHashValue() >> 1;
+            hash_value ^= Head::GetHashValue() << 31;
+            hash_value ^= Tail::GetHashValue() >> 2;
+            hash_value ^= Tail::GetHashValue() << 30;
+            return hash_value;
+        }
 
-  // write parameters
-  bool WriteParameters(std::ostream& stream) const {
-    if (!Tail::WriteParameters(stream)) return false;
-    return previous_layer_.WriteParameters(stream);
-  }
+        static std::string get_name() {
+            // Label of this layer alone (summands excluded): "Sum[<output dims>]".
+            return "Sum[" + std::to_string(kOutputDimensions) + "]";
+        }
 
-  // forward propagation
-  const OutputType* Propagate(
-      const TransformedFeatureType* transformed_features, char* buffer) const {
-    Tail::Propagate(transformed_features, buffer);
-    const auto head_output = previous_layer_.Propagate(
-        transformed_features, buffer + kSelfBufferSize);
-    const auto output = reinterpret_cast<OutputType*>(buffer);
-    for (IndexType i = 0; i <kOutputDimensions; ++i) {
-      output[i] += head_output[i];
-    }
-    return output;
-  }
+        // A string that represents the structure from the input layer to this layer
+        static std::string get_structure_string() {
+            return get_name() + "(" + get_summands_string() + ")";
+        }
 
- protected:
-  // A string that represents the list of layers to be summed
-  static std::string GetSummandsString() {
-    return Head::GetStructureString() + "," + Tail::GetSummandsString();
-  }
+        static std::string get_layers_info() {
+            // Start from the listing produced by the Tail sum, then append
+            // this layer's own entry on a new line, formatted as
+            // "  - <layer index> - <layer name>".
+            std::string listing = Tail::get_layers_info();
+            listing += "\n  - " + std::to_string(kLayerIndex) + " - " + get_name();
+            return listing;
+        }
 
-  // Make the learning class a friend
-  friend class Trainer<Sum>;
+        // read parameters
+        bool ReadParameters(std::istream& stream) {
+            if (!Tail::ReadParameters(stream))
+                return false;
 
-  // the layer immediately before this layer
-  FirstPreviousLayer previous_layer_;
-};
+            return previous_layer_.ReadParameters(stream);
+        }
 
-// Layer that sums the output of multiple layers (when there is one template argument)
-template <typename PreviousLayer>
-class Sum<PreviousLayer> {
- public:
-  // Input/output type
-  using InputType = typename PreviousLayer::OutputType;
-  using OutputType = InputType;
+        // write parameters
+        bool WriteParameters(std::ostream& stream) const {
+            if (!Tail::WriteParameters(stream))
+                return false;
 
-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions =
-      PreviousLayer::kOutputDimensions;
-  static constexpr IndexType kOutputDimensions = kInputDimensions;
+            return previous_layer_.WriteParameters(stream);
+        }
 
-  // Size of the forward propagation buffer used from the input layer to this layer
-  static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize;
+        // Forward propagation: Tail propagates into buffer, then the head layer's output is added element-wise
+        const OutputType* Propagate(
+            const TransformedFeatureType* transformed_features, char* buffer) const {
 
-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t GetHashValue() {
-    std::uint32_t hash_value = 0xBCE400B4u;
-    hash_value ^= PreviousLayer::GetHashValue() >> 1;
-    hash_value ^= PreviousLayer::GetHashValue() << 31;
-    return hash_value;
-  }
+            Tail::Propagate(transformed_features, buffer);  // same name as Sum<PreviousLayer>::Propagate
 
-  // A string that represents the structure from the input layer to this layer
-  static std::string GetStructureString() {
-    return "Sum[" +
-        std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
-  }
+            const auto head_output = previous_layer_.Propagate(
+                transformed_features, buffer + kSelfBufferSize);
 
-  // read parameters
-  bool ReadParameters(std::istream& stream) {
-    return previous_layer_.ReadParameters(stream);
-  }
+            const auto output = reinterpret_cast<OutputType*>(buffer);
 
-  // write parameters
-  bool WriteParameters(std::ostream& stream) const {
-    return previous_layer_.WriteParameters(stream);
-  }
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                output[i] += head_output[i];
+            }
 
-  // forward propagation
-  const OutputType* Propagate(
-      const TransformedFeatureType* transformed_features, char* buffer) const {
-    return previous_layer_.Propagate(transformed_features, buffer);
-  }
+            return output;
+        }
 
- protected:
-  // A string that represents the list of layers to be summed
-  static std::string GetSummandsString() {
-    return PreviousLayer::GetStructureString();
-  }
+    protected:
+        // A string that represents the list of layers to be summed
+        static std::string get_summands_string() {
+            return Head::get_structure_string() + "," + Tail::get_summands_string();
+        }
 
-  // Make the learning class a friend
-  friend class Trainer<Sum>;
+        // Make the learning class a friend
+        friend class Trainer<Sum>;
 
-  // the layer immediately before this layer
-  PreviousLayer previous_layer_;
-};
+        // the layer immediately before this layer
+        FirstPreviousLayer previous_layer_;
+    };
 
-}  // namespace Layers
+    // Layer that sums the output of multiple layers (when there is one template argument)
+    template <typename PreviousLayer>
+    class Sum<PreviousLayer> {
+    public:
+        // Input/output type
+        using InputType = typename PreviousLayer::OutputType;
 
-}  // namespace NNUE
+        using OutputType = InputType;
 
-}  // namespace Eval
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions =
+            PreviousLayer::kOutputDimensions;
 
-#endif  // defined(EVAL_NNUE)
+        static constexpr IndexType kOutputDimensions = kInputDimensions;
+
+        // Size of the forward propagation buffer used from the input layer to this layer
+        static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize;
+
+        static constexpr int kLayerIndex = PreviousLayer::kLayerIndex + 1;
+
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t GetHashValue() {
+            std::uint32_t hash_value = 0xBCE400B4u;
+            hash_value ^= PreviousLayer::GetHashValue() >> 1;
+            hash_value ^= PreviousLayer::GetHashValue() << 31;
+            return hash_value;
+        }
+
+        static std::string get_name() {
+            // Label of this layer alone (summands excluded): "Sum[<output dims>]".
+            return "Sum[" + std::to_string(kOutputDimensions) + "]";
+        }
+
+        // A string that represents the structure from the input layer to this layer
+        static std::string get_structure_string() {
+            return get_name() + "(" + get_summands_string() + ")";
+        }
+
+        static std::string get_layers_info() {
+            // Append this layer's entry after the preceding layer's listing,
+            // using the same "  - <layer index> - <layer name>" entry format
+            // as the variadic Sum, ClippedReLU and InputSlice layers.
+            std::string listing = PreviousLayer::get_layers_info();
+            listing += "\n  - " + std::to_string(kLayerIndex) + " - " + get_name();
+            return listing;
+        }
+
+        // read parameters
+        bool ReadParameters(std::istream& stream) {
+            return previous_layer_.ReadParameters(stream);
+        }
+
+        // write parameters
+        bool WriteParameters(std::ostream& stream) const {
+            return previous_layer_.WriteParameters(stream);
+        }
+
+        // forward propagation
+        const OutputType* Propagate(
+            const TransformedFeatureType* transformed_features, char* buffer) const {
+
+            return previous_layer_.Propagate(transformed_features, buffer);
+        }
+
+    protected:
+        // A string that represents the list of layers to be summed
+        static std::string get_summands_string() {
+            return PreviousLayer::get_structure_string();
+        }
+
+        // Make the learning class a friend
+        friend class Trainer<Sum>;
+
+        // the layer immediately before this layer
+        PreviousLayer previous_layer_;
+    };
+
+}  // namespace Eval::NNUE::Layers
 
 #endif
diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h
index 69dfaad2..3d2f5bb4 100644
--- a/src/nnue/nnue_accumulator.h
+++ b/src/nnue/nnue_accumulator.h
@@ -27,11 +27,8 @@ namespace Eval::NNUE {
 
   // Class that holds the result of affine transformation of input features
   struct alignas(kCacheLineSize) Accumulator {
-    std::int16_t
-        accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
-    Value score;
-    bool computed_accumulation;
-    bool computed_score;
+      std::int16_t accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
+      bool computed_accumulation;
   };
 
 }  // namespace Eval::NNUE
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index d7ffa21a..58bfd146 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -21,6 +21,8 @@
 #ifndef NNUE_COMMON_H_INCLUDED
 #define NNUE_COMMON_H_INCLUDED
 
+#include "types.h"
+
 #include <cstring>
 #include <iostream>
 
@@ -43,29 +45,6 @@
 #include <arm_neon.h>
 #endif
 
-// HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Otherwise a binary
-//       compiled with older g++ crashes because the output memory is not aligned
-//       even though alignas is specified.
-#if defined(USE_AVX2)
-#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__)
-#define _mm256_loadA_si256  _mm256_loadu_si256
-#define _mm256_storeA_si256 _mm256_storeu_si256
-#else
-#define _mm256_loadA_si256  _mm256_load_si256
-#define _mm256_storeA_si256 _mm256_store_si256
-#endif
-#endif
-
-#if defined(USE_AVX512)
-#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__)
-#define _mm512_loadA_si512   _mm512_loadu_si512
-#define _mm512_storeA_si512  _mm512_storeu_si512
-#else
-#define _mm512_loadA_si512   _mm512_load_si512
-#define _mm512_storeA_si512  _mm512_store_si512
-#endif
-#endif
-
 namespace Eval::NNUE {
 
   // Version of the evaluation file
@@ -113,7 +92,7 @@ namespace Eval::NNUE {
     PS_END2     = 12 * SQUARE_NB + 1
   };
 
-  extern uint32_t kpp_board_index[PIECE_NB][COLOR_NB];
+  extern const uint32_t kpp_board_index[PIECE_NB][COLOR_NB];
 
   // Type of input feature after conversion
   using TransformedFeatureType = std::uint8_t;
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index e1bc2ab8..8c17c959 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -25,10 +25,66 @@
 #include "nnue_architecture.h"
 #include "features/index_list.h"
 
-#include <cstring> // std::memset()
+#include <cstring>
+#include <string>
 
 namespace Eval::NNUE {
 
+  // Per-ISA vector wrappers: vec_t plus vec_load/vec_store/vec_add_16/vec_sub_16/vec_zero.
+  // If vector instructions are enabled, we update and refresh the accumulator tile by
+  // tile such that each tile (kNumRegs vectors) fits in the CPU's vector registers.
+  #define VECTOR
+
+  #ifdef USE_AVX512
+  typedef __m512i vec_t;
+  #define vec_load(a) _mm512_load_si512(a)
+  #define vec_store(a,b) _mm512_store_si512(a,b)
+  #define vec_add_16(a,b) _mm512_add_epi16(a,b)
+  #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
+  #define vec_zero _mm512_setzero_si512()
+  static constexpr IndexType kNumRegs = 8; // only 8 are needed
+
+  #elif defined(USE_AVX2)
+  typedef __m256i vec_t;
+  #define vec_load(a) _mm256_load_si256(a)
+  #define vec_store(a,b) _mm256_store_si256(a,b)
+  #define vec_add_16(a,b) _mm256_add_epi16(a,b)
+  #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
+  #define vec_zero _mm256_setzero_si256()
+  static constexpr IndexType kNumRegs = 16;
+
+  #elif defined(USE_SSE2)
+  typedef __m128i vec_t;
+  #define vec_load(a) (*(a))
+  #define vec_store(a,b) (*(a)=(b))
+  #define vec_add_16(a,b) _mm_add_epi16(a,b)
+  #define vec_sub_16(a,b) _mm_sub_epi16(a,b)
+  #define vec_zero _mm_setzero_si128()
+  static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;
+
+  #elif defined(USE_MMX)
+  typedef __m64 vec_t;
+  #define vec_load(a) (*(a))
+  #define vec_store(a,b) (*(a)=(b))
+  #define vec_add_16(a,b) _mm_add_pi16(a,b)
+  #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
+  #define vec_zero _mm_setzero_si64()
+  static constexpr IndexType kNumRegs = 8;
+
+  #elif defined(USE_NEON)
+  typedef int16x8_t vec_t;
+  #define vec_load(a) (*(a))
+  #define vec_store(a,b) (*(a)=(b))
+  #define vec_add_16(a,b) vaddq_s16(a,b)
+  #define vec_sub_16(a,b) vsubq_s16(a,b)
+  #define vec_zero {0}
+  static constexpr IndexType kNumRegs = 16;
+
+  #else
+  #undef VECTOR  // no supported vector ISA selected: the non-VECTOR (scalar) code paths are compiled
+
+  #endif
+
   // Input feature converter
   class FeatureTransformer {
 
@@ -36,6 +92,11 @@ namespace Eval::NNUE {
     // Number of output dimensions for one side
     static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
 
+    #ifdef VECTOR
+    static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;
+    static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
+    #endif
+
    public:
     // Output type
     using OutputType = TransformedFeatureType;
@@ -48,20 +109,36 @@ namespace Eval::NNUE {
     static constexpr std::size_t kBufferSize =
         kOutputDimensions * sizeof(OutputType);
 
+    static constexpr int kLayerIndex = 0;
+
     // Hash value embedded in the evaluation file
     static constexpr std::uint32_t GetHashValue() {
+
       return RawFeatures::kHashValue ^ kOutputDimensions;
     }
 
+    static std::string get_name() {
+      // Layer label: "<raw feature set name>[<input dims>-><half dims>x2]"
+      std::string label = RawFeatures::get_name() + "[" + std::to_string(kInputDimensions);
+      return label + "->" + std::to_string(kHalfDimensions) + "x2]";
+    }
+
     // a string representing the structure
-    static std::string GetStructureString() {
-      return RawFeatures::GetName() + "[" +
-        std::to_string(kInputDimensions) + "->" +
-        std::to_string(kHalfDimensions) + "x2]";
+    static std::string get_structure_string() {
+      return get_name();  // the structure string of this layer is simply its name
+    }
+
+    static std::string get_layers_info() {
+      // One report line for this layer: "  - <layer index> - <layer name>"
+      const std::string prefix = "  - ";
+      const std::string index_text = std::to_string(kLayerIndex);
+      const std::string separator = " - ";
+      return prefix + index_text + separator + get_name();
     }
 
     // Read network parameters
     bool ReadParameters(std::istream& stream) {
+
       for (std::size_t i = 0; i < kHalfDimensions; ++i)
         biases_[i] = read_little_endian<BiasType>(stream);
       for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
@@ -72,34 +149,45 @@ namespace Eval::NNUE {
     // write parameters
     bool WriteParameters(std::ostream& stream) const {
       stream.write(reinterpret_cast<const char*>(biases_),
-        kHalfDimensions * sizeof(BiasType));
+          kHalfDimensions * sizeof(BiasType));  // biases first, dumped as raw in-memory bytes
+
       stream.write(reinterpret_cast<const char*>(weights_),
-        kHalfDimensions * kInputDimensions * sizeof(WeightType));
+          kHalfDimensions * kInputDimensions * sizeof(WeightType));  // NOTE(review): host byte order here, but ReadParameters uses read_little_endian - confirm big-endian hosts
+
       return !stream.fail();
     }
 
     // Proceed with the difference calculation if possible
-    bool UpdateAccumulatorIfPossible(const Position& pos) const {
+    bool update_accumulator_if_possible(const Position& pos) const {  // true: accumulator is valid on return; false: caller must refresh
+
       const auto now = pos.state();
-      if (now->accumulator.computed_accumulation) {
+      if (now->accumulator.computed_accumulation)  // already computed for the current state
         return true;
-      }
+
       const auto prev = now->previous;
       if (prev && prev->accumulator.computed_accumulation) {
-        UpdateAccumulator(pos);
+        update_accumulator(pos);  // previous state is valid: derive the current one from its accumulator
         return true;
       }
+
       return false;
     }
 
     // Convert input features
-    void Transform(const Position& pos, OutputType* output, bool refresh) const {
-      if (refresh || !UpdateAccumulatorIfPossible(pos)) {
-        RefreshAccumulator(pos);
-      }
+    void Transform(const Position& pos, OutputType* output) const {
+
+      if (!update_accumulator_if_possible(pos))
+        refresh_accumulator(pos);
+
       const auto& accumulation = pos.state()->accumulator.accumulation;
 
-  #if defined(USE_AVX2)
+  #if defined(USE_AVX512)
+      constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth * 2);
+      static_assert(kHalfDimensions % (kSimdWidth * 2) == 0);
+      const __m512i kControl = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
+      const __m512i kZero = _mm512_setzero_si512();
+
+  #elif defined(USE_AVX2)
       constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
       constexpr int kControl = 0b11011000;
       const __m256i kZero = _mm256_setzero_si256();
@@ -126,14 +214,39 @@ namespace Eval::NNUE {
       for (IndexType p = 0; p < 2; ++p) {
         const IndexType offset = kHalfDimensions * p;
 
-  #if defined(USE_AVX2)
+  #if defined(USE_AVX512)
+        auto out = reinterpret_cast<__m512i*>(&output[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m512i sum0 = _mm512_load_si512(
+              &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
+          __m512i sum1 = _mm512_load_si512(
+              &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+              sum0 = _mm512_add_epi16(sum0, reinterpret_cast<const __m512i*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 0]);
+              sum1 = _mm512_add_epi16(sum1, reinterpret_cast<const __m512i*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 1]);
+          }
+
+          _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(kControl,
+              _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), kZero)));
+        }
+
+  #elif defined(USE_AVX2)
         auto out = reinterpret_cast<__m256i*>(&output[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m256i sum0 = _mm256_loadA_si256(
+          __m256i sum0 = _mm256_load_si256(
               &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
-          __m256i sum1 = _mm256_loadA_si256(
-            &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
-          _mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
+          __m256i sum1 = _mm256_load_si256(
+              &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+              sum0 = _mm256_add_epi16(sum0, reinterpret_cast<const __m256i*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 0]);
+              sum1 = _mm256_add_epi16(sum1, reinterpret_cast<const __m256i*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 1]);
+          }
+
+          _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
               _mm256_packs_epi16(sum0, sum1), kZero), kControl));
         }
 
@@ -144,14 +257,21 @@ namespace Eval::NNUE {
               accumulation[perspectives[p]][0])[j * 2 + 0]);
           __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
               accumulation[perspectives[p]][0])[j * 2 + 1]);
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+            sum0 = _mm_add_epi16(sum0, reinterpret_cast<const __m128i*>(
+                accumulation[perspectives[p]][i])[j * 2 + 0]);
+            sum1 = _mm_add_epi16(sum1, reinterpret_cast<const __m128i*>(
+                accumulation[perspectives[p]][i])[j * 2 + 1]);
+          }
+
       const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
 
           _mm_store_si128(&out[j],
 
   #ifdef USE_SSE41
-            _mm_max_epi8(packedbytes, kZero)
+              _mm_max_epi8(packedbytes, kZero)
   #else
-            _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
+              _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
   #endif
 
           );
@@ -164,6 +284,13 @@ namespace Eval::NNUE {
               accumulation[perspectives[p]][0])[j * 2 + 0]);
           __m64 sum1 = *(&reinterpret_cast<const __m64*>(
               accumulation[perspectives[p]][0])[j * 2 + 1]);
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+              sum0 = _mm_add_pi16(sum0, reinterpret_cast<const __m64*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 0]);
+              sum1 = _mm_add_pi16(sum1, reinterpret_cast<const __m64*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 1]);
+          }
+
           const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
           out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
         }
@@ -173,12 +300,22 @@ namespace Eval::NNUE {
         for (IndexType j = 0; j < kNumChunks; ++j) {
           int16x8_t sum = reinterpret_cast<const int16x8_t*>(
               accumulation[perspectives[p]][0])[j];
+
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+              sum = vaddq_s16(sum, reinterpret_cast<const int16x8_t*>(
+                  accumulation[perspectives[p]][i])[j]);
+          }
+
           out[j] = vmax_s8(vqmovn_s16(sum), kZero);
         }
 
   #else
         for (IndexType j = 0; j < kHalfDimensions; ++j) {
           BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+              sum += accumulation[static_cast<int>(perspectives[p])][i][j];
+          }
+
           output[offset + j] = static_cast<OutputType>(
               std::max<int>(0, std::min<int>(127, sum)));
         }
@@ -192,108 +329,150 @@ namespace Eval::NNUE {
 
    private:
     // Calculate cumulative value without using difference calculation
-    void RefreshAccumulator(const Position& pos) const {
+    void refresh_accumulator(const Position& pos) const {
+
+  #ifdef VECTOR
+      // Gcc-10.2 unnecessarily spills AVX2 registers if this array
+      // is defined in the VECTOR code below, once in each branch
+      vec_t acc[kNumRegs];
+  #endif
       auto& accumulator = pos.state()->accumulator;
-      IndexType i = 0;
-      Features::IndexList active_indices[2];
-      RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
-                                       active_indices);
-      for (Color perspective : { WHITE, BLACK }) {
-        std::memcpy(accumulator.accumulation[perspective][i], biases_,
-                   kHalfDimensions * sizeof(BiasType));
-        for (const auto index : active_indices[perspective]) {
-          const IndexType offset = kHalfDimensions * index;
-  #if defined(USE_AVX512)
-          auto accumulation = reinterpret_cast<__m512i*>(
-              &accumulator.accumulation[perspective][i][0]);
-          auto column = reinterpret_cast<const __m512i*>(&weights_[offset]);
-          constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
-          for (IndexType j = 0; j < kNumChunks; ++j)
-            _mm512_storeA_si512(&accumulation[j], _mm512_add_epi16(_mm512_loadA_si512(&accumulation[j]), column[j]));
+      for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {  // one accumulation slice per refresh trigger
+        Features::IndexList active_indices[2];
+        RawFeatures::append_active_indices(pos, kRefreshTriggers[i],
+                                           active_indices);
+          for (Color perspective : { WHITE, BLACK }) {
+#ifdef VECTOR
+            for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {  // walk the slice in tiles of kTileHeight int16 values
+              auto accTile = reinterpret_cast<vec_t*>(
+                  &accumulator.accumulation[perspective][i][j * kTileHeight]);
 
-  #elif defined(USE_AVX2)
-          auto accumulation = reinterpret_cast<__m256i*>(
-              &accumulator.accumulation[perspective][i][0]);
-          auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j)
-            _mm256_storeA_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadA_si256(&accumulation[j]), column[j]));
+              if (i == 0) {  // only slice 0 starts from the biases; Transform() sums the slices, so the others start at zero
+                auto biasesTile = reinterpret_cast<const vec_t*>(
+                    &biases_[j * kTileHeight]);
+                for (IndexType k = 0; k < kNumRegs; ++k)
+                  acc[k] = biasesTile[k];
+              } else {
+                for (IndexType k = 0; k < kNumRegs; ++k)
+                  acc[k] = vec_zero;
+              }
 
-  #elif defined(USE_SSE2)
-          auto accumulation = reinterpret_cast<__m128i*>(
-              &accumulator.accumulation[perspective][i][0]);
-          auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j)
-            accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
+              for (const auto index : active_indices[perspective]) {
+                const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+                auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
 
-  #elif defined(USE_MMX)
-          auto accumulation = reinterpret_cast<__m64*>(
-              &accumulator.accumulation[perspective][i][0]);
-          auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j) {
-            accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
+                for (IndexType k = 0; k < kNumRegs; ++k)
+                  acc[k] = vec_add_16(acc[k], column[k]);  // add this active feature's weights for the current tile
+              }
+
+              for (IndexType k = 0; k < kNumRegs; k++)
+                vec_store(&accTile[k], acc[k]);  // write the finished tile back into the accumulator
+            }
+#else
+            if (i == 0) {
+              std::memcpy(accumulator.accumulation[perspective][i], biases_,
+                          kHalfDimensions * sizeof(BiasType));
+            } else {
+              std::memset(accumulator.accumulation[perspective][i], 0,
+                          kHalfDimensions * sizeof(BiasType));
+            }
+
+            for (const auto index : active_indices[perspective]) {
+              const IndexType offset = kHalfDimensions * index;
+
+              for (IndexType j = 0; j < kHalfDimensions; ++j)
+                accumulator.accumulation[perspective][i][j] += weights_[offset + j];
+            }
+#endif
           }
 
-  #elif defined(USE_NEON)
-          auto accumulation = reinterpret_cast<int16x8_t*>(
-              &accumulator.accumulation[perspective][i][0]);
-          auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j)
-            accumulation[j] = vaddq_s16(accumulation[j], column[j]);
-
-  #else
-          for (IndexType j = 0; j < kHalfDimensions; ++j)
-            accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-  #endif
-
         }
-      }
-  #if defined(USE_MMX)
-      _mm_empty();
-  #endif
 
-      accumulator.computed_accumulation = true;
-      accumulator.computed_score = false;
+#if defined(USE_MMX)
+        _mm_empty();
+#endif
+
+        accumulator.computed_accumulation = true;  // runs once, after the trigger loop (despite the indent): marks the accumulator valid
     }
 
     // Calculate cumulative value using difference calculation
-    void UpdateAccumulator(const Position& pos) const {
-      const auto prev_accumulator = pos.state()->previous->accumulator;
-      auto& accumulator = pos.state()->accumulator;
-      IndexType i = 0;
+    void update_accumulator(const Position& pos) const {
+
+  #ifdef VECTOR
+      // Gcc-10.2 unnecessarily spills AVX2 registers if this array
+      // is defined in the VECTOR code below, once in each branch
+      vec_t acc[kNumRegs];
+  #endif
+    const auto& prev_accumulator = pos.state()->previous->accumulator;
+    auto& accumulator = pos.state()->accumulator;
+    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
       Features::IndexList removed_indices[2], added_indices[2];
-      bool reset[2];
-      RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
-                                        removed_indices, added_indices, reset);
+      bool reset[2] = { false, false };
+      RawFeatures::append_changed_indices(pos, kRefreshTriggers[i],
+                                          removed_indices, added_indices, reset);
+
+#ifdef VECTOR
+      for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
+        for (Color perspective : { WHITE, BLACK }) {
+          auto accTile = reinterpret_cast<vec_t*>(
+              &accumulator.accumulation[perspective][i][j * kTileHeight]);
+
+          if (reset[perspective]) {
+            if (i == 0) {
+              auto biasesTile = reinterpret_cast<const vec_t*>(
+                  &biases_[j * kTileHeight]);
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = biasesTile[k];
+            } else {
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = vec_zero;
+            }
+          } else {
+            auto prevAccTile = reinterpret_cast<const vec_t*>(
+                &prev_accumulator.accumulation[perspective][i][j * kTileHeight]);
+
+            for (IndexType k = 0; k < kNumRegs; ++k)
+              acc[k] = vec_load(&prevAccTile[k]);
+
+            // Difference calculation for the deactivated features
+            for (const auto index : removed_indices[perspective]) {
+              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = vec_sub_16(acc[k], column[k]);
+            }
+          }
+
+          { // Difference calculation for the activated features
+            for (const auto index : added_indices[perspective]) {
+              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = vec_add_16(acc[k], column[k]);
+            }
+          }
+
+          for (IndexType k = 0; k < kNumRegs; ++k)
+            vec_store(&accTile[k], acc[k]);
+        }
+      }
+#if defined(USE_MMX)
+      _mm_empty();
+#endif
+
+#else
       for (Color perspective : { WHITE, BLACK }) {
 
-  #if defined(USE_AVX2)
-        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-        auto accumulation = reinterpret_cast<__m256i*>(
-            &accumulator.accumulation[perspective][i][0]);
-
-  #elif defined(USE_SSE2)
-        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-        auto accumulation = reinterpret_cast<__m128i*>(
-            &accumulator.accumulation[perspective][i][0]);
-
-  #elif defined(USE_MMX)
-        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-        auto accumulation = reinterpret_cast<__m64*>(
-            &accumulator.accumulation[perspective][i][0]);
-
-  #elif defined(USE_NEON)
-        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-        auto accumulation = reinterpret_cast<int16x8_t*>(
-            &accumulator.accumulation[perspective][i][0]);
-  #endif
-
         if (reset[perspective]) {
-          std::memcpy(accumulator.accumulation[perspective][i], biases_,
-                      kHalfDimensions * sizeof(BiasType));
+          if (i == 0) {
+            std::memcpy(accumulator.accumulation[perspective][i], biases_,
+                        kHalfDimensions * sizeof(BiasType));
+          } else {
+            std::memset(accumulator.accumulation[perspective][i], 0,
+                        kHalfDimensions * sizeof(BiasType));
+          }
         } else {
           std::memcpy(accumulator.accumulation[perspective][i],
                       prev_accumulator.accumulation[perspective][i],
@@ -302,83 +481,22 @@ namespace Eval::NNUE {
           for (const auto index : removed_indices[perspective]) {
             const IndexType offset = kHalfDimensions * index;
 
-  #if defined(USE_AVX2)
-            auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
-              accumulation[j] = _mm256_sub_epi16(accumulation[j], column[j]);
-            }
-
-  #elif defined(USE_SSE2)
-            auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
-              accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
-            }
-
-  #elif defined(USE_MMX)
-            auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
-              accumulation[j] = _mm_sub_pi16(accumulation[j], column[j]);
-            }
-
-  #elif defined(USE_NEON)
-            auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
-              accumulation[j] = vsubq_s16(accumulation[j], column[j]);
-            }
-
-  #else
-            for (IndexType j = 0; j < kHalfDimensions; ++j) {
-              accumulator.accumulation[perspective][i][j] -=
-                  weights_[offset + j];
-            }
-  #endif
-
+            for (IndexType j = 0; j < kHalfDimensions; ++j)
+              accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
           }
         }
         { // Difference calculation for the activated features
           for (const auto index : added_indices[perspective]) {
             const IndexType offset = kHalfDimensions * index;
 
-  #if defined(USE_AVX2)
-            auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
-              accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
-            }
-
-  #elif defined(USE_SSE2)
-            auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
-              accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
-            }
-
-  #elif defined(USE_MMX)
-            auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
-              accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
-            }
-
-  #elif defined(USE_NEON)
-            auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
-              accumulation[j] = vaddq_s16(accumulation[j], column[j]);
-            }
-
-  #else
-            for (IndexType j = 0; j < kHalfDimensions; ++j) {
-              accumulator.accumulation[perspective][i][j] +=
-                  weights_[offset + j];
-            }
-  #endif
-
+            for (IndexType j = 0; j < kHalfDimensions; ++j)
+              accumulator.accumulation[perspective][i][j] += weights_[offset + j];
           }
         }
       }
-  #if defined(USE_MMX)
-      _mm_empty();
-  #endif
-
+#endif
+      }
       accumulator.computed_accumulation = true;
-      accumulator.computed_score = false;
     }
 
     using BiasType = std::int16_t;
diff --git a/src/nnue/nnue_test_command.cpp b/src/nnue/nnue_test_command.cpp
index 311c5ded..d892222b 100644
--- a/src/nnue/nnue_test_command.cpp
+++ b/src/nnue/nnue_test_command.cpp
@@ -1,201 +1,215 @@
-﻿// USI extended command for NNUE evaluation function
-
-#if defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
-
-#include "../thread.h"
-#include "../uci.h"
-#include "evaluate_nnue.h"
+﻿#include "evaluate_nnue.h"
 #include "nnue_test_command.h"
 
+#include "thread.h"
+#include "uci.h"
+
 #include <set>
 #include <fstream>
 
-#define ASSERT(X) { if (!(X)) { std::cout << "\nError : ASSERT(" << #X << "), " << __FILE__ << "(" << __LINE__ << "): " << __func__ << std::endl; \
- std::this_thread::sleep_for(std::chrono::microseconds(3000)); *(int*)1 =0;} }
-
-namespace Eval {
-
-namespace NNUE {
-
-namespace {
-
-// Testing RawFeatures mainly for difference calculation
-void TestFeatures(Position& pos) {
-  const std::uint64_t num_games = 1000;
-  StateInfo si;
-  pos.set(StartFEN, false, &si, Threads.main());
-  const int MAX_PLY = 256; // test up to 256 hands
-
-  StateInfo state[MAX_PLY]; // StateInfo only for the maximum number of steps
-  int ply; // Trouble from the initial phase
-
-  PRNG prng(20171128);
-
-  std::uint64_t num_moves = 0;
-  std::vector<std::uint64_t> num_updates(kRefreshTriggers.size() + 1);
-  std::vector<std::uint64_t> num_resets(kRefreshTriggers.size());
-  constexpr IndexType kUnknown = -1;
-  std::vector<IndexType> trigger_map(RawFeatures::kDimensions, kUnknown);
-  auto make_index_sets = [&](const Position& pos) {
-    std::vector<std::vector<std::set<IndexType>>> index_sets(
-        kRefreshTriggers.size(), std::vector<std::set<IndexType>>(2));
-    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-      Features::IndexList active_indices[2];
-      RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
-                                       active_indices);
-      for (const auto perspective : Colors) {
-        for (const auto index : active_indices[perspective]) {
-          ASSERT(index < RawFeatures::kDimensions);
-          ASSERT(index_sets[i][perspective].count(index) == 0);
-          ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
-          index_sets[i][perspective].insert(index);
-          trigger_map[index] = i;
-        }
-      }
-    }
-    return index_sets;
-  };
-  auto update_index_sets = [&](const Position& pos, auto* index_sets) {
-    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-      Features::IndexList removed_indices[2], added_indices[2];
-      bool reset[2];
-      RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
-                                        removed_indices, added_indices, reset);
-      for (const auto perspective : Colors) {
-        if (reset[perspective]) {
-          (*index_sets)[i][perspective].clear();
-          ++num_resets[i];
-        } else {
-          for (const auto index : removed_indices[perspective]) {
-            ASSERT(index < RawFeatures::kDimensions);
-            ASSERT((*index_sets)[i][perspective].count(index) == 1);
-            ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
-            (*index_sets)[i][perspective].erase(index);
-            ++num_updates.back();
-            ++num_updates[i];
-            trigger_map[index] = i;
-          }
-        }
-        for (const auto index : added_indices[perspective]) {
-          ASSERT(index < RawFeatures::kDimensions);
-          ASSERT((*index_sets)[i][perspective].count(index) == 0);
-          ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
-          (*index_sets)[i][perspective].insert(index);
-          ++num_updates.back();
-          ++num_updates[i];
-          trigger_map[index] = i;
-        }
-      }
-    }
-  };
-
-  std::cout << "feature set: " << RawFeatures::GetName()
-            << "[" << RawFeatures::kDimensions << "]" << std::endl;
-  std::cout << "start testing with random games";
-
-  for (std::uint64_t i = 0; i < num_games; ++i) {
-    auto index_sets = make_index_sets(pos);
-    for (ply = 0; ply < MAX_PLY; ++ply) {
-      MoveList<LEGAL> mg(pos); // Generate all legal hands
-
-      // There was no legal move == Clog
-      if (mg.size() == 0)
-        break;
-
-      // Randomly choose from the generated moves and advance the phase with the moves.
-      Move m = mg.begin()[prng.rand(mg.size())];
-      pos.do_move(m, state[ply]);
-
-      ++num_moves;
-      update_index_sets(pos, &index_sets);
-      ASSERT(index_sets == make_index_sets(pos));
-    }
-
-    pos.set(StartFEN, false, &si, Threads.main());
-
-    // Output'.' every 100 times (so you can see that it's progressing)
-    if ((i % 100) == 0)
-      std::cout << "." << std::flush;
-  }
-  std::cout << "passed." << std::endl;
-  std::cout << num_games << " games, " << num_moves << " moves, "
-            << num_updates.back() << " updates, "
-            << (1.0 * num_updates.back() / num_moves)
-            << " updates per move" << std::endl;
-  std::size_t num_observed_indices = 0;
-  for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-    const auto count = std::count(trigger_map.begin(), trigger_map.end(), i);
-    num_observed_indices += count;
-    std::cout << "TriggerEvent(" << static_cast<int>(kRefreshTriggers[i])
-              << "): " << count << " features ("
-              << (100.0 * count / RawFeatures::kDimensions) << "%), "
-              << num_updates[i] << " updates ("
-              << (1.0 * num_updates[i] / num_moves) << " per move), "
-              << num_resets[i] << " resets ("
-              << (100.0 * num_resets[i] / num_moves) << "%)"
-              << std::endl;
-  }
-  std::cout << "observed " << num_observed_indices << " ("
-            << (100.0 * num_observed_indices / RawFeatures::kDimensions)
-            << "% of " << RawFeatures::kDimensions
-            << ") features" << std::endl;
+#define ASSERT(X) { \
+    if (!(X)) { \
+        std::cout \
+            << "\nError : ASSERT(" << #X << "), " \
+            << __FILE__ << "(" << __LINE__ << "): " \
+            << __func__ << std::endl; \
+            std::this_thread::sleep_for(std::chrono::microseconds(3000)); \
+            *(int*)1 =0; \
+    } \
 }
 
-// Output a string that represents the structure of the evaluation function
-void PrintInfo(std::istream& stream) {
-  std::cout << "network architecture: " << GetArchitectureString() << std::endl;
-
-  while (true) {
-    std::string file_name;
-    stream >> file_name;
-    if (file_name.empty()) break;
-
-    std::uint32_t hash_value;
-    std::string architecture;
-    const bool success = [&]() {
-      std::ifstream file_stream(file_name, std::ios::binary);
-      if (!file_stream) return false;
-      if (!ReadHeader(file_stream, &hash_value, &architecture)) return false;
-      return true;
-    }();
-
-    std::cout << file_name << ": ";
-    if (success) {
-      if (hash_value == kHashValue) {
-        std::cout << "matches with this binary";
-        if (architecture != GetArchitectureString()) {
-          std::cout << ", but architecture string differs: " << architecture;
-        }
-        std::cout << std::endl;
-      } else {
-        std::cout << architecture << std::endl;
-      }
-    } else {
-      std::cout << "failed to read header" << std::endl;
-    }
-  }
-}
-
-}  // namespace
-
 // USI extended command for NNUE evaluation function
-void TestCommand(Position& pos, std::istream& stream) {
-  std::string sub_command;
-  stream >> sub_command;
+namespace Eval::NNUE {
 
-  if (sub_command == "test_features") {
-    TestFeatures(pos);
-  } else if (sub_command == "info") {
-    PrintInfo(stream);
-  } else {
-    std::cout << "usage:" << std::endl;
-    std::cout << " test nnue test_features" << std::endl;
-    std::cout << " test nnue info [path/to/" << fileName << "...]" << std::endl;
-  }
-}
+    namespace {
 
-}  // namespace NNUE
+        // Tests RawFeatures on random games, mainly the difference (changed-index) calculation
+        void test_features(Position& pos) {
+            const std::uint64_t num_games = 1000;
+            StateInfo si;
+            pos.set(StartFEN, false, &si, Threads.main());
+            const int MAX_PLY = 256; // play at most 256 plies per game
 
-}  // namespace Eval
+            StateInfo state[MAX_PLY]; // one StateInfo per ply, up to MAX_PLY
+            int ply; // number of plies played from the start position
 
-#endif  // defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+            PRNG prng(20171128);
+
+            std::uint64_t num_moves = 0;
+            std::vector<std::uint64_t> num_updates(kRefreshTriggers.size() + 1);
+            std::vector<std::uint64_t> num_resets(kRefreshTriggers.size());
+            constexpr IndexType kUnknown = -1;
+            std::vector<IndexType> trigger_map(RawFeatures::kDimensions, kUnknown);
+
+            auto make_index_sets = [&](const Position& position) {
+                std::vector<std::vector<std::set<IndexType>>> index_sets(
+                    kRefreshTriggers.size(), std::vector<std::set<IndexType>>(2));
+
+                for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+                    Features::IndexList active_indices[2];
+                    RawFeatures::append_active_indices(position, kRefreshTriggers[i],
+                                                     active_indices);
+
+                    for (const auto perspective : Colors) {
+                        for (const auto index : active_indices[perspective]) {
+                            ASSERT(index < RawFeatures::kDimensions);
+                            ASSERT(index_sets[i][perspective].count(index) == 0);
+                            ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
+                            index_sets[i][perspective].insert(index);
+                            trigger_map[index] = i;
+                        }
+                    }
+                }
+
+                return index_sets;
+            };
+
+            auto update_index_sets = [&](const Position& position, auto* index_sets) {
+                for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+                    Features::IndexList removed_indices[2], added_indices[2];
+                    bool reset[2] = { false, false };
+                    RawFeatures::append_changed_indices(position, kRefreshTriggers[i],
+                                                      removed_indices, added_indices, reset);
+                    for (const auto perspective : Colors) {
+                        if (reset[perspective]) {
+                            (*index_sets)[i][perspective].clear();
+                            ++num_resets[i];
+                        } else {
+                            for (const auto index : removed_indices[perspective]) {
+                                ASSERT(index < RawFeatures::kDimensions);
+                                ASSERT((*index_sets)[i][perspective].count(index) == 1);
+                                ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
+                                (*index_sets)[i][perspective].erase(index);
+                                ++num_updates.back();
+                                ++num_updates[i];
+                                trigger_map[index] = i;
+                            }
+                        }
+
+                        for (const auto index : added_indices[perspective]) {
+                            ASSERT(index < RawFeatures::kDimensions);
+                            ASSERT((*index_sets)[i][perspective].count(index) == 0);
+                            ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
+                            (*index_sets)[i][perspective].insert(index);
+                            ++num_updates.back();
+                            ++num_updates[i];
+                            trigger_map[index] = i;
+                        }
+                    }
+                }
+            };
+
+            std::cout << "feature set: " << RawFeatures::get_name()
+                      << "[" << RawFeatures::kDimensions << "]" << std::endl;
+            std::cout << "start testing with random games";
+
+            for (std::uint64_t i = 0; i < num_games; ++i) {
+                auto index_sets = make_index_sets(pos);
+                for (ply = 0; ply < MAX_PLY; ++ply) {
+                    MoveList<LEGAL> mg(pos); // generate all legal moves
+
+                    // No legal move: the game is over (checkmate or stalemate)
+                    if (mg.size() == 0)
+                        break;
+
+                    // Pick one of the generated moves at random and play it on the board.
+                    Move m = mg.begin()[prng.rand(mg.size())];
+                    pos.do_move(m, state[ply]);
+
+                    ++num_moves;
+                    update_index_sets(pos, &index_sets);
+                    ASSERT(index_sets == make_index_sets(pos));
+                }
+
+                pos.set(StartFEN, false, &si, Threads.main());
+
+                // Print '.' every 100 games (so you can see that it's progressing)
+                if ((i % 100) == 0)
+                    std::cout << "." << std::flush;
+            }
+
+            std::cout << "passed." << std::endl;
+            std::cout << num_games << " games, " << num_moves << " moves, "
+                      << num_updates.back() << " updates, "
+                      << (1.0 * num_updates.back() / num_moves)
+                      << " updates per move" << std::endl;
+            std::size_t num_observed_indices = 0;
+
+            for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+                const auto count = std::count(trigger_map.begin(), trigger_map.end(), i);
+                num_observed_indices += count;
+                std::cout << "TriggerEvent(" << static_cast<int>(kRefreshTriggers[i])
+                          << "): " << count << " features ("
+                          << (100.0 * count / RawFeatures::kDimensions) << "%), "
+                          << num_updates[i] << " updates ("
+                          << (1.0 * num_updates[i] / num_moves) << " per move), "
+                          << num_resets[i] << " resets ("
+                          << (100.0 * num_resets[i] / num_moves) << "%)"
+                          << std::endl;
+            }
+            std::cout << "observed " << num_observed_indices << " ("
+                      << (100.0 * num_observed_indices / RawFeatures::kDimensions)
+                      << "% of " << RawFeatures::kDimensions
+                      << ") features" << std::endl;
+        }
+
+        // Prints this binary's network architecture, then, for every file name
+        // read from the stream, reports how that file's header compares to it.
+        void print_info(std::istream& stream) {
+            std::cout << "network architecture: " << get_architecture_string() << std::endl;
+
+            for (;;) {
+                std::string path;
+                stream >> path;
+                if (path.empty())
+                    break;
+
+                std::uint32_t file_hash;
+                std::string file_architecture;
+
+                // The header counts as read only if the file opens and read_header()
+                // succeeds; the stream is closed again before anything is printed.
+                bool header_ok = false;
+                {
+                    std::ifstream input(path, std::ios::binary);
+                    header_ok = input && read_header(input, &file_hash, &file_architecture);
+                }
+
+                std::cout << path << ": ";
+                if (!header_ok) {
+                    std::cout << "failed to read header" << std::endl;
+                    continue;
+                }
+
+                if (file_hash != kHashValue) {
+                    std::cout << file_architecture << std::endl;
+                    continue;
+                }
+
+                std::cout << "matches with this binary";
+                if (file_architecture != get_architecture_string())
+                    std::cout << ", but architecture string differs: " << file_architecture;
+
+                std::cout << std::endl;
+            }
+        }
+
+    }  // namespace
+
+    // Entry point of the NNUE test command: dispatches on the sub-command read from the stream
+    void test_command(Position& pos, std::istream& stream) {
+        std::string action;
+        stream >> action;
+
+        if (action == "test_features")
+            return test_features(pos);
+        if (action == "info")
+            return print_info(stream);
+
+        // Unknown or missing sub-command: print the usage text.
+        std::cout << "usage:" << std::endl;
+        std::cout << " test nnue test_features" << std::endl;
+        std::cout << " test nnue info [path/to/" << fileName << "...]" << std::endl;
+    }
+
+}  // namespace Eval::NNUE
diff --git a/src/nnue/nnue_test_command.h b/src/nnue/nnue_test_command.h
index 570ef01b..fcfe16f6 100644
--- a/src/nnue/nnue_test_command.h
+++ b/src/nnue/nnue_test_command.h
@@ -1,21 +1,12 @@
-﻿// USI extended command interface for NNUE evaluation function
-
-#ifndef _NNUE_TEST_COMMAND_H_
+﻿#ifndef _NNUE_TEST_COMMAND_H_
 #define _NNUE_TEST_COMMAND_H_
 
-#if defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+// USI extended command interface for NNUE evaluation function
+namespace Eval::NNUE {
 
-namespace Eval {
+    // USI extended command for NNUE evaluation function
+    void test_command(Position& pos, std::istream& stream);
 
-namespace NNUE {
-
-// USI extended command for NNUE evaluation function
-void TestCommand(Position& pos, std::istream& stream);
-
-}  // namespace NNUE
-
-}  // namespace Eval
-
-#endif  // defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+}  // namespace Eval::NNUE
 
 #endif
diff --git a/src/nnue/trainer/features/all_factorizers.h b/src/nnue/trainer/features/all_factorizers.h
new file mode 100644
index 00000000..75d62ec8
--- /dev/null
+++ b/src/nnue/trainer/features/all_factorizers.h
@@ -0,0 +1,10 @@
+#ifndef _NNUE_TRAINER_FEATURES_ALL_FACTORIZERS_H_
+#define _NNUE_TRAINER_FEATURES_ALL_FACTORIZERS_H_
+
+#include "factorizer.h"
+#include "factorizer_feature_set.h"
+
+#include "factorizer_half_kp.h"
+#include "factorizer_half_ka.h"
+
+#endif
diff --git a/src/nnue/trainer/features/factorizer.h b/src/nnue/trainer/features/factorizer.h
index 148ee8ec..b64b0c74 100644
--- a/src/nnue/trainer/features/factorizer.h
+++ b/src/nnue/trainer/features/factorizer.h
@@ -1,110 +1,117 @@
-﻿// NNUE evaluation function feature conversion class template
-
-#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_H_
+﻿#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_H_
 
-#if defined(EVAL_NNUE)
+#include "nnue/nnue_common.h"
 
-#include "../../nnue_common.h"
-#include "../trainer.h"
+#include "nnue/trainer/trainer.h"
 
-namespace Eval {
+// NNUE evaluation function feature conversion class template
+namespace Eval::NNUE::Features {
 
-namespace NNUE {
+    // Class template that converts input features into learning features
+    // By default, the learning feature is the same as the original input feature, and specialized as necessary
+    template <typename FeatureType>
+    class Factorizer {
+    public:
+        static constexpr std::string get_name() {
+            return "Factorizer<" + FeatureType::get_name() + "> -> " + std::string("No factorizer");
+        }
 
-namespace Features {
+        static constexpr std::string get_factorizers_string() {
+            return "  - " + get_name();
+        }
 
-// Class template that converts input features into learning features
-// By default, the learning feature is the same as the original input feature, and specialized as necessary
-template <typename FeatureType>
-class Factorizer {
- public:
-  // Get the dimensionality of the learning feature
-  static constexpr IndexType GetDimensions() {
-    return FeatureType::kDimensions;
-  }
+        // Get the dimensionality of the learning feature
+        static constexpr IndexType get_dimensions() {
+            return FeatureType::kDimensions;
+        }
 
-  // Get index of learning feature and scale of learning rate
-  static void AppendTrainingFeatures(
-      IndexType base_index, std::vector<TrainingFeature>* training_features) {
-    assert(base_index <FeatureType::kDimensions);
-    training_features->emplace_back(base_index);
-  }
-};
+        // Get index of learning feature and scale of learning rate
+        static void append_training_features(
+            IndexType base_index, std::vector<TrainingFeature>* training_features) {
 
-// Learning feature information
-struct FeatureProperties {
-  bool active;
-  IndexType dimensions;
-};
+            assert(base_index <FeatureType::kDimensions);
+            training_features->emplace_back(base_index);
+        }
+    };
 
-// Add the original input features to the learning features
-template <typename FeatureType>
-IndexType AppendBaseFeature(
-    FeatureProperties properties, IndexType base_index,
-    std::vector<TrainingFeature>* training_features) {
-  assert(properties.dimensions == FeatureType::kDimensions);
-  assert(base_index < FeatureType::kDimensions);
-  training_features->emplace_back(base_index);
-  return properties.dimensions;
-}
+    // Learning feature information
+    struct FeatureProperties {
+        bool active;
+        IndexType dimensions;
+    };
 
-// If the learning rate scale is not 0, inherit other types of learning features
-template <typename FeatureType>
-IndexType InheritFeaturesIfRequired(
-    IndexType index_offset, FeatureProperties properties, IndexType base_index,
-    std::vector<TrainingFeature>* training_features) {
-  if (!properties.active) {
-    return 0;
-  }
-  assert(properties.dimensions == Factorizer<FeatureType>::GetDimensions());
-  assert(base_index < FeatureType::kDimensions);
-  const auto start = training_features->size();
-  Factorizer<FeatureType>::AppendTrainingFeatures(
-      base_index, training_features);
-  for (auto i = start; i < training_features->size(); ++i) {
-    auto& feature = (*training_features)[i];
-    assert(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
-    feature.ShiftIndex(index_offset);
-  }
-  return properties.dimensions;
-}
+    // Add the original input features to the learning features
+    template <typename FeatureType>
+    IndexType append_base_feature(
+        FeatureProperties properties, IndexType base_index,
+        std::vector<TrainingFeature>* training_features) {
 
-// Return the index difference as needed, without adding learning features
-// Call instead of InheritFeaturesIfRequired() if there are no corresponding features
-IndexType SkipFeatures(FeatureProperties properties) {
-  if (!properties.active) {
-    return 0;
-  }
-  return properties.dimensions;
-}
-
-// Get the dimensionality of the learning feature
-template <std::size_t N>
-constexpr IndexType GetActiveDimensions(
-    const FeatureProperties (&properties)[N]) {
-  static_assert(N > 0, "");
-  IndexType dimensions = properties[0].dimensions;
-  for (std::size_t i = 1; i < N; ++i) {
-    if (properties[i].active) {
-      dimensions += properties[i].dimensions;
+        assert(properties.dimensions == FeatureType::kDimensions);
+        assert(base_index < FeatureType::kDimensions);
+        training_features->emplace_back(base_index);
+        return properties.dimensions;
     }
-  }
-  return dimensions;
-}
 
-// get the number of elements in the array
-template <typename T, std::size_t N>
-constexpr std::size_t GetArrayLength(const T (&/*array*/)[N]) {
-  return N;
-}
+    // If the learning rate scale is not 0, inherit other types of learning features
+    template <typename FeatureType>
+    IndexType inherit_features_if_required(
+        IndexType index_offset, FeatureProperties properties, IndexType base_index,
+        std::vector<TrainingFeature>* training_features) {
 
-}  // namespace Features
+        if (!properties.active) {
+            return 0;
+        }
 
-}  // namespace NNUE
+        assert(properties.dimensions == Factorizer<FeatureType>::get_dimensions());
+        assert(base_index < FeatureType::kDimensions);
 
-}  // namespace Eval
+        const auto start = training_features->size();
+        Factorizer<FeatureType>::append_training_features(
+            base_index, training_features);
 
-#endif  // defined(EVAL_NNUE)
+        for (auto i = start; i < training_features->size(); ++i) {
+            auto& feature = (*training_features)[i];
+            assert(feature.get_index() < Factorizer<FeatureType>::get_dimensions());
+            feature.shift_index(index_offset);
+        }
+
+        return properties.dimensions;
+    }
+
+    // Return the index difference as needed, without adding learning features.
+    // Call instead of inherit_features_if_required() if there are no corresponding features.
+    // Marked inline because this non-template function is defined in a header (ODR).
+    inline IndexType skip_features(FeatureProperties properties) {
+        if (!properties.active)
+            return 0;
+        return properties.dimensions;
+    }
+
+    // Total dimensionality of the learning features described by `properties`.
+    // Entry 0 is always counted; each later entry contributes only when it is
+    // flagged active.
+    template <std::size_t N>
+    constexpr IndexType get_active_dimensions(
+        const FeatureProperties (&properties)[N]) {
+
+        static_assert(N > 0, "");
+
+        IndexType total = properties[0].dimensions;
+        for (std::size_t k = 1; k < N; ++k) {
+            const FeatureProperties& entry = properties[k];
+            total += entry.active ? entry.dimensions : IndexType(0);
+        }
+
+        return total;
+    }
+
+    // get the number of elements in the array
+    template <typename T, std::size_t N>
+    constexpr std::size_t get_array_length(const T (&/*array*/)[N]) {
+        return N;
+    }
+
+}  // namespace Eval::NNUE::Features
 
 #endif
diff --git a/src/nnue/trainer/features/factorizer_feature_set.h b/src/nnue/trainer/features/factorizer_feature_set.h
index af524719..60f42166 100644
--- a/src/nnue/trainer/features/factorizer_feature_set.h
+++ b/src/nnue/trainer/features/factorizer_feature_set.h
@@ -1,104 +1,121 @@
-﻿// Specialization for feature set of feature conversion class template of NNUE evaluation function
-
-#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
+﻿#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
 
-#if defined(EVAL_NNUE)
-
-#include "../../features/feature_set.h"
 #include "factorizer.h"
 
-namespace Eval {
+#include "nnue/features/feature_set.h"
 
-namespace NNUE {
+// Specialization for feature set of feature conversion class template of NNUE evaluation function
+namespace Eval::NNUE::Features {
 
-namespace Features {
+    // Class template that converts input features into learning features
+    // Specialization for FeatureSet
+    template <typename FirstFeatureType, typename... RemainingFeatureTypes>
+    class Factorizer<FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
+    private:
+        using Head = Factorizer<FeatureSet<FirstFeatureType>>;
+        using Tail = Factorizer<FeatureSet<RemainingFeatureTypes...>>;
 
-// Class template that converts input features into learning features
-// Specialization for FeatureSet
-template <typename FirstFeatureType, typename... RemainingFeatureTypes>
-class Factorizer<FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
- private:
-  using Head = Factorizer<FeatureSet<FirstFeatureType>>;
-  using Tail = Factorizer<FeatureSet<RemainingFeatureTypes...>>;
+    public:
+        // number of dimensions of original input features
+        static constexpr IndexType kBaseDimensions =
+            FeatureSet<FirstFeatureType, RemainingFeatureTypes...>::kDimensions;
 
- public:
-  // number of dimensions of original input features
-  static constexpr IndexType kBaseDimensions =
-      FeatureSet<FirstFeatureType, RemainingFeatureTypes...>::kDimensions;
-
-  // Get the dimensionality of the learning feature
-  static constexpr IndexType GetDimensions() {
-    return Head::GetDimensions() + Tail::GetDimensions();
-  }
-
-  // Get index of learning feature and scale of learning rate
-  static void AppendTrainingFeatures(
-      IndexType base_index, std::vector<TrainingFeature>* training_features,
-      IndexType base_dimensions = kBaseDimensions) {
-    assert(base_index < kBaseDimensions);
-    constexpr auto boundary = FeatureSet<RemainingFeatureTypes...>::kDimensions;
-    if (base_index < boundary) {
-      Tail::AppendTrainingFeatures(
-          base_index, training_features, base_dimensions);
-    } else {
-      const auto start = training_features->size();
-      Head::AppendTrainingFeatures(
-          base_index - boundary, training_features, base_dimensions);
-      for (auto i = start; i < training_features->size(); ++i) {
-        auto& feature = (*training_features)[i];
-        const auto index = feature.GetIndex();
-        assert(index < Head::GetDimensions() ||
-                   (index >= base_dimensions &&
-                    index < base_dimensions +
-                            Head::GetDimensions() - Head::kBaseDimensions));
-        if (index < Head::kBaseDimensions) {
-          feature.ShiftIndex(Tail::kBaseDimensions);
-        } else {
-          feature.ShiftIndex(Tail::GetDimensions() - Tail::kBaseDimensions);
+        static constexpr std::string get_factorizers_string() {
+            std::string str = "  - ";
+            str += Head::get_name();
+            str += '\n';
+            str += Tail::get_factorizers_string();
+            return str;
         }
-      }
-    }
-  }
-};
 
-// Class template that converts input features into learning features
-// Specialization when FeatureSet has one template argument
-template <typename FeatureType>
-class Factorizer<FeatureSet<FeatureType>> {
-public:
-  // number of dimensions of original input features
-  static constexpr IndexType kBaseDimensions = FeatureType::kDimensions;
+        // Get the dimensionality of the learning feature
+        static constexpr IndexType get_dimensions() {
+            return Head::get_dimensions() + Tail::get_dimensions();
+        }
 
-  // Get the dimensionality of the learning feature
-  static constexpr IndexType GetDimensions() {
-    return Factorizer<FeatureType>::GetDimensions();
-  }
+        // Get index of learning feature and scale of learning rate
+        static void append_training_features(
+            IndexType base_index, std::vector<TrainingFeature>* training_features,
+            IndexType base_dimensions = kBaseDimensions) {
 
-  // Get index of learning feature and scale of learning rate
-  static void AppendTrainingFeatures(
-      IndexType base_index, std::vector<TrainingFeature>* training_features,
-      IndexType base_dimensions = kBaseDimensions) {
-    assert(base_index < kBaseDimensions);
-    const auto start = training_features->size();
-    Factorizer<FeatureType>::AppendTrainingFeatures(
-        base_index, training_features);
-    for (auto i = start; i < training_features->size(); ++i) {
-      auto& feature = (*training_features)[i];
-      assert(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
-      if (feature.GetIndex() >= kBaseDimensions) {
-        feature.ShiftIndex(base_dimensions - kBaseDimensions);
-      }
-    }
-  }
-};
+            assert(base_index < kBaseDimensions);
 
-}  // namespace Features
+            constexpr auto boundary = FeatureSet<RemainingFeatureTypes...>::kDimensions;
 
-}  // namespace NNUE
+            if (base_index < boundary) {
+                Tail::append_training_features(
+                    base_index, training_features, base_dimensions);
+            }
+            else {
+                const auto start = training_features->size();
 
-}  // namespace Eval
+                Head::append_training_features(
+                    base_index - boundary, training_features, base_dimensions);
 
-#endif  // defined(EVAL_NNUE)
+                for (auto i = start; i < training_features->size(); ++i) {
+                    auto& feature = (*training_features)[i];
+                    const auto index = feature.get_index();
+
+                    assert(index < Head::get_dimensions() ||
+                               (index >= base_dimensions &&
+                                index < base_dimensions +
+                                        Head::get_dimensions() - Head::kBaseDimensions));
+
+                    if (index < Head::kBaseDimensions) {
+                        feature.shift_index(Tail::kBaseDimensions);
+                    }
+                    else {
+                        feature.shift_index(Tail::get_dimensions() - Tail::kBaseDimensions);
+                    }
+                }
+            }
+        }
+    };
+
+    // Class template that converts input features into learning features
+    // Specialization when FeatureSet has one template argument
+    template <typename FeatureType>
+    class Factorizer<FeatureSet<FeatureType>> {
+    public:
+        // number of dimensions of original input features
+        static constexpr IndexType kBaseDimensions = FeatureType::kDimensions;
+
+        static constexpr std::string get_name() {
+            return Factorizer<FeatureType>::get_name();
+        }
+
+        static constexpr std::string get_factorizers_string() {
+            return "  - " + get_name();
+        }
+
+        // Get the dimensionality of the learning feature
+        static constexpr IndexType get_dimensions() {
+            return Factorizer<FeatureType>::get_dimensions();
+        }
+
+        // Get index of learning feature and scale of learning rate
+        static void append_training_features(
+            IndexType base_index, std::vector<TrainingFeature>* training_features,
+            IndexType base_dimensions = kBaseDimensions) {
+
+            assert(base_index < kBaseDimensions);
+
+            const auto start = training_features->size();
+
+            Factorizer<FeatureType>::append_training_features(
+                base_index, training_features);
+
+            for (auto i = start; i < training_features->size(); ++i) {
+                auto& feature = (*training_features)[i];
+                assert(feature.get_index() < Factorizer<FeatureType>::get_dimensions());
+                if (feature.get_index() >= kBaseDimensions) {
+                    feature.shift_index(base_dimensions - kBaseDimensions);
+                }
+            }
+        }
+    };
+
+}  // namespace Eval::NNUE::Features
 
 #endif
diff --git a/src/nnue/trainer/features/factorizer_half_ka.h b/src/nnue/trainer/features/factorizer_half_ka.h
new file mode 100644
index 00000000..36d36a2d
--- /dev/null
+++ b/src/nnue/trainer/features/factorizer_half_ka.h
@@ -0,0 +1,93 @@
+﻿#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_
+#define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_
+
+#include "factorizer.h"
+
+#include "nnue/features/half_ka.h"
+#include "nnue/features/a.h"
+#include "nnue/features/half_relative_ka.h"
+
+// Specialization of NNUE evaluation function feature conversion class template for HalfKA
+namespace Eval::NNUE::Features {
+
+    // Class template that converts input features into learning features.
+    // Specialization for HalfKA: each HalfKA index also yields A and HalfRelativeKA learning features.
+    template <Side AssociatedKing>
+    class Factorizer<HalfKA<AssociatedKing>> {
+    private:
+        using FeatureType = HalfKA<AssociatedKing>;
+
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions =
+            FeatureType::kMaxActiveDimensions;
+
+        // Type of learning feature; the enumerators index kProperties below
+        enum TrainingFeatureType {
+            kFeaturesHalfKA,
+            kFeaturesA,
+            kFeaturesHalfRelativeKA,
+            kNumTrainingFeatureTypes,
+        };
+
+        // Learning feature information, one entry per TrainingFeatureType, in enumerator order
+        static constexpr FeatureProperties kProperties[] = {
+            // kFeaturesHalfKA
+            {true, FeatureType::kDimensions},
+            // kFeaturesA
+            {true, Factorizer<A>::get_dimensions()},
+            // kFeaturesHalfRelativeKA
+            {true, Factorizer<HalfRelativeKA<AssociatedKing>>::get_dimensions()},
+        };
+
+        static_assert(get_array_length(kProperties) == kNumTrainingFeatureTypes, "");
+
+    public:
+        static constexpr std::string get_name() {  // "Factorizer<" + kName + "> -> A, HalfRelativeKA"
+            return std::string("Factorizer<") + FeatureType::kName + "> -> " + "A, HalfRelativeKA";
+        }
+
+        static constexpr std::string get_factorizers_string() {  // get_name() prefixed with "  - "
+            return "  - " + get_name();
+        }
+
+        // Get the dimensionality of the learning feature, computed from kProperties by get_active_dimensions()
+        static constexpr IndexType get_dimensions() {
+            return get_active_dimensions(kProperties);
+        }
+
+        // Appends to *training_features the learning features derived from the HalfKA index base_index
+        static void append_training_features(
+            IndexType base_index, std::vector<TrainingFeature>* training_features) {
+
+            // kFeaturesHalfKA
+            IndexType index_offset = append_base_feature<FeatureType>(
+                kProperties[kFeaturesHalfKA], base_index, training_features);
+
+            const auto sq_k = static_cast<Square>(base_index / PS_END2);  // quotient by PS_END2, used as the king square
+            const auto a = static_cast<IndexType>(base_index % PS_END2);  // remainder, passed on as the A index
+
+            // kFeaturesA
+            index_offset += inherit_features_if_required<A>(
+                index_offset, kProperties[kFeaturesA], a, training_features);
+
+            // kFeaturesHalfRelativeKA: inherited only when a >= PS_W_PAWN, otherwise its index range is skipped
+            if (a >= PS_W_PAWN) {
+                index_offset += inherit_features_if_required<HalfRelativeKA<AssociatedKing>>(
+                    index_offset, kProperties[kFeaturesHalfRelativeKA],
+                    HalfRelativeKA<AssociatedKing>::make_index(sq_k, a),
+                    training_features);
+            }
+            else {
+                index_offset += skip_features(kProperties[kFeaturesHalfRelativeKA]);
+            }
+
+            assert(index_offset == get_dimensions());  // every feature type must have advanced the offset
+        }
+    };
+
+    template <Side AssociatedKing>  // out-of-class definition of the static constexpr array declared in the class
+    constexpr FeatureProperties Factorizer<HalfKA<AssociatedKing>>::kProperties[];
+
+}  // namespace Eval::NNUE::Features
+
+#endif // #ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_
diff --git a/src/nnue/trainer/features/factorizer_half_kp.h b/src/nnue/trainer/features/factorizer_half_kp.h
index 955894e8..c554f0fc 100644
--- a/src/nnue/trainer/features/factorizer_half_kp.h
+++ b/src/nnue/trainer/features/factorizer_half_kp.h
@@ -1,103 +1,104 @@
-﻿// Specialization of NNUE evaluation function feature conversion class template for HalfKP
-
-#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
+﻿#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
 
-#if defined(EVAL_NNUE)
-
-#include "../../features/half_kp.h"
-#include "../../features/p.h"
-#include "../../features/half_relative_kp.h"
 #include "factorizer.h"
 
-namespace Eval {
+#include "nnue/features/half_kp.h"
+#include "nnue/features/p.h"
+#include "nnue/features/half_relative_kp.h"
 
-namespace NNUE {
+// Specialization of NNUE evaluation function feature conversion class template for HalfKP
+namespace Eval::NNUE::Features {
 
-namespace Features {
+    // Class template that converts input features into learning features
+    // Specialization for HalfKP
+    template <Side AssociatedKing>
+    class Factorizer<HalfKP<AssociatedKing>> {
+    private:
+        using FeatureType = HalfKP<AssociatedKing>;
 
-// Class template that converts input features into learning features
-// Specialization for HalfKP
-template <Side AssociatedKing>
-class Factorizer<HalfKP<AssociatedKing>> {
- private:
-  using FeatureType = HalfKP<AssociatedKing>;
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions =
+            FeatureType::kMaxActiveDimensions;
 
-  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-  static constexpr IndexType kMaxActiveDimensions =
-      FeatureType::kMaxActiveDimensions;
+        // Type of learning feature
+        enum TrainingFeatureType {
+            kFeaturesHalfKP,
+            kFeaturesHalfK,
+            kFeaturesP,
+            kFeaturesHalfRelativeKP,
+            kNumTrainingFeatureTypes,
+        };
 
-  // Type of learning feature
-  enum TrainingFeatureType {
-    kFeaturesHalfKP,
-    kFeaturesHalfK,
-    kFeaturesP,
-    kFeaturesHalfRelativeKP,
-    kNumTrainingFeatureTypes,
-  };
+        // Learning feature information
+        static constexpr FeatureProperties kProperties[] = {
+            // kFeaturesHalfKP
+            {true, FeatureType::kDimensions},
+            // kFeaturesHalfK
+            {true, SQUARE_NB},
+            // kFeaturesP
+            {true, Factorizer<P>::get_dimensions()},
+            // kFeaturesHalfRelativeKP
+            {true, Factorizer<HalfRelativeKP<AssociatedKing>>::get_dimensions()},
+        };
 
-  // Learning feature information
-  static constexpr FeatureProperties kProperties[] = {
-    // kFeaturesHalfKP
-    {true, FeatureType::kDimensions},
-    // kFeaturesHalfK
-    {true, SQUARE_NB},
-    // kFeaturesP
-    {true, Factorizer<P>::GetDimensions()},
-    // kFeaturesHalfRelativeKP
-    {true, Factorizer<HalfRelativeKP<AssociatedKing>>::GetDimensions()},
-  };
-  static_assert(GetArrayLength(kProperties) == kNumTrainingFeatureTypes, "");
+        static_assert(get_array_length(kProperties) == kNumTrainingFeatureTypes, "");
 
- public:
-  // Get the dimensionality of the learning feature
-  static constexpr IndexType GetDimensions() {
-    return GetActiveDimensions(kProperties);
-  }
+    public:
+        static constexpr std::string get_name() {
+            return std::string("Factorizer<") + FeatureType::kName + "> -> " + "HalfK, P, HalfRelativeKP";
+        }
 
-  // Get index of learning feature and scale of learning rate
-  static void AppendTrainingFeatures(
-      IndexType base_index, std::vector<TrainingFeature>* training_features) {
-    // kFeaturesHalfKP
-    IndexType index_offset = AppendBaseFeature<FeatureType>(
-        kProperties[kFeaturesHalfKP], base_index, training_features);
+        static constexpr std::string get_factorizers_string() {
+            return "  - " + get_name();
+        }
 
-    const auto sq_k = static_cast<Square>(base_index / PS_END);
-    const auto p = static_cast<IndexType>(base_index % PS_END);
-    // kFeaturesHalfK
-    {
-      const auto& properties = kProperties[kFeaturesHalfK];
-      if (properties.active) {
-        training_features->emplace_back(index_offset + sq_k);
-        index_offset += properties.dimensions;
-      }
-    }
-    // kFeaturesP
-    index_offset += InheritFeaturesIfRequired<P>(
-        index_offset, kProperties[kFeaturesP], p, training_features);
-    // kFeaturesHalfRelativeKP
-    if (p >= PS_W_PAWN) {
-      index_offset += InheritFeaturesIfRequired<HalfRelativeKP<AssociatedKing>>(
-          index_offset, kProperties[kFeaturesHalfRelativeKP],
-          HalfRelativeKP<AssociatedKing>::MakeIndex(sq_k, p),
-          training_features);
-    } else {
-      index_offset += SkipFeatures(kProperties[kFeaturesHalfRelativeKP]);
-    }
+        // Get the dimensionality of the learning feature
+        static constexpr IndexType get_dimensions() {
+            return get_active_dimensions(kProperties);
+        }
 
-    assert(index_offset == GetDimensions());
-  }
-};
+        // Get index of learning feature and scale of learning rate
+        static void append_training_features(
+            IndexType base_index, std::vector<TrainingFeature>* training_features) {
 
-template <Side AssociatedKing>
-constexpr FeatureProperties Factorizer<HalfKP<AssociatedKing>>::kProperties[];
+            // kFeaturesHalfKP
+            IndexType index_offset = append_base_feature<FeatureType>(
+                kProperties[kFeaturesHalfKP], base_index, training_features);
 
-}  // namespace Features
+            const auto sq_k = static_cast<Square>(base_index / PS_END);
+            const auto p = static_cast<IndexType>(base_index % PS_END);
 
-}  // namespace NNUE
+            // kFeaturesHalfK
+            {
+                const auto& properties = kProperties[kFeaturesHalfK];
+                if (properties.active) {
+                    training_features->emplace_back(index_offset + sq_k);
+                    index_offset += properties.dimensions;
+                }
+            }
 
-}  // namespace Eval
+            // kFeaturesP
+            index_offset += inherit_features_if_required<P>(
+                index_offset, kProperties[kFeaturesP], p, training_features);
+            // kFeaturesHalfRelativeKP
+            if (p >= PS_W_PAWN) {
+                index_offset += inherit_features_if_required<HalfRelativeKP<AssociatedKing>>(
+                    index_offset, kProperties[kFeaturesHalfRelativeKP],
+                    HalfRelativeKP<AssociatedKing>::make_index(sq_k, p),
+                    training_features);
+            }
+            else {
+                index_offset += skip_features(kProperties[kFeaturesHalfRelativeKP]);
+            }
 
-#endif  // defined(EVAL_NNUE)
+            assert(index_offset == get_dimensions());
+        }
+    };
+
+    template <Side AssociatedKing>
+    constexpr FeatureProperties Factorizer<HalfKP<AssociatedKing>>::kProperties[];
+
+}  // namespace Eval::NNUE::Features
 
 #endif
diff --git a/src/nnue/trainer/trainer.h b/src/nnue/trainer/trainer.h
index 4b467041..973bc898 100644
--- a/src/nnue/trainer/trainer.h
+++ b/src/nnue/trainer/trainer.h
@@ -1,125 +1,122 @@
-﻿// Common header of class template for learning NNUE evaluation function
-
-#ifndef _NNUE_TRAINER_H_
+﻿#ifndef _NNUE_TRAINER_H_
 #define _NNUE_TRAINER_H_
 
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
-
-#include "../nnue_common.h"
-#include "../features/index_list.h"
+#include "nnue/nnue_common.h"
+#include "nnue/features/index_list.h"
 
 #include <sstream>
+
 #if defined(USE_BLAS)
 static_assert(std::is_same<LearnFloatType, float>::value, "");
 #include <cblas.h>
 #endif
 
-namespace Eval {
+// Common header of class template for learning NNUE evaluation function
+namespace Eval::NNUE {
 
-namespace NNUE {
+    // Ponanza constant used in the relation between evaluation value and winning percentage
+    constexpr double kPonanzaConstant = 600.0;
 
-// Ponanza constant used in the relation between evaluation value and winning percentage
-constexpr double kPonanzaConstant = 600.0;
+    // Class that represents one index of learning feature
+    class TrainingFeature {
+        using StorageType = std::uint32_t;
+        static_assert(std::is_unsigned<StorageType>::value, "");
 
-// Class that represents one index of learning feature
-class TrainingFeature {
-  using StorageType = std::uint32_t;
-  static_assert(std::is_unsigned<StorageType>::value, "");
+    public:
+        static constexpr std::uint32_t kIndexBits = 24;
 
- public:
-  static constexpr std::uint32_t kIndexBits = 24;
-  static_assert(kIndexBits < std::numeric_limits<StorageType>::digits, "");
-  static constexpr std::uint32_t kCountBits =
-      std::numeric_limits<StorageType>::digits - kIndexBits;
+        static_assert(kIndexBits < std::numeric_limits<StorageType>::digits, "");
 
-  explicit TrainingFeature(IndexType index) :
-      index_and_count_((index << kCountBits) | 1) {
-    assert(index < (1 << kIndexBits));
-  }
-  TrainingFeature& operator+=(const TrainingFeature& other) {
-    assert(other.GetIndex() == GetIndex());
-    assert(other.GetCount() + GetCount() < (1 << kCountBits));
-    index_and_count_ += other.GetCount();
-    return *this;
-  }
-  IndexType GetIndex() const {
-    return static_cast<IndexType>(index_and_count_ >> kCountBits);
-  }
-  void ShiftIndex(IndexType offset) {
-    assert(GetIndex() + offset < (1 << kIndexBits));
-    index_and_count_ += offset << kCountBits;
-  }
-  IndexType GetCount() const {
-    return static_cast<IndexType>(index_and_count_ & ((1 << kCountBits) - 1));
-  }
-  bool operator<(const TrainingFeature& other) const {
-    return index_and_count_ < other.index_and_count_;
-  }
+        static constexpr std::uint32_t kCountBits =
+            std::numeric_limits<StorageType>::digits - kIndexBits;
 
- private:
-  StorageType index_and_count_;
-};
+        explicit TrainingFeature(IndexType index) :
+            index_and_count_((index << kCountBits) | 1) {
 
-// Structure that represents one sample of training data
-struct Example {
-  std::vector<TrainingFeature> training_features[2];
-  Learner::PackedSfenValue psv;
-  int sign;
-  double weight;
-};
+            assert(index < (1 << kIndexBits));
+        }
 
-// Message used for setting hyperparameters
-struct Message {
-  Message(const std::string& name, const std::string& value = ""):
-      name(name), value(value), num_peekers(0), num_receivers(0) {}
-  const std::string name;
-  const std::string value;
-  std::uint32_t num_peekers;
-  std::uint32_t num_receivers;
-};
+        TrainingFeature& operator+=(const TrainingFeature& other) {
+            assert(other.get_index() == get_index());
+            assert(other.get_count() + get_count() < (1 << kCountBits));
+            index_and_count_ += other.get_count();
+            return *this;
+        }
 
-// determine whether to accept the message
-bool ReceiveMessage(const std::string& name, Message* message) {
-  const auto subscript = "[" + std::to_string(message->num_peekers) + "]";
-  if (message->name.substr(0, name.size() + 1) == name + "[") {
-    ++message->num_peekers;
-  }
-  if (message->name == name || message->name == name + subscript) {
-    ++message->num_receivers;
-    return true;
-  }
-  return false;
-}
+        IndexType get_index() const {
+            return static_cast<IndexType>(index_and_count_ >> kCountBits);
+        }
 
-// split the string
-std::vector<std::string> Split(const std::string& input, char delimiter) {
-  std::istringstream stream(input);
-  std::string field;
-  std::vector<std::string> fields;
-  while (std::getline(stream, field, delimiter)) {
-    fields.push_back(field);
-  }
-  return fields;
-}
+        void shift_index(IndexType offset) {
+            assert(get_index() + offset < (1 << kIndexBits));
+            index_and_count_ += offset << kCountBits;
+        }
 
-// round a floating point number to an integer
-template <typename IntType>
-IntType Round(double value) {
-  return static_cast<IntType>(std::floor(value + 0.5));
-}
+        IndexType get_count() const {
+            return static_cast<IndexType>(index_and_count_ & ((1 << kCountBits) - 1));
+        }
 
-// make_shared with alignment
-template <typename T, typename... ArgumentTypes>
-std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments) {
-  const auto ptr = new(std_aligned_alloc(alignof(T), sizeof(T)))
-      T(std::forward<ArgumentTypes>(arguments)...);
-  return std::shared_ptr<T>(ptr, AlignedDeleter<T>());
-}
+        bool operator<(const TrainingFeature& other) const {
+            return index_and_count_ < other.index_and_count_;
+        }
 
-}  // namespace NNUE
+    private:
+        StorageType index_and_count_;
+    };
 
-}  // namespace Eval
+    // Structure that represents one sample of training data (element type of the batches handed to the trainers)
+    struct Example {
+        std::vector<TrainingFeature> training_features[2];  // two feature lists — presumably one per perspective; TODO confirm
+        Learner::PackedSfenValue psv;
+        Value discrete_nn_eval;
+        int sign;
+        double weight;
+    };
 
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+    // Message used for setting hyperparameters: a (name, value) pair plus two delivery counters
+    struct Message {
+        Message(const std::string& message_name, const std::string& message_value = "") :
+            name(message_name), value(message_value), num_peekers(0), num_receivers(0)
+        {
+        }
+
+        const std::string name;  // option name; receive_message() also understands the subscripted form "name[k]"
+        const std::string value;  // option value as text; empty string when not supplied
+        std::uint32_t num_peekers;  // bumped by receive_message() when a subscripted name's prefix matches the receiver
+        std::uint32_t num_receivers;  // bumped by receive_message() each time the message is accepted
+    };
+
+    // Determine whether the receiver called `name` accepts `message`; returns true (and bumps num_receivers) if it does.
+    inline bool receive_message(const std::string& name, Message* message) {  // inline: defined in a header, avoids duplicate symbols
+        const auto subscript = "[" + std::to_string(message->num_peekers) + "]";  // built from the count before the increment below
+        // A subscripted message ("name[k]") counts every receiver of that name that inspects it,
+        if (message->name.substr(0, name.size() + 1) == name + "[") {
+            ++message->num_peekers;
+        }
+        // so "name[k]" is accepted only by the (k+1)-th such receiver, while a bare "name" is accepted by all of them.
+        if (message->name == name || message->name == name + subscript) {
+            ++message->num_receivers;
+            return true;
+        }
+
+        return false;
+    }
+
+    // Round a floating point number to the nearest integer (halfway cases go toward +infinity) and cast it to IntType
+    template <typename IntType>
+    IntType round(double value) {
+        return static_cast<IntType>(std::floor(value + 0.5));  // floor(x + 0.5): 2.5 -> 3, -2.5 -> -2
+    }
+
+    // make_shared with alignment: storage comes from std_aligned_alloc(alignof(T), sizeof(T)), release goes through AlignedDeleter<T>
+    template <typename T, typename... ArgumentTypes>
+    std::shared_ptr<T> make_aligned_shared_ptr(ArgumentTypes&&... arguments) {
+        const auto ptr = new(std_aligned_alloc(alignof(T), sizeof(T)))  // placement-new into the aligned block
+            T(std::forward<ArgumentTypes>(arguments)...);  // NOTE(review): the allocation result is not checked before construction
+
+        return std::shared_ptr<T>(ptr, AlignedDeleter<T>());
+    }
+
+}  // namespace Eval::NNUE
 
 #endif
diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index db56c1c0..53e8f904 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -1,301 +1,476 @@
-﻿// Specialization of NNUE evaluation function learning class template for AffineTransform
-
-#ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_
+﻿#ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_
 #define _NNUE_TRAINER_AFFINE_TRANSFORM_H_
 
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
-
-#include "../../learn/learn.h"
-#include "../layers/affine_transform.h"
 #include "trainer.h"
 
+#include "extra/stockfish_blas.h"
+
+#include "learn/learn.h"
+
+#include "nnue/layers/affine_transform.h"
+
+#include "thread.h"
+
 #include <random>
 
-namespace Eval {
+// Specialization of NNUE evaluation function learning class template for AffineTransform
+namespace Eval::NNUE {
 
-namespace NNUE {
+    // Learning: Affine transformation layer
+    template <typename PreviousLayer, IndexType OutputDimensions>
+    class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
+    private:
+        // Type of layer to learn
+        using LayerType = Layers::AffineTransform<PreviousLayer, OutputDimensions>;
 
-// Learning: Affine transformation layer
-template <typename PreviousLayer, IndexType OutputDimensions>
-class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
- private:
-  // Type of layer to learn
-  using LayerType = Layers::AffineTransform<PreviousLayer, OutputDimensions>;
+    public:
+        // factory function
+        static std::shared_ptr<Trainer> create(
+            LayerType* target_layer, FeatureTransformer* ft) {
 
- public:
-  // factory function
-  static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* feature_transformer) {
-    return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, feature_transformer));
-  }
-
-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    previous_layer_trainer_->SendMessage(message);
-    if (ReceiveMessage("momentum", message)) {
-      momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
-    }
-    if (ReceiveMessage("learning_rate_scale", message)) {
-      learning_rate_scale_ =
-          static_cast<LearnFloatType>(std::stod(message->value));
-    }
-    if (ReceiveMessage("reset", message)) {
-      DequantizeParameters();
-    }
-    if (ReceiveMessage("quantize_parameters", message)) {
-      QuantizeParameters();
-    }
-  }
-
-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    previous_layer_trainer_->Initialize(rng);
-    if (kIsOutputLayer) {
-      // Initialize output layer with 0
-      std::fill(std::begin(biases_), std::end(biases_),
-                static_cast<LearnFloatType>(0.0));
-      std::fill(std::begin(weights_), std::end(weights_),
-                static_cast<LearnFloatType>(0.0));
-    } else {
-      // Assuming that the input distribution is unit-mean 0.5, equal variance,
-      // Initialize the output distribution so that each unit has a mean of 0.5 and the same variance as the input
-      const double kSigma = 1.0 / std::sqrt(kInputDimensions);
-      auto distribution = std::normal_distribution<double>(0.0, kSigma);
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        double sum = 0.0;
-        for (IndexType j = 0; j < kInputDimensions; ++j) {
-          const auto weight = static_cast<LearnFloatType>(distribution(rng));
-          weights_[kInputDimensions * i + j] = weight;
-          sum += weight;
+            return std::shared_ptr<Trainer>(
+                new Trainer(target_layer, ft));
         }
-        biases_[i] = static_cast<LearnFloatType>(0.5 - 0.5 * sum);
-      }
-    }
-    QuantizeParameters();
-  }
 
-  // forward propagation
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    if (output_.size() < kOutputDimensions * batch.size()) {
-      output_.resize(kOutputDimensions * batch.size());
-      gradients_.resize(kInputDimensions * batch.size());
-    }
-    batch_size_ = static_cast<IndexType>(batch.size());
-    batch_input_ = previous_layer_trainer_->Propagate(batch);
+        // Set options such as hyperparameters: forwards to the previous layer's trainer first, then handles this layer's options
+        void send_message(Message* message) {
+            previous_layer_trainer_->send_message(message);
+
+            if (receive_message("momentum", message)) {  // value parsed with std::stod
+                momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
+            }
+
+            if (receive_message("learning_rate_scale", message)) {  // value parsed with std::stod
+                learning_rate_scale_ =
+                    static_cast<LearnFloatType>(std::stod(message->value));
+            }
+
+            if (receive_message("reset", message)) {  // value is not read for this option
+                dequantize_parameters();
+            }
+
+            if (receive_message("quantize_parameters", message)) {  // value is not read for this option
+                quantize_parameters();
+            }
+
+            if (receive_message("check_health", message)) {  // value is not read for this option
+                check_health();
+            }
+        }
+
+        // Initialize the parameters with random numbers drawn from rng (previous layers first), then quantize them
+        template <typename RNG>
+        void initialize(RNG& rng) {
+            previous_layer_trainer_->initialize(rng);
+
+            if (kIsOutputLayer) {
+                // Initialize output layer with 0
+                std::fill(std::begin(biases_), std::end(biases_),
+                          static_cast<LearnFloatType>(0.0));
+                std::fill(std::begin(weights_), std::end(weights_),
+                          static_cast<LearnFloatType>(0.0));
+            }
+            else {
+                // Assuming every input unit has mean 0.5 and the same variance,
+                // initialize so that each output unit also has mean 0.5 and the same variance as the input
+                const double kSigma = 1.0 / std::sqrt(kInputDimensions);
+                auto distribution = std::normal_distribution<double>(0.0, kSigma);
+
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    double sum = 0.0;  // sum of the weights drawn for output unit i
+                      for (IndexType j = 0; j < kInputDimensions; ++j) {
+                          const auto weight = static_cast<LearnFloatType>(distribution(rng));
+                          weights_[kInputDimensions * i + j] = weight;
+                          sum += weight;
+                      }
+
+                    biases_[i] = static_cast<LearnFloatType>(0.5 - 0.5 * sum);  // with inputs at 0.5: 0.5 * sum + bias == 0.5
+                }
+            }
+
+            quantize_parameters();
+        }
+
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
+        {
+            const auto size = batch_end - batch_begin;
+
+            if ((long)output_.size() < (long)kOutputDimensions * size) {
+                output_.resize(kOutputDimensions * size);
+                gradients_.resize(kInputDimensions * size);
+            }
+
+            if (thread_states_.size() < thread_pool.size())
+            {
+                thread_states_.resize(thread_pool.size());
+            }
+
+            combined_batch_size_ = size;
+            combined_batch_input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);
+
+            auto& main_thread_state = thread_states_[0];
+
 #if defined(USE_BLAS)
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      cblas_scopy(kOutputDimensions, biases_, 1, &output_[batch_offset], 1);
-    }
-    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-                kOutputDimensions, batch_size_, kInputDimensions, 1.0,
-                weights_, kInputDimensions,
-                batch_input_, kInputDimensions,
-                1.0, &output_[0], kOutputDimensions);
-#else
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType input_batch_offset = kInputDimensions * b;
-      const IndexType output_batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        double sum = biases_[i];
-        for (IndexType j = 0; j < kInputDimensions; ++j) {
-          const IndexType index = kInputDimensions * i + j;
-          sum += weights_[index] * batch_input_[input_batch_offset + j];
-        }
-        output_[output_batch_offset + i] = static_cast<LearnFloatType>(sum);
-      }
-    }
-#endif
-    return output_.data();
-  }
 
-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    const LearnFloatType local_learning_rate =
-        learning_rate * learning_rate_scale_;
+            // update
+            cblas_sscal(
+                kOutputDimensions, momentum_, main_thread_state.biases_diff_, 1
+            );
+
+#else
+
+            Blas::sscal(
+                kOutputDimensions, momentum_, main_thread_state.biases_diff_, 1
+            );
+
+#endif
+
+            for (IndexType i = 1; i < thread_states_.size(); ++i)
+                thread_states_[i].reset_biases();
+
+            return output_.data();
+        }
+
+        // forward propagation
+        void propagate(Thread& th, const uint64_t offset, const uint64_t count) {
+
+            previous_layer_trainer_->propagate(th, offset, count);
+
 #if defined(USE_BLAS)
-    // backpropagate
-    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
-                kInputDimensions, batch_size_, kOutputDimensions, 1.0,
+
+            for (IndexType b = offset; b < offset + count; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                cblas_scopy(
+                    kOutputDimensions, biases_, 1, &output_[batch_offset], 1
+                );
+            }
+
+            cblas_sgemm(
+                CblasColMajor, CblasTrans, CblasNoTrans,
+                kOutputDimensions, count, kInputDimensions,
+                1.0,
                 weights_, kInputDimensions,
-                gradients, kOutputDimensions,
-                0.0, &gradients_[0], kInputDimensions);
-    // update
-    cblas_sscal(kOutputDimensions, momentum_, biases_diff_, 1);
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      cblas_saxpy(kOutputDimensions, 1.0,
-                  &gradients[batch_offset], 1, biases_diff_, 1);
-    }
-    cblas_saxpy(kOutputDimensions, -local_learning_rate,
-                biases_diff_, 1, biases_, 1);
-    cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
-                kOutputDimensions, kInputDimensions, batch_size_, 1.0,
-                gradients, kOutputDimensions,
-                batch_input_, kInputDimensions,
-                momentum_, weights_diff_, kInputDimensions);
-    cblas_saxpy(kOutputDimensions * kInputDimensions, -local_learning_rate,
-                weights_diff_, 1, weights_, 1);
+                combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
+                1.0,
+                &output_[offset * kOutputDimensions], kOutputDimensions
+            );
 #else
-    // backpropagate
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType input_batch_offset = kInputDimensions * b;
-      const IndexType output_batch_offset = kOutputDimensions * b;
-      for (IndexType j = 0; j < kInputDimensions; ++j) {
-        double sum = 0.0;
-        for (IndexType i = 0; i < kOutputDimensions; ++i) {
-          const IndexType index = kInputDimensions * i + j;
-          sum += weights_[index] * gradients[output_batch_offset + i];
-        }
-        gradients_[input_batch_offset + j] = static_cast<LearnFloatType>(sum);
-      }
-    }
-    // update
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      biases_diff_[i] *= momentum_;
-    }
-    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-      weights_diff_[i] *= momentum_;
-    }
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType input_batch_offset = kInputDimensions * b;
-      const IndexType output_batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        biases_diff_[i] += gradients[output_batch_offset + i];
-      }
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        for (IndexType j = 0; j < kInputDimensions; ++j) {
-          const IndexType index = kInputDimensions * i + j;
-          weights_diff_[index] += gradients[output_batch_offset + i] *
-              batch_input_[input_batch_offset + j];
-        }
-      }
-    }
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      biases_[i] -= local_learning_rate * biases_diff_[i];
-    }
-    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-      weights_[i] -= local_learning_rate * weights_diff_[i];
-    }
-#endif
-    previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
-  }
 
- private:
-  // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
-      batch_size_(0),
-      batch_input_(nullptr),
-      previous_layer_trainer_(Trainer<PreviousLayer>::Create(
-          &target_layer->previous_layer_, feature_transformer)),
-      target_layer_(target_layer),
-      biases_(),
-      weights_(),
-      biases_diff_(),
-      weights_diff_(),
-      momentum_(0.0),
-      learning_rate_scale_(1.0) {
-    DequantizeParameters();
-  }
+            for (IndexType b = offset; b < offset + count; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                Blas::scopy(
+                    kOutputDimensions, biases_, 1, &output_[batch_offset], 1
+                );
+            }
 
-  // Weight saturation and parameterization
-  void QuantizeParameters() {
-    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-      weights_[i] = std::max(-kMaxWeightMagnitude,
-                             std::min(+kMaxWeightMagnitude, weights_[i]));
-    }
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      target_layer_->biases_[i] =
-          Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
-    }
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      const auto offset = kInputDimensions * i;
-      const auto padded_offset = LayerType::kPaddedInputDimensions * i;
-      for (IndexType j = 0; j < kInputDimensions; ++j) {
-        target_layer_->weights_[padded_offset + j] =
-            Round<typename LayerType::WeightType>(
-                weights_[offset + j] * kWeightScale);
-      }
-    }
-  }
-
-  // read parameterized integer
-  void DequantizeParameters() {
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      biases_[i] = static_cast<LearnFloatType>(
-          target_layer_->biases_[i] / kBiasScale);
-    }
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      const auto offset = kInputDimensions * i;
-      const auto padded_offset = LayerType::kPaddedInputDimensions * i;
-      for (IndexType j = 0; j < kInputDimensions; ++j) {
-        weights_[offset + j] = static_cast<LearnFloatType>(
-            target_layer_->weights_[padded_offset + j] / kWeightScale);
-      }
-    }
-    std::fill(std::begin(biases_diff_), std::end(biases_diff_),
-              static_cast<LearnFloatType>(0.0));
-    std::fill(std::begin(weights_diff_), std::end(weights_diff_),
-              static_cast<LearnFloatType>(0.0));
-  }
-
-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions = LayerType::kInputDimensions;
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
-
-  // If the output dimensionality is 1, the output layer
-  static constexpr bool kIsOutputLayer = kOutputDimensions == 1;
-
-  // Coefficient used for parameterization
-  static constexpr LearnFloatType kActivationScale =
-      std::numeric_limits<std::int8_t>::max();
-  static constexpr LearnFloatType kBiasScale = kIsOutputLayer ?
-      (kPonanzaConstant * FV_SCALE) :
-      ((1 << kWeightScaleBits) * kActivationScale);
-  static constexpr LearnFloatType kWeightScale = kBiasScale / kActivationScale;
-
-  // Upper limit of absolute value of weight used to prevent overflow when parameterizing integers
-  static constexpr LearnFloatType kMaxWeightMagnitude =
-      std::numeric_limits<typename LayerType::WeightType>::max() / kWeightScale;
-
-  // number of samples in mini-batch
-  IndexType batch_size_;
-
-  // Input mini batch
-  const LearnFloatType* batch_input_;
-
-  // Trainer of the previous layer
-  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
-
-  // layer to learn
-  LayerType* const target_layer_;
-
-  // parameter
-  LearnFloatType biases_[kOutputDimensions];
-  LearnFloatType weights_[kOutputDimensions * kInputDimensions];
-
-  // Buffer used for updating parameters
-  LearnFloatType biases_diff_[kOutputDimensions];
-  LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
-
-  // Forward propagation buffer
-  std::vector<LearnFloatType> output_;
-
-  // buffer for back propagation
-  std::vector<LearnFloatType> gradients_;
-
-  // hyper parameter
-  LearnFloatType momentum_;
-  LearnFloatType learning_rate_scale_;
-};
-
-}  // namespace NNUE
-
-}  // namespace Eval
-
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+            Blas::sgemm(
+                Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
+                kOutputDimensions, count, kInputDimensions,
+                1.0,
+                weights_, kInputDimensions,
+                combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
+                1.0,
+                &output_[offset * kOutputDimensions], kOutputDimensions
+            );
+
+#endif
+        }
+
+        // Backward pass for samples [offset, offset + count): writes gradients_ for the previous layer and accumulates this thread's bias/weight diffs.
+        void backpropagate(Thread& th,
+                           const LearnFloatType* gradients,
+                           uint64_t offset,
+                           uint64_t count) {
+
+            auto& thread_state = thread_states_[th.thread_idx()];  // accumulators selected by the calling thread's index
+            const auto momentum = th.thread_idx() == 0 ? momentum_ : 0.0f;  // beta for the weights_diff_ GEMM below: momentum_ on thread 0, 0 elsewhere
+#if defined(USE_BLAS)
+
+            cblas_sgemm(  // gradients_ = weights_ * gradients, i.e. the gradient with respect to this layer's input
+                CblasColMajor, CblasNoTrans, CblasNoTrans,
+                kInputDimensions, count, kOutputDimensions,  // M, N, K
+                1.0,
+                weights_, kInputDimensions,  // A: kInputDimensions x kOutputDimensions in column-major order
+                gradients + offset * kOutputDimensions, kOutputDimensions,  // B: one column of kOutputDimensions values per sample
+                0.0,  // beta = 0: the destination slice is overwritten
+                &gradients_[offset * kInputDimensions], kInputDimensions  // C: one column of kInputDimensions values per sample
+            );
+
+            for (IndexType b = offset; b < offset + count; ++b) {  // biases_diff_ += gradient vector of every sample in the slice
+                const IndexType batch_offset = kOutputDimensions * b;
+                cblas_saxpy(
+                    kOutputDimensions, 1.0,
+                    &gradients[batch_offset], 1, thread_state.biases_diff_, 1
+                );
+            }
+
+            cblas_sgemm(  // weights_diff_ = gradients^T * input + momentum * weights_diff_
+                CblasRowMajor, CblasTrans, CblasNoTrans,
+                kOutputDimensions, kInputDimensions, count,  // M, N, K
+                1.0,
+                gradients + offset * kOutputDimensions, kOutputDimensions,  // A: stored as count x kOutputDimensions, used transposed
+                combined_batch_input_ + offset * kInputDimensions, kInputDimensions,  // B: count x kInputDimensions
+                momentum,  // beta: scales the previous weights_diff_ contents before the new product is added
+                thread_state.weights_diff_, kInputDimensions  // NOTE(review): beta is applied on every call - confirm each thread handles at most one chunk per step
+            );
+
+#else
+
+            // Same three operations as the USE_BLAS branch, issued through the Blas:: wrappers with identical arguments.
+            Blas::sgemm(
+                Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::NoTrans, Blas::MatrixTranspose::NoTrans,
+                kInputDimensions, count, kOutputDimensions,
+                1.0,
+                weights_, kInputDimensions,
+                gradients + offset * kOutputDimensions, kOutputDimensions,
+                0.0,
+                &gradients_[offset * kInputDimensions], kInputDimensions
+            );
+
+            for (IndexType b = offset; b < offset + count; ++b) {  // biases_diff_ += gradient vector of every sample in the slice
+                const IndexType batch_offset = kOutputDimensions * b;
+                Blas::saxpy(kOutputDimensions, 1.0,
+                          &gradients[batch_offset], 1, thread_state.biases_diff_, 1);
+            }
+
+            Blas::sgemm(
+                Blas::MatrixLayout::RowMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
+                kOutputDimensions, kInputDimensions, count,
+                1.0,
+                gradients + offset * kOutputDimensions, kOutputDimensions,
+                combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
+                momentum,  // beta, as in the USE_BLAS branch
+                thread_state.weights_diff_, kInputDimensions
+            );
+
+#endif
+
+            previous_layer_trainer_->backpropagate(th, gradients_.data(), offset, count);  // hand the input-side gradients to the previous layer
+        }
+
+        void reduce_thread_state()  // Adds the accumulators of every other thread state into thread_states_[0].
+        {
+            for (IndexType i = 1; i < thread_states_.size(); ++i)  // index 0 is the destination, so start at 1
+            {
+                thread_states_[0] += thread_states_[i];  // element-wise sum via ThreadState::operator+=
+            }
+        }
+
+        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate)  // Applies the accumulated diffs to biases_/weights_, then forwards to the previous layer.
+        {
+            const LearnFloatType local_learning_rate =
+                learning_rate * learning_rate_scale_;  // rate used by this layer = passed-in rate * learning_rate_scale_
+
+            reduce_thread_state();  // sum all per-thread diffs into thread_states_[0]
+
+            auto& main_thread_state = thread_states_[0];
+
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                const double d = local_learning_rate * main_thread_state.biases_diff_[i];
+                biases_[i] -= d;  // descent step on the bias
+                abs_biases_diff_sum_ += std::abs(d);  // statistic later reported by check_health()
+            }
+            num_biases_diffs_ += kOutputDimensions;
+
+            for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
+                const double d = local_learning_rate * main_thread_state.weights_diff_[i];
+                weights_[i] -= d;  // descent step on the weight
+                abs_weights_diff_sum_ += std::abs(d);  // statistic later reported by check_health()
+            }
+            num_weights_diffs_ += kOutputDimensions * kInputDimensions;
+
+            previous_layer_trainer_->step_end(thread_pool, learning_rate);  // passes on the unscaled learning_rate
+        }
+
+    private:
+        // Constructor: creates the previous layer's trainer and loads float parameters from target_layer.
+        Trainer(LayerType* target_layer, FeatureTransformer* ft) :
+            combined_batch_size_(0),
+            combined_batch_input_(nullptr),
+            previous_layer_trainer_(Trainer<PreviousLayer>::create(
+                &target_layer->previous_layer_, ft)),
+            target_layer_(target_layer),
+            biases_(),  // value-initialized here; filled by dequantize_parameters() below
+            weights_(),  // value-initialized here; filled by dequantize_parameters() below
+            momentum_(0.2),
+            learning_rate_scale_(1.0) {
+
+            dequantize_parameters();  // also resets the thread states and the statistics counters
+        }
+
+        void reset_stats() {  // Clears the update-size statistics that step_end() accumulates and check_health() prints.
+            abs_biases_diff_sum_ = 0.0;
+            abs_weights_diff_sum_ = 0.0;
+            num_biases_diffs_ = 0;
+            num_weights_diffs_ = 0;
+        }
+
+        void check_health() {  // Prints average |bias|, |weight| and average applied update sizes, then clears the statistics.
+
+            double abs_bias_sum = 0.0;
+            double abs_weight_sum = 0.0;
+
+            for(auto b : biases_)
+                abs_bias_sum += std::abs(b);
+
+            for(auto w : weights_)
+                abs_weight_sum += std::abs(w);
+
+            auto out = sync_region_cout.new_region();  // presumably a synchronized output region so the report is not interleaved - confirm
+
+            out << "INFO (check_health):"
+                << " layer " << LayerType::kLayerIndex
+                << " - " << LayerType::get_name()
+                << std::endl;
+
+            out << "  - avg_abs_bias        = " << abs_bias_sum / std::size(biases_) << std::endl;
+            out << "  - avg_abs_bias_diff   = " << abs_biases_diff_sum_ / num_biases_diffs_ << std::endl;  // NOTE(review): 0.0 / 0 if no step_end() ran since the last reset
+            out << "  - avg_abs_weight      = " << abs_weight_sum / std::size(weights_) << std::endl;
+            out << "  - avg_abs_weight_diff = " << abs_weights_diff_sum_ / num_weights_diffs_ << std::endl;  // NOTE(review): same zero-count case as above
+
+            out.unlock();
+
+            reset_stats();  // the next report covers only updates applied after this point
+        }
+
+        // Clamps the float weights to the representable range, then writes scaled, rounded biases and weights into target_layer_.
+        void quantize_parameters() {
+            for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {  // saturate in place so the scaled value fits WeightType
+                weights_[i] = std::max(-kMaxWeightMagnitude,
+                                       std::min(+kMaxWeightMagnitude, weights_[i]));
+            }
+
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                target_layer_->biases_[i] =
+                    round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
+            }
+
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                const auto offset = kInputDimensions * i;  // row start in the trainer's unpadded weights_
+                const auto padded_offset = LayerType::kPaddedInputDimensions * i;  // row start in the layer's padded weights_
+                for (IndexType j = 0; j < kInputDimensions; ++j) {
+                    target_layer_->weights_[padded_offset + j] =
+                        round<typename LayerType::WeightType>(
+                            weights_[offset + j] * kWeightScale);
+                }
+            }
+        }
+
+        // Loads float biases_/weights_ from target_layer_'s integer parameters and clears all accumulated training state.
+        void dequantize_parameters() {
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                biases_[i] = static_cast<LearnFloatType>(
+                    target_layer_->biases_[i] / kBiasScale);
+            }
+
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                const auto offset = kInputDimensions * i;  // row start in the trainer's unpadded weights_
+                const auto padded_offset = LayerType::kPaddedInputDimensions * i;  // row start in the layer's padded weights_
+                for (IndexType j = 0; j < kInputDimensions; ++j) {
+                    weights_[offset + j] = static_cast<LearnFloatType>(
+                        target_layer_->weights_[padded_offset + j] / kWeightScale);
+                }
+            }
+
+            for (auto& state : thread_states_)  // zero every per-thread diff buffer
+            {
+                state.reset_weights();
+                state.reset_biases();
+            }
+
+
+            reset_stats();
+        }
+
+        // Number of input/output dimensions, taken from the layer being trained
+        static constexpr IndexType kInputDimensions = LayerType::kInputDimensions;
+        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+
+        // A layer with exactly one output is treated as the output layer
+        static constexpr bool kIsOutputLayer = kOutputDimensions == 1;
+
+        // Scale factors used when converting between float and integer parameters
+        static constexpr LearnFloatType kActivationScale =
+            std::numeric_limits<std::int8_t>::max();
+
+        static constexpr LearnFloatType kBiasScale = kIsOutputLayer ?
+            (kPonanzaConstant * FV_SCALE) :
+            ((1 << kWeightScaleBits) * kActivationScale);
+
+        static constexpr LearnFloatType kWeightScale = kBiasScale / kActivationScale;
+
+        // Largest float weight magnitude whose scaled value still fits in LayerType::WeightType
+        static constexpr LearnFloatType kMaxWeightMagnitude =
+            std::numeric_limits<typename LayerType::WeightType>::max() / kWeightScale;
+
+        // Number of samples in the mini-batch
+        IndexType combined_batch_size_;
+
+        double abs_biases_diff_sum_;  // sum of |applied bias update| since the last reset_stats()
+        double abs_weights_diff_sum_;  // sum of |applied weight update| since the last reset_stats()
+        uint64_t num_biases_diffs_;  // number of bias updates counted in abs_biases_diff_sum_
+        uint64_t num_weights_diffs_;  // number of weight updates counted in abs_weights_diff_sum_
+
+        // Input of the mini-batch; read by backpropagate() at offset * kInputDimensions
+        const LearnFloatType* combined_batch_input_;
+
+        // Trainer of the previous layer
+        const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+
+        // Layer whose parameters are being trained
+        LayerType* const target_layer_;
+
+        // Per-thread accumulation buffers for parameter diffs; instances live in thread_states_, indexed by Thread::thread_idx().
+        struct alignas(kCacheLineSize) ThreadState
+        {
+            // Buffers that collect bias and weight diffs before step_end() applies them
+            alignas(kCacheLineSize) LearnFloatType biases_diff_[kOutputDimensions];
+            alignas(kCacheLineSize) LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
+
+            ThreadState() { reset_weights(); reset_biases(); }  // a new state starts with both buffers zeroed
+
+            ThreadState& operator+=(const ThreadState& other)  // element-wise addition of both buffers
+            {
+                for (IndexType i = 0; i < kOutputDimensions; ++i)
+                {
+                    biases_diff_[i] += other.biases_diff_[i];
+                }
+
+                for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i)
+                {
+                    weights_diff_[i] += other.weights_diff_[i];
+                }
+
+                return *this;
+            }
+
+            void reset_weights()  // zero-fills weights_diff_
+            {
+                std::fill(std::begin(weights_diff_), std::end(weights_diff_), 0.0f);
+            }
+
+            void reset_biases()  // zero-fills biases_diff_
+            {
+                std::fill(std::begin(biases_diff_), std::end(biases_diff_), 0.0f);
+            }
+        };
+
+        alignas(kCacheLineSize) LearnFloatType biases_[kOutputDimensions];  // float biases being trained
+        alignas(kCacheLineSize) LearnFloatType weights_[kOutputDimensions * kInputDimensions];  // float weights; index = kInputDimensions * output + input
+
+        std::vector<ThreadState, CacheLineAlignedAllocator<ThreadState>> thread_states_;  // indexed by Thread::thread_idx(); element 0 receives the reduction
+
+        // Forward propagation buffer
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
+
+        // Back propagation buffer: gradients passed on to the previous layer
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
+
+        // Hyperparameters
+        LearnFloatType momentum_;
+        LearnFloatType learning_rate_scale_;
+    };
+
+}  // namespace Eval::NNUE
 
 #endif
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index fd7b1a07..ff883afc 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -1,142 +1,356 @@
-﻿// Specialization of NNUE evaluation function learning class template for ClippedReLU
-
-#ifndef _NNUE_TRAINER_CLIPPED_RELU_H_
+﻿#ifndef _NNUE_TRAINER_CLIPPED_RELU_H_
 #define _NNUE_TRAINER_CLIPPED_RELU_H_
 
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
-
-#include "../../learn/learn.h"
-#include "../layers/clipped_relu.h"
 #include "trainer.h"
 
-namespace Eval {
+#include "learn/learn.h"
 
-namespace NNUE {
+#include "nnue/layers/clipped_relu.h"
 
-// Learning: Affine transformation layer
-template <typename PreviousLayer>
-class Trainer<Layers::ClippedReLU<PreviousLayer>> {
- private:
-  // Type of layer to learn
-  using LayerType = Layers::ClippedReLU<PreviousLayer>;
+#include "thread.h"
 
- public:
-  // factory function
-  static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* feature_transformer) {
-    return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, feature_transformer));
-  }
+// Specialization of NNUE evaluation function learning class template for ClippedReLU
+namespace Eval::NNUE {
 
-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    previous_layer_trainer_->SendMessage(message);
-    if (ReceiveMessage("check_health", message)) {
-      CheckHealth();
-    }
-  }
+    // Learning: ClippedReLU activation layer
+    template <typename PreviousLayer>
+    class Trainer<Layers::ClippedReLU<PreviousLayer>> {
+    private:
+        // Type of layer to learn
+        using LayerType = Layers::ClippedReLU<PreviousLayer>;
 
-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    previous_layer_trainer_->Initialize(rng);
-  }
+    public:
+        // Factory function: the constructor is private, so trainers are created through here.
+        static std::shared_ptr<Trainer> create(
+            LayerType* target_layer, FeatureTransformer* ft) {
 
-  // forward propagation
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    if (output_.size() < kOutputDimensions * batch.size()) {
-      output_.resize(kOutputDimensions * batch.size());
-      gradients_.resize(kInputDimensions * batch.size());
-    }
-    const auto input = previous_layer_trainer_->Propagate(batch);
-    batch_size_ = static_cast<IndexType>(batch.size());
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        const IndexType index = batch_offset + i;
-        output_[index] = std::max(+kZero, std::min(+kOne, input[index]));
-        min_activations_[i] = std::min(min_activations_[i], output_[index]);
-        max_activations_[i] = std::max(max_activations_[i], output_[index]);
-      }
-    }
-    return output_.data();
-  }
+            return std::shared_ptr<Trainer>(
+                new Trainer(target_layer, ft));  // plain new inside shared_ptr: the private constructor is reachable here
+        }
 
-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        const IndexType index = batch_offset + i;
-        gradients_[index] = gradients[index] *
-            (output_[index] > kZero) * (output_[index] < kOne);
-      }
-    }
-    previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
-  }
+        // Forwards the message to the previous layer, then handles "check_health" for this layer.
+        void send_message(Message* message) {
+            previous_layer_trainer_->send_message(message);  // earlier layers see the message first
+            if (receive_message("check_health", message)) {
+                check_health();
+            }
+        }
 
- private:
-  // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
-      batch_size_(0),
-      previous_layer_trainer_(Trainer<PreviousLayer>::Create(
-          &target_layer->previous_layer_, feature_transformer)),
-      target_layer_(target_layer) {
-    std::fill(std::begin(min_activations_), std::end(min_activations_),
-              std::numeric_limits<LearnFloatType>::max());
-    std::fill(std::begin(max_activations_), std::end(max_activations_),
-              std::numeric_limits<LearnFloatType>::lowest());
-  }
+        // Parameter initialization: this trainer only forwards the request to the previous layer's trainer.
+        template <typename RNG>
+        void initialize(RNG& rng) {
+            previous_layer_trainer_->initialize(rng);
+        }
 
-  // Check if there are any problems with learning
-  void CheckHealth() {
-    const auto largest_min_activation = *std::max_element(
-        std::begin(min_activations_), std::end(min_activations_));
-    const auto smallest_max_activation = *std::min_element(
-        std::begin(max_activations_), std::end(max_activations_));
-    std::cout << "INFO: largest min activation = " << largest_min_activation
-              << ", smallest max activation = " << smallest_max_activation
-              << std::endl;
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
+        {  // Prepares buffers for the batch [batch_begin, batch_end) and returns the buffer that propagate() writes this layer's output to.
+            const auto size = batch_end - batch_begin;  // number of examples in the batch
 
-    std::fill(std::begin(min_activations_), std::end(min_activations_),
-              std::numeric_limits<LearnFloatType>::max());
-    std::fill(std::begin(max_activations_), std::end(max_activations_),
-              std::numeric_limits<LearnFloatType>::lowest());
-  }
+            if ((long)output_.size() < (long)kOutputDimensions * size) {  // buffers only grow, never shrink
+              output_.resize(kOutputDimensions * size);
+              gradients_.resize(kInputDimensions * size);
+            }
 
-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions = LayerType::kOutputDimensions;
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+            if (thread_states_.size() < thread_pool.size())  // ensure one state per thread in the pool
+            {
+                thread_states_.resize(thread_pool.size());
+            }
 
-  // LearnFloatType constant
-  static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
-  static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
+            input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);  // previous layer's output buffer is this layer's input
 
-  // number of samples in mini-batch
-  IndexType batch_size_;
+            batch_size_ = size;
 
-  // Trainer of the previous layer
-  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+            return output_.data();  // taken after the resize above, so it points at the current storage
+        }
 
-  // layer to learn
-  LayerType* const target_layer_;
+        // Forward pass for samples [offset, offset + count): output_ = clamp(input_, kZero, kOne), plus per-thread min/max activation tracking.
+        void propagate(Thread& th, const uint64_t offset, const uint64_t count) {
 
-  // Forward propagation buffer
-  std::vector<LearnFloatType> output_;
+            auto& thread_state = thread_states_[th.thread_idx()];  // statistics selected by the calling thread's index
 
-  // buffer for back propagation
-  std::vector<LearnFloatType> gradients_;
+            previous_layer_trainer_->propagate(th, offset, count);  // runs first; input_ for this slice is read right after
 
-  // Health check statistics
-  LearnFloatType min_activations_[kOutputDimensions];
-  LearnFloatType max_activations_[kOutputDimensions];
-};
+#if defined (USE_SSE2)
 
-}  // namespace NNUE
+            {
+                static_assert(kOutputDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
 
-}  // namespace Eval
+                const __m128 kZero4 = _mm_set1_ps(+kZero);  // kZero broadcast to all four lanes
+                const __m128 kOne4 = _mm_set1_ps(+kOne);  // kOne broadcast to all four lanes
 
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+                for (IndexType b = offset; b < offset + count; ++b)
+                {
+                    const IndexType batch_offset = kOutputDimensions * b;  // start of sample b in input_/output_
+
+                    for (IndexType i = 0; i < kOutputDimensions; i += 16)  // four 4-float vectors per iteration
+                    {
+                        __m128 out0 = _mm_loadu_ps(&input_[i + 0 + batch_offset]);
+                        __m128 out1 = _mm_loadu_ps(&input_[i + 4 + batch_offset]);
+                        __m128 out2 = _mm_loadu_ps(&input_[i + 8 + batch_offset]);
+                        __m128 out3 = _mm_loadu_ps(&input_[i + 12 + batch_offset]);
+
+                        out0 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out0));  // clamp to [kZero, kOne]
+                        out1 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out1));
+                        out2 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out2));
+                        out3 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out3));
+
+                        _mm_storeu_ps(&output_[i + 0 + batch_offset], out0);
+                        _mm_storeu_ps(&output_[i + 4 + batch_offset], out1);
+                        _mm_storeu_ps(&output_[i + 8 + batch_offset], out2);
+                        _mm_storeu_ps(&output_[i + 12 + batch_offset], out3);
+
+                        __m128 minact0 = _mm_loadu_ps(&thread_state.min_activations_[i + 0]);  // running per-output minimum
+                        __m128 minact1 = _mm_loadu_ps(&thread_state.min_activations_[i + 4]);
+                        __m128 minact2 = _mm_loadu_ps(&thread_state.min_activations_[i + 8]);
+                        __m128 minact3 = _mm_loadu_ps(&thread_state.min_activations_[i + 12]);
+
+                        __m128 maxact0 = _mm_loadu_ps(&thread_state.max_activations_[i + 0]);  // running per-output maximum
+                        __m128 maxact1 = _mm_loadu_ps(&thread_state.max_activations_[i + 4]);
+                        __m128 maxact2 = _mm_loadu_ps(&thread_state.max_activations_[i + 8]);
+                        __m128 maxact3 = _mm_loadu_ps(&thread_state.max_activations_[i + 12]);
+
+                        minact0 = _mm_min_ps(out0, minact0);
+                        minact1 = _mm_min_ps(out1, minact1);
+                        minact2 = _mm_min_ps(out2, minact2);
+                        minact3 = _mm_min_ps(out3, minact3);
+
+                        maxact0 = _mm_max_ps(out0, maxact0);
+                        maxact1 = _mm_max_ps(out1, maxact1);
+                        maxact2 = _mm_max_ps(out2, maxact2);
+                        maxact3 = _mm_max_ps(out3, maxact3);
+
+                        _mm_storeu_ps(&thread_state.min_activations_[i + 0], minact0);
+                        _mm_storeu_ps(&thread_state.min_activations_[i + 4], minact1);
+                        _mm_storeu_ps(&thread_state.min_activations_[i + 8], minact2);
+                        _mm_storeu_ps(&thread_state.min_activations_[i + 12], minact3);
+
+                        _mm_storeu_ps(&thread_state.max_activations_[i + 0], maxact0);
+                        _mm_storeu_ps(&thread_state.max_activations_[i + 4], maxact1);
+                        _mm_storeu_ps(&thread_state.max_activations_[i + 8], maxact2);
+                        _mm_storeu_ps(&thread_state.max_activations_[i + 12], maxact3);
+                    }
+                }
+            }
+
+#else
+
+            for (IndexType b = offset; b < offset + count; ++b) {  // scalar version of the same clamp and min/max tracking
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    const IndexType index = batch_offset + i;
+                    output_[index] = std::max(+kZero, std::min(+kOne, input_[index]));
+                    thread_state.min_activations_[i] = std::min(thread_state.min_activations_[i], output_[index]);
+                    thread_state.max_activations_[i] = std::max(thread_state.max_activations_[i], output_[index]);
+                }
+            }
+
+#endif
+        }
+
+        // Backward pass for samples [offset, offset + count): passes a gradient through only where the output was strictly between kZero and kOne.
+        void backpropagate(Thread& th,
+                           const LearnFloatType* gradients,
+                           const uint64_t offset,
+                           const uint64_t count) {
+
+            auto& thread_state = thread_states_[th.thread_idx()];  // clipping counters selected by the calling thread's index
+
+#if defined (USE_SSE2)
+
+            {
+                static_assert(kOutputDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
+
+                const __m128 kZero4 = _mm_set1_ps(+kZero);
+                const __m128 kOne4 = _mm_set1_ps(+kOne);
+
+                for (IndexType b = offset; b < offset + count; ++b)
+                {
+                    const IndexType batch_offset = kOutputDimensions * b;  // start of sample b in output_/gradients/gradients_
+
+                    for (IndexType i = 0; i < kOutputDimensions; i += 16)  // four 4-float vectors per iteration
+                    {
+                        __m128 out0 = _mm_loadu_ps(&output_[batch_offset + i + 0]);
+                        __m128 out1 = _mm_loadu_ps(&output_[batch_offset + i + 4]);
+                        __m128 out2 = _mm_loadu_ps(&output_[batch_offset + i + 8]);
+                        __m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]);
+
+                        __m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4));  // lane mask: output <= kZero or >= kOne
+                        __m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4));
+                        __m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4));
+                        __m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), _mm_cmpge_ps(out3, kOne4));
+
+                        __m128 grad0 = _mm_loadu_ps(&gradients[batch_offset + i + 0]);
+                        __m128 grad1 = _mm_loadu_ps(&gradients[batch_offset + i + 4]);
+                        __m128 grad2 = _mm_loadu_ps(&gradients[batch_offset + i + 8]);
+                        __m128 grad3 = _mm_loadu_ps(&gradients[batch_offset + i + 12]);
+
+                        grad0 = _mm_andnot_ps(clipped0, grad0);  // (~mask) & grad: zero the gradient in clipped lanes
+                        grad1 = _mm_andnot_ps(clipped1, grad1);
+                        grad2 = _mm_andnot_ps(clipped2, grad2);
+                        grad3 = _mm_andnot_ps(clipped3, grad3);
+
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 0], grad0);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 4], grad1);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 8], grad2);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 12], grad3);
+
+                        const int clipped_mask =  // 16-bit mask: one bit per lane, taken from each lane's sign bit
+                            (_mm_movemask_ps(clipped0) << 0)
+                            | (_mm_movemask_ps(clipped1) << 4)
+                            | (_mm_movemask_ps(clipped2) << 8)
+                            | (_mm_movemask_ps(clipped3) << 12);
+
+                        thread_state.num_clipped_ += popcount(clipped_mask);  // presumably the number of set bits, i.e. clipped lanes - confirm helper
+                    }
+                }
+            }
+
+#else
+
+            for (IndexType b = offset; b < offset + count; ++b) {  // scalar version of the same masking and counting
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    const IndexType index = batch_offset + i;
+                    const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne);
+                    gradients_[index] = gradients[index] * !clipped;  // multiplies by 0 when clipped, by 1 otherwise
+                    thread_state.num_clipped_ += clipped;
+                }
+            }
+
+#endif
+
+            thread_state.num_total_ += count * kOutputDimensions;  // denominator for the clipped percentage in check_health()
+
+            previous_layer_trainer_->backpropagate(th, gradients_.data(), offset, count);  // hand the masked gradients to the previous layer
+        }
+
+        void reduce_thread_state()  // Folds every other thread state into thread_states_[0] using the state's operator+=.
+        {
+            for (IndexType i = 1; i < thread_states_.size(); ++i)  // index 0 is the destination, so start at 1
+            {
+                thread_states_[0] += thread_states_[i];
+            }
+        }
+
+        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate)  // Only forwards the call: no parameter update is performed here.
+        {
+            previous_layer_trainer_->step_end(thread_pool, learning_rate);
+        }
+
+    private:
+        // Constructor: creates the previous layer's trainer and resets the statistics of any existing thread states.
+        Trainer(LayerType* target_layer, FeatureTransformer* ft) :
+            batch_size_(0),
+            previous_layer_trainer_(Trainer<PreviousLayer>::create(
+                &target_layer->previous_layer_, ft)),
+            target_layer_(target_layer) {
+
+            reset_stats();  // presumably a no-op here: thread_states_ is only resized in step_start() - confirm
+        }
+
+        void reset_stats() {  // puts every per-thread statistics slot back into its initial state
+            for(auto& state : thread_states_)
+                state.reset();  // ThreadState::reset refills the min/max sentinels and zeroes both counters
+        }
+
+        // Check if there are any problems with learning: merge the per-thread statistics, print them, then reset them
+        void check_health() {
+            // Nothing has been collected while thread_states_ is empty, and thread_states_[0] would be out of bounds
+            if (thread_states_.empty()) return;
+
+            reduce_thread_state();
+            auto& main_thread_state = thread_states_[0];
+            const auto largest_min_activation = *std::max_element(
+                std::begin(main_thread_state.min_activations_), std::end(main_thread_state.min_activations_));
+            const auto smallest_max_activation = *std::min_element(
+                std::begin(main_thread_state.max_activations_), std::end(main_thread_state.max_activations_));
+
+            // Report 0% instead of dividing 0 by 0 (NaN) when no output has been counted since the last reset
+            const double clipped_percent = main_thread_state.num_total_ == 0 ? 0.0 :
+                static_cast<double>(main_thread_state.num_clipped_) / main_thread_state.num_total_ * 100.0;
+
+            auto out = sync_region_cout.new_region();
+            out << "INFO (check_health):"
+                << " layer " << LayerType::kLayerIndex
+                << " - " << LayerType::get_name()
+                << std::endl;
+            out << "  - largest min activation = " << largest_min_activation
+                << " , smallest max activation = " << smallest_max_activation
+                << std::endl;
+            out << "  - clipped " << clipped_percent << "% of outputs"
+                << std::endl;
+            out.unlock();
+
+            reset_stats();
+        }
+
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions = LayerType::kOutputDimensions;
+        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+
+        // LearnFloatType constant
+        static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
+        static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
+
+        // number of samples in mini-batch
+        IndexType batch_size_;
+
+        IndexType num_total_;
+
+        const LearnFloatType* input_;
+
+        // Trainer of the previous layer
+        const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+
+        // layer to learn
+        LayerType* const target_layer_;
+
+        // Forward propagation buffer
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
+
+        // buffer for back propagation
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
+
+        struct alignas(kCacheLineSize) ThreadState  // statistics slot of one thread, aligned to kCacheLineSize
+        {
+            // Health check statistics
+            LearnFloatType min_activations_[kOutputDimensions];  // running minimum per output dimension
+            LearnFloatType max_activations_[kOutputDimensions];  // running maximum per output dimension
+            IndexType num_clipped_;  // outputs counted as clipped since the last reset
+            IndexType num_total_;    // outputs counted in total since the last reset
+
+            ThreadState() { reset(); }
+
+            ThreadState& operator+=(const ThreadState& other)  // merge: element-wise min/max, summed counters
+            {
+                for (IndexType i = 0; i < kOutputDimensions; ++i)
+                {
+                    min_activations_[i] = std::min(min_activations_[i], other.min_activations_[i]);
+                }
+
+                for (IndexType i = 0; i < kOutputDimensions; ++i)
+                {
+                    max_activations_[i] = std::max(max_activations_[i], other.max_activations_[i]);
+                }
+
+                num_clipped_ += other.num_clipped_;
+                num_total_ += other.num_total_;
+
+                return *this;
+            }
+
+            void reset()  // sentinels are taken from LearnFloatType, the element type of the arrays; counters go to zero
+            {
+                std::fill(std::begin(min_activations_), std::end(min_activations_), std::numeric_limits<LearnFloatType>::max());
+                std::fill(std::begin(max_activations_), std::end(max_activations_), std::numeric_limits<LearnFloatType>::lowest());
+                num_clipped_ = 0;
+                num_total_ = 0;
+            }
+        };
+
+        std::vector<ThreadState, CacheLineAlignedAllocator<ThreadState>> thread_states_;
+    };
+
+}  // namespace Eval::NNUE
 
 #endif
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 97dbeff4..9afda728 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -1,14 +1,17 @@
-﻿// Specialization for feature transformer of learning class template of NNUE evaluation function
-
-#ifndef _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
+﻿#ifndef _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
 #define _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
 
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
-
-#include "../../learn/learn.h"
-#include "../nnue_feature_transformer.h"
 #include "trainer.h"
-#include "features/factorizer_feature_set.h"
+
+#include "extra/stockfish_blas.h"
+
+#include "features/all_factorizers.h"
+
+#include "learn/learn.h"
+
+#include "nnue/nnue_feature_transformer.h"
+
+#include "thread.h"
 
 #include <array>
 #include <bitset>
@@ -16,362 +19,756 @@
 #include <random>
 #include <set>
 
-#if defined(_OPENMP)
-#include <omp.h>
-#endif
+// Specialization of the NNUE evaluation function learning (trainer) class template for the feature transformer
+namespace Eval::NNUE {
 
-namespace Eval {
+    // Learning: Input feature converter
+    template <>
+    class Trainer<FeatureTransformer> {
+    private:
+        // Type of layer to learn
+        using LayerType = FeatureTransformer;
 
-namespace NNUE {
+    public:
+        template <typename T>
+        friend struct AlignedDeleter;
 
-// Learning: Input feature converter
-template <>
-class Trainer<FeatureTransformer> {
- private:
-  // Type of layer to learn
-  using LayerType = FeatureTransformer;
+        template <typename T, typename... ArgumentTypes>
+        friend std::shared_ptr<T> make_aligned_shared_ptr(ArgumentTypes&&... arguments);
 
- public:
-  template <typename T>
-  friend struct AlignedDeleter;
-  template <typename T, typename... ArgumentTypes>
-  friend std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments);
+        // factory function: returns the trainer for target_layer as a std::shared_ptr
+        static std::shared_ptr<Trainer> create(LayerType* target_layer) {
+            return make_aligned_shared_ptr<Trainer>(target_layer);  // make_aligned_shared_ptr is declared a friend above
+        }
 
-  // factory function
-  static std::shared_ptr<Trainer> Create(LayerType* target_layer) {
-    return MakeAlignedSharedPtr<Trainer>(target_layer);
-  }
+        // Set options such as hyperparameters; every receive_message check below is evaluated, there is no else chain
+        void send_message(Message* message) {
+            if (receive_message("momentum", message)) {
+                momentum_ = static_cast<LearnFloatType>(std::stod(message->value));  // value text parsed with std::stod
+            }
 
-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    if (ReceiveMessage("momentum", message)) {
-      momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
-    }
-    if (ReceiveMessage("learning_rate_scale", message)) {
-      learning_rate_scale_ =
-          static_cast<LearnFloatType>(std::stod(message->value));
-    }
-    if (ReceiveMessage("reset", message)) {
-      DequantizeParameters();
-    }
-    if (ReceiveMessage("quantize_parameters", message)) {
-      QuantizeParameters();
-    }
-    if (ReceiveMessage("clear_unobserved_feature_weights", message)) {
-      ClearUnobservedFeatureWeights();
-    }
-    if (ReceiveMessage("check_health", message)) {
-      CheckHealth();
-    }
-  }
+            if (receive_message("learning_rate_scale", message)) {
+                learning_rate_scale_ =
+                    static_cast<LearnFloatType>(std::stod(message->value));  // parsed the same way as momentum
+            }
 
-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    std::fill(std::begin(weights_), std::end(weights_), +kZero);
-    const double kSigma = 0.1 / std::sqrt(RawFeatures::kMaxActiveDimensions);
-    auto distribution = std::normal_distribution<double>(0.0, kSigma);
-    for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {
-      const auto weight = static_cast<LearnFloatType>(distribution(rng));
-      weights_[i] = weight;
-    }
-    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-      biases_[i] = static_cast<LearnFloatType>(0.5);
-    }
-    QuantizeParameters();
-  }
+            if (receive_message("reset", message)) {
+                dequantize_parameters();  // the reset message maps to dequantize_parameters (defined outside this view)
+            }
+
+            if (receive_message("quantize_parameters", message)) {
+                quantize_parameters();
+            }
+
+            if (receive_message("clear_unobserved_feature_weights", message)) {
+                clear_unobserved_feature_weights();
+            }
+
+            if (receive_message("check_health", message)) {
+                check_health();
+            }
+        }
+
+        // Initialize the parameters with random numbers, then quantize them
+        template <typename RNG>
+        void initialize(RNG& rng) {
+            std::fill(std::begin(weights_), std::end(weights_), +kZero);  // start from all-zero weights
+
+            const double kSigma = 0.1 / std::sqrt(RawFeatures::kMaxActiveDimensions);  // standard deviation of the weight distribution
+            auto distribution = std::normal_distribution<double>(0.0, kSigma);  // zero-mean normal distribution
+
+            for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {  // weights past this range, if any, keep the zero written above
+                const auto weight = static_cast<LearnFloatType>(distribution(rng));
+                weights_[i] = weight;
+            }
+
+            for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                biases_[i] = static_cast<LearnFloatType>(0.5);  // every bias starts at 0.5
+            }
+
+            quantize_parameters();  // defined outside this view
+        }
+
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
+        {  // prepares the buffers and per-thread slots for one mini-batch and returns the output buffer
+            const auto size = batch_end - batch_begin;
+
+            if ((long)output_.size() < (long)kOutputDimensions * size) {  // grow only: the buffers are never shrunk here
+                output_.resize(kOutputDimensions * size);
+                gradients_.resize(kOutputDimensions * size);
+            }
+
+            if (thread_stat_states_.size() < thread_pool.size())  // at least one statistics slot per pool thread
+            {
+                thread_stat_states_.resize(thread_pool.size());
+            }
+
+            if (thread_bias_states_.size() < thread_pool.size())  // at least one bias slot per pool thread
+            {
+                thread_bias_states_.resize(thread_pool.size());
+            }
+
+            batch_ = &*batch_begin;  // NOTE(review): dereferences batch_begin, which looks unsafe for an empty batch - confirm callers never pass one
+            batch_size_ = size;
+
+            auto& main_thread_bias_state = thread_bias_states_[0];
 
-  // forward propagation
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    if (output_.size() < kOutputDimensions * batch.size()) {
-      output_.resize(kOutputDimensions * batch.size());
-      gradients_.resize(kOutputDimensions * batch.size());
-    }
-    batch_ = &batch;
-    // affine transform
-#pragma omp parallel for
-    for (IndexType b = 0; b < batch.size(); ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType c = 0; c < 2; ++c) {
-        const IndexType output_offset = batch_offset + kHalfDimensions * c;
 #if defined(USE_BLAS)
-        cblas_scopy(kHalfDimensions, biases_, 1, &output_[output_offset], 1);
-        for (const auto& feature : batch[b].training_features[c]) {
-          const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
-          cblas_saxpy(kHalfDimensions, (float)feature.GetCount(),
-                      &weights_[weights_offset], 1, &output_[output_offset], 1);
-        }
-#else
-        for (IndexType i = 0; i < kHalfDimensions; ++i) {
-          output_[output_offset + i] = biases_[i];
-        }
-        for (const auto& feature : batch[b].training_features[c]) {
-          const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
-          for (IndexType i = 0; i < kHalfDimensions; ++i) {
-            output_[output_offset + i] +=
-                feature.GetCount() * weights_[weights_offset + i];
-          }
-        }
-#endif
-      }
-    }
-    // clipped ReLU
-    for (IndexType b = 0; b < batch.size(); ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        const IndexType index = batch_offset + i;
-        min_pre_activation_ = std::min(min_pre_activation_, output_[index]);
-        max_pre_activation_ = std::max(max_pre_activation_, output_[index]);
-        output_[index] = std::max(+kZero, std::min(+kOne, output_[index]));
-        const IndexType t = i % kHalfDimensions;
-        min_activations_[t] = std::min(min_activations_[t], output_[index]);
-        max_activations_[t] = std::max(max_activations_[t], output_[index]);
-      }
-    }
-    return output_.data();
-  }
 
-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    const LearnFloatType local_learning_rate =
-        learning_rate * learning_rate_scale_;
-    for (IndexType b = 0; b < batch_->size(); ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        const IndexType index = batch_offset + i;
-        gradients_[index] = gradients[index] *
-            ((output_[index] > kZero) * (output_[index] < kOne));
-      }
-    }
-    // Since the weight matrix updates only the columns corresponding to the features that appeared in the input,
-    // Correct the learning rate and adjust the scale without using momentum
-    const LearnFloatType effective_learning_rate =
-        static_cast<LearnFloatType>(local_learning_rate / (1.0 - momentum_));
+            cblas_sscal(  // multiplies biases_diff_ of slot 0 by momentum_
+                kHalfDimensions, momentum_, main_thread_bias_state.biases_diff_, 1
+            );
+
+#else
+
+            Blas::sscal(  // same call through the Blas helper when USE_BLAS is not defined
+                kHalfDimensions, momentum_, main_thread_bias_state.biases_diff_, 1
+            );
+
+#endif
+
+            for (IndexType i = 1; i < thread_bias_states_.size(); ++i)  // every other bias slot is reset instead of scaled
+                thread_bias_states_[i].reset();
+
+            return output_.data();
+        }
+
+        // forward propagation for samples [offset, offset + count): affine transform, then clipped ReLU with statistics
+        void propagate(Thread& th, uint64_t offset, uint64_t count) {
+
+            auto& thread_stat_state = thread_stat_states_[th.thread_idx()];  // statistics slot of the calling thread
+
+            for (IndexType b = offset; b < offset + count; ++b)
+            {
+                const IndexType batch_offset = kOutputDimensions * b;
+
+                for (IndexType c = 0; c < 2; ++c) {  // two halves of kHalfDimensions outputs per sample
+                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
+
 #if defined(USE_BLAS)
-    cblas_sscal(kHalfDimensions, momentum_, biases_diff_, 1);
-    for (IndexType b = 0; b < batch_->size(); ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType c = 0; c < 2; ++c) {
-        const IndexType output_offset = batch_offset + kHalfDimensions * c;
-        cblas_saxpy(kHalfDimensions, 1.0,
-                    &gradients_[output_offset], 1, biases_diff_, 1);
-      }
-    }
-    cblas_saxpy(kHalfDimensions, -local_learning_rate,
-                biases_diff_, 1, biases_, 1);
-#pragma omp parallel
-    {
-#if defined(_OPENMP)
-      const IndexType num_threads = omp_get_num_threads();
-      const IndexType thread_index = omp_get_thread_num();
-#endif
-      for (IndexType b = 0; b < batch_->size(); ++b) {
-        const IndexType batch_offset = kOutputDimensions * b;
-        for (IndexType c = 0; c < 2; ++c) {
-          const IndexType output_offset = batch_offset + kHalfDimensions * c;
-          for (const auto& feature : (*batch_)[b].training_features[c]) {
-#if defined(_OPENMP)
-            if (feature.GetIndex() % num_threads != thread_index) continue;
-#endif
-            const IndexType weights_offset =
-                kHalfDimensions * feature.GetIndex();
-            const auto scale = static_cast<LearnFloatType>(
-                effective_learning_rate / feature.GetCount());
-            cblas_saxpy(kHalfDimensions, -scale,
-                        &gradients_[output_offset], 1,
-                        &weights_[weights_offset], 1);
-          }
-        }
-      }
-    }
+
+                    cblas_scopy(  // each half starts from a copy of the biases
+                        kHalfDimensions, biases_, 1, &output_[output_offset], 1
+                    );
+
+                    for (const auto& feature : batch_[b].training_features[c]) {
+                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
+                        cblas_saxpy(  // adds count times the weight row of this feature
+                            kHalfDimensions, (float)feature.get_count(),
+                            &weights_[weights_offset], 1, &output_[output_offset], 1
+                        );
+                    }
+
 #else
-    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-      biases_diff_[i] *= momentum_;
-    }
-    for (IndexType b = 0; b < batch_->size(); ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType c = 0; c < 2; ++c) {
-        const IndexType output_offset = batch_offset + kHalfDimensions * c;
-        for (IndexType i = 0; i < kHalfDimensions; ++i) {
-          biases_diff_[i] += gradients_[output_offset + i];
-        }
-      }
-    }
-    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-      biases_[i] -= local_learning_rate * biases_diff_[i];
-    }
-    for (IndexType b = 0; b < batch_->size(); ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType c = 0; c < 2; ++c) {
-        const IndexType output_offset = batch_offset + kHalfDimensions * c;
-        for (const auto& feature : (*batch_)[b].training_features[c]) {
-          const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
-          const auto scale = static_cast<LearnFloatType>(
-              effective_learning_rate / feature.GetCount());
-          for (IndexType i = 0; i < kHalfDimensions; ++i) {
-            weights_[weights_offset + i] -=
-                scale * gradients_[output_offset + i];
-          }
-        }
-      }
-    }
-#endif
-    for (IndexType b = 0; b < batch_->size(); ++b) {
-      for (IndexType c = 0; c < 2; ++c) {
-        for (const auto& feature : (*batch_)[b].training_features[c]) {
-          observed_features.set(feature.GetIndex());
-        }
-      }
-    }
-  }
 
- private:
-  // constructor
-  Trainer(LayerType* target_layer) :
-      batch_(nullptr),
-      target_layer_(target_layer),
-      biases_(),
-      weights_(),
-      biases_diff_(),
-      momentum_(0.0),
-      learning_rate_scale_(1.0) {
-    min_pre_activation_ = std::numeric_limits<LearnFloatType>::max();
-    max_pre_activation_ = std::numeric_limits<LearnFloatType>::lowest();
-    std::fill(std::begin(min_activations_), std::end(min_activations_),
-              std::numeric_limits<LearnFloatType>::max());
-    std::fill(std::begin(max_activations_), std::end(max_activations_),
-              std::numeric_limits<LearnFloatType>::lowest());
-    DequantizeParameters();
-  }
-
-  // Weight saturation and parameterization
-  void QuantizeParameters() {
-    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-      target_layer_->biases_[i] =
-          Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
-    }
-    std::vector<TrainingFeature> training_features;
-#pragma omp parallel for private(training_features)
-    for (IndexType j = 0; j < RawFeatures::kDimensions; ++j) {
-      training_features.clear();
-      Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
-          j, &training_features);
-      for (IndexType i = 0; i < kHalfDimensions; ++i) {
-        double sum = 0.0;
-        for (const auto& feature : training_features) {
-          sum += weights_[kHalfDimensions * feature.GetIndex() + i];
-        }
-        target_layer_->weights_[kHalfDimensions * j + i] =
-            Round<typename LayerType::WeightType>(sum * kWeightScale);
-      }
-    }
-  }
-
-  // read parameterized integer
-  void DequantizeParameters() {
-    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-      biases_[i] = static_cast<LearnFloatType>(
-          target_layer_->biases_[i] / kBiasScale);
-    }
-    std::fill(std::begin(weights_), std::end(weights_), +kZero);
-    for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {
-      weights_[i] = static_cast<LearnFloatType>(
-          target_layer_->weights_[i] / kWeightScale);
-    }
-    std::fill(std::begin(biases_diff_), std::end(biases_diff_), +kZero);
-  }
-
-  // Set the weight corresponding to the feature that does not appear in the learning data to 0
-  void ClearUnobservedFeatureWeights() {
-    for (IndexType i = 0; i < kInputDimensions; ++i) {
-      if (!observed_features.test(i)) {
-        std::fill(std::begin(weights_) + kHalfDimensions * i,
-                  std::begin(weights_) + kHalfDimensions * (i + 1), +kZero);
-      }
-    }
-    QuantizeParameters();
-  }
-
-  // Check if there are any problems with learning
-  void CheckHealth() {
-    std::cout << "INFO: observed " << observed_features.count()
-              << " (out of " << kInputDimensions << ") features" << std::endl;
-
-    constexpr LearnFloatType kPreActivationLimit =
-        std::numeric_limits<typename LayerType::WeightType>::max() /
-        kWeightScale;
-    std::cout << "INFO: (min, max) of pre-activations = "
-              << min_pre_activation_ << ", "
-              << max_pre_activation_ << " (limit = "
-              << kPreActivationLimit << ")" << std::endl;
-
-    const auto largest_min_activation = *std::max_element(
-        std::begin(min_activations_), std::end(min_activations_));
-    const auto smallest_max_activation = *std::min_element(
-        std::begin(max_activations_), std::end(max_activations_));
-    std::cout << "INFO: largest min activation = " << largest_min_activation
-              << ", smallest max activation = " << smallest_max_activation
-              << std::endl;
-
-    std::fill(std::begin(min_activations_), std::end(min_activations_),
-              std::numeric_limits<LearnFloatType>::max());
-    std::fill(std::begin(max_activations_), std::end(max_activations_),
-              std::numeric_limits<LearnFloatType>::lowest());
-  }
-
-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions =
-      Features::Factorizer<RawFeatures>::GetDimensions();
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
-  static constexpr IndexType kHalfDimensions = LayerType::kHalfDimensions;
-
-  // Coefficient used for parameterization
-  static constexpr LearnFloatType kActivationScale =
-      std::numeric_limits<std::int8_t>::max();
-  static constexpr LearnFloatType kBiasScale = kActivationScale;
-  static constexpr LearnFloatType kWeightScale = kActivationScale;
-
-  // LearnFloatType constant
-  static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
-  static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
-
-  // mini batch
-  const std::vector<Example>* batch_;
-
-  // layer to learn
-  LayerType* const target_layer_;
-
-  // parameter
-  alignas(kCacheLineSize) LearnFloatType biases_[kHalfDimensions];
-  alignas(kCacheLineSize)
-      LearnFloatType weights_[kHalfDimensions * kInputDimensions];
-
-  // Buffer used for updating parameters
-  LearnFloatType biases_diff_[kHalfDimensions];
-  std::vector<LearnFloatType> gradients_;
-
-  // Forward propagation buffer
-  std::vector<LearnFloatType> output_;
-
-  // Features that appeared in the training data
-  std::bitset<kInputDimensions> observed_features;
-
-  // hyper parameter
-  LearnFloatType momentum_;
-  LearnFloatType learning_rate_scale_;
-
-  // Health check statistics
-  LearnFloatType min_pre_activation_;
-  LearnFloatType max_pre_activation_;
-  LearnFloatType min_activations_[kHalfDimensions];
-  LearnFloatType max_activations_[kHalfDimensions];
-};
-
-}  // namespace NNUE
-
-}  // namespace Eval
-
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+                    Blas::scopy(  // same two steps through the Blas helpers when USE_BLAS is not defined
+                        kHalfDimensions, biases_, 1, &output_[output_offset], 1
+                    );
+                    for (const auto& feature : batch_[b].training_features[c]) {
+                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
+                        Blas::saxpy(
+                            kHalfDimensions, (float)feature.get_count(),
+                            &weights_[weights_offset], &output_[output_offset]
+                        );
+                    }
+
+#endif
+                }
+            }
+
+#if defined (USE_SSE2)
+
+            {
+                static_assert(kHalfDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
+
+                auto m128_hmin_ps = [](__m128 x3210) {  // horizontal minimum of the four lanes
+                    __m128 x0032 = _mm_shuffle_ps(x3210, x3210, _MM_SHUFFLE(0, 0, 3, 2));
+                    __m128 min_x_x_13_20 = _mm_min_ps(x3210, x0032);
+                    // a = [ # , # , min(x[1], x[3]) , min(x[2], x[0]) ]
+                    __m128 min_x_x_20_13 = _mm_shuffle_ps(min_x_x_13_20, min_x_x_13_20, _MM_SHUFFLE(0, 0, 0, 1));
+                    return _mm_cvtss_f32(_mm_min_ps(min_x_x_13_20, min_x_x_20_13));
+                };
+
+                auto m128_hmax_ps = [](__m128 x3210) {  // horizontal maximum of the four lanes
+                    __m128 x0032 = _mm_shuffle_ps(x3210, x3210, _MM_SHUFFLE(0, 0, 3, 2));
+                    __m128 max_x_x_13_20 = _mm_max_ps(x3210, x0032);
+                    // a = [ # , # , max(x[1], x[3]) , max(x[2], x[0]) ]
+                    __m128 max_x_x_20_13 = _mm_shuffle_ps(max_x_x_13_20, max_x_x_13_20, _MM_SHUFFLE(0, 0, 0, 1));
+                    return _mm_cvtss_f32(_mm_max_ps(max_x_x_13_20, max_x_x_20_13));
+                };
+
+                const __m128 kZero4 = _mm_set1_ps(+kZero);
+                const __m128 kOne4 = _mm_set1_ps(+kOne);
+
+                __m128 min_pre_activation0 = _mm_set1_ps(thread_stat_state.min_pre_activation_);
+                __m128 min_pre_activation1 = _mm_set1_ps(thread_stat_state.min_pre_activation_);
+                __m128 max_pre_activation0 = _mm_set1_ps(thread_stat_state.max_pre_activation_);
+                __m128 max_pre_activation1 = _mm_set1_ps(thread_stat_state.max_pre_activation_);
+
+                for (IndexType b = offset; b < offset + count; ++b)
+                {
+                    const IndexType batch_offset = kOutputDimensions * b;
+                    for (IndexType i = 0; i < kOutputDimensions; i += 16)
+                    {
+                        __m128 out0 = _mm_loadu_ps(&output_[batch_offset + i +  0]);
+                        __m128 out1 = _mm_loadu_ps(&output_[batch_offset + i +  4]);
+                        __m128 out2 = _mm_loadu_ps(&output_[batch_offset + i +  8]);
+                        __m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]);
+
+                        __m128 min01 = _mm_min_ps(out0, out1);
+                        __m128 min23 = _mm_min_ps(out2, out3);
+
+                        __m128 max01 = _mm_max_ps(out0, out1);
+                        __m128 max23 = _mm_max_ps(out2, out3);
+
+                        min_pre_activation0 = _mm_min_ps(min_pre_activation0, min01);
+                        min_pre_activation1 = _mm_min_ps(min_pre_activation1, min23);
+                        max_pre_activation0 = _mm_max_ps(max_pre_activation0, max01);
+                        max_pre_activation1 = _mm_max_ps(max_pre_activation1, max23);
+
+                        out0 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out0));  // clamp to [kZero, kOne] after the raw values were recorded above
+                        out1 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out1));
+                        out2 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out2));
+                        out3 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out3));
+
+                        _mm_storeu_ps(&output_[batch_offset + i +  0], out0);
+                        _mm_storeu_ps(&output_[batch_offset + i +  4], out1);
+                        _mm_storeu_ps(&output_[batch_offset + i +  8], out2);
+                        _mm_storeu_ps(&output_[batch_offset + i + 12], out3);
+                    }
+                }
+
+                thread_stat_state.min_pre_activation_ = m128_hmin_ps(_mm_min_ps(min_pre_activation0, min_pre_activation1));  // fold the vector accumulators back into the scalar statistics
+                thread_stat_state.max_pre_activation_ = m128_hmax_ps(_mm_max_ps(max_pre_activation0, max_pre_activation1));
+
+                for (IndexType b = offset; b < offset + count; ++b)
+                {
+                    const IndexType batch_offset = kOutputDimensions * b;
+
+                    for (IndexType half = 0; half < 2; ++half)  // both halves update the same kHalfDimensions statistics entries
+                    {
+                        const IndexType half_offset = batch_offset + half * kHalfDimensions;
+                        for (IndexType i = 0; i < kHalfDimensions; i += 16)
+                        {
+                            const __m128 out0 = _mm_loadu_ps(&output_[i +  0 + half_offset]);
+                            const __m128 out1 = _mm_loadu_ps(&output_[i +  4 + half_offset]);
+                            const __m128 out2 = _mm_loadu_ps(&output_[i +  8 + half_offset]);
+                            const __m128 out3 = _mm_loadu_ps(&output_[i + 12 + half_offset]);
+
+                            __m128 minact0 = _mm_loadu_ps(&thread_stat_state.min_activations_[i +  0]);
+                            __m128 minact1 = _mm_loadu_ps(&thread_stat_state.min_activations_[i +  4]);
+                            __m128 minact2 = _mm_loadu_ps(&thread_stat_state.min_activations_[i +  8]);
+                            __m128 minact3 = _mm_loadu_ps(&thread_stat_state.min_activations_[i + 12]);
+
+                            __m128 maxact0 = _mm_loadu_ps(&thread_stat_state.max_activations_[i +  0]);
+                            __m128 maxact1 = _mm_loadu_ps(&thread_stat_state.max_activations_[i +  4]);
+                            __m128 maxact2 = _mm_loadu_ps(&thread_stat_state.max_activations_[i +  8]);
+                            __m128 maxact3 = _mm_loadu_ps(&thread_stat_state.max_activations_[i + 12]);
+
+                            minact0 = _mm_min_ps(out0, minact0);
+                            minact1 = _mm_min_ps(out1, minact1);
+                            minact2 = _mm_min_ps(out2, minact2);
+                            minact3 = _mm_min_ps(out3, minact3);
+
+                            maxact0 = _mm_max_ps(out0, maxact0);
+                            maxact1 = _mm_max_ps(out1, maxact1);
+                            maxact2 = _mm_max_ps(out2, maxact2);
+                            maxact3 = _mm_max_ps(out3, maxact3);
+
+                            _mm_storeu_ps(&thread_stat_state.min_activations_[i +  0], minact0);
+                            _mm_storeu_ps(&thread_stat_state.min_activations_[i +  4], minact1);
+                            _mm_storeu_ps(&thread_stat_state.min_activations_[i +  8], minact2);
+                            _mm_storeu_ps(&thread_stat_state.min_activations_[i + 12], minact3);
+
+                            _mm_storeu_ps(&thread_stat_state.max_activations_[i +  0], maxact0);
+                            _mm_storeu_ps(&thread_stat_state.max_activations_[i +  4], maxact1);
+                            _mm_storeu_ps(&thread_stat_state.max_activations_[i +  8], maxact2);
+                            _mm_storeu_ps(&thread_stat_state.max_activations_[i + 12], maxact3);
+                        }
+                    }
+                }
+            }
+
+#else
+
+            // clipped ReLU (scalar path when USE_SSE2 is not defined): record statistics and clamp one element at a time
+            for (IndexType b = offset; b < offset + count; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    const IndexType index = batch_offset + i;
+                    thread_stat_state.min_pre_activation_ = std::min(thread_stat_state.min_pre_activation_, output_[index]);
+                    thread_stat_state.max_pre_activation_ = std::max(thread_stat_state.max_pre_activation_, output_[index]);
+                    output_[index] = std::max(+kZero, std::min(+kOne, output_[index]));
+                    const IndexType t = i % kHalfDimensions;  // both halves share one statistics entry per index
+                    thread_stat_state.min_activations_[t] = std::min(thread_stat_state.min_activations_[t], output_[index]);
+                    thread_stat_state.max_activations_[t] = std::max(thread_stat_state.max_activations_[t], output_[index]);
+                }
+            }
+
+#endif
+        }
+
+        // backpropagation for batch entries [offset, offset + count), using the per-thread state selected by th
+        void backpropagate(Thread& th,
+                           const LearnFloatType* gradients,
+                           uint64_t offset,
+                           uint64_t count) {
+            // Stores the masked gradients in gradients_, counts the clipped outputs and accumulates the bias gradients.
+            auto& thread_stat_state = thread_stat_states_[th.thread_idx()];
+            auto& thread_bias_state = thread_bias_states_[th.thread_idx()];
+
+#if defined (USE_SSE2)
+
+            {
+                static_assert(kHalfDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
+                // NOTE(review): the loop below strides over kOutputDimensions, not kHalfDimensions; presumably kOutputDimensions == 2 * kHalfDimensions - confirm
+                const __m128 kZero4 = _mm_set1_ps(+kZero);
+                const __m128 kOne4 = _mm_set1_ps(+kOne);
+
+                for (IndexType b = offset; b < offset + count; ++b)
+                {
+                    const IndexType batch_offset = kOutputDimensions * b;
+                    for (IndexType i = 0; i < kOutputDimensions; i += 16)
+                    {
+                        __m128 out0 = _mm_loadu_ps(&output_[batch_offset + i + 0]);
+                        __m128 out1 = _mm_loadu_ps(&output_[batch_offset + i + 4]);
+                        __m128 out2 = _mm_loadu_ps(&output_[batch_offset + i + 8]);
+                        __m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]);
+                        // a lane is all-ones where the output is <= kZero or >= kOne, i.e. sits on a clipping bound
+                        __m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4));
+                        __m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4));
+                        __m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4));
+                        __m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), _mm_cmpge_ps(out3, kOne4));
+
+                        __m128 grad0 = _mm_loadu_ps(&gradients[batch_offset + i + 0]);
+                        __m128 grad1 = _mm_loadu_ps(&gradients[batch_offset + i + 4]);
+                        __m128 grad2 = _mm_loadu_ps(&gradients[batch_offset + i + 8]);
+                        __m128 grad3 = _mm_loadu_ps(&gradients[batch_offset + i + 12]);
+                        // (~clipped) & grad: the gradient survives only in lanes that were not clipped
+                        grad0 = _mm_andnot_ps(clipped0, grad0);
+                        grad1 = _mm_andnot_ps(clipped1, grad1);
+                        grad2 = _mm_andnot_ps(clipped2, grad2);
+                        grad3 = _mm_andnot_ps(clipped3, grad3);
+
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 0], grad0);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 4], grad1);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 8], grad2);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 12], grad3);
+                        // pack the four 4-lane sign masks into one 16-bit mask, one bit per processed float
+                        const int clipped_mask =
+                            (_mm_movemask_ps(clipped0) << 0)
+                            | (_mm_movemask_ps(clipped1) << 4)
+                            | (_mm_movemask_ps(clipped2) << 8)
+                            | (_mm_movemask_ps(clipped3) << 12);
+
+                        thread_stat_state.num_clipped_ += popcount(clipped_mask);
+                    }
+                }
+            }
+
+#else
+            // scalar version of the masking and counting done in the SSE2 branch above
+            for (IndexType b = offset; b < offset + count; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    const IndexType index = batch_offset + i;
+                    const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne);
+                    gradients_[index] = gradients[index] * !clipped;  // multiplied by 0 when clipped, by 1 otherwise
+                    thread_stat_state.num_clipped_ += clipped;
+                }
+            }
+
+#endif
+
+            thread_stat_state.num_total_ += count * kOutputDimensions;
+
+#if defined(USE_BLAS)
+            // biases_diff_ += gradients_ for both kHalfDimensions-sized halves of every entry (both halves feed the same bias vector)
+            for (IndexType b = offset; b < offset + count; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType c = 0; c < 2; ++c) {
+                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                    cblas_saxpy(
+                        kHalfDimensions, 1.0,
+                        &gradients_[output_offset], 1, thread_bias_state.biases_diff_, 1
+                    );
+                }
+            }
+
+#else
+            // same accumulation through the project's own Blas::saxpy
+            for (IndexType b = offset; b < offset + count; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType c = 0; c < 2; ++c) {
+                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                    Blas::saxpy(
+                        kHalfDimensions, 1.0,
+                        &gradients_[output_offset], 1, thread_bias_state.biases_diff_, 1
+                    );
+                }
+            }
+
+#endif
+        }
+
+        // Folds the statistics of every thread after the first into slot 0
+        void reduce_thread_stat_state() {
+            const auto num_states = thread_stat_states_.size();
+            for (IndexType other = 1; other < num_states; ++other) {
+                thread_stat_states_[0] += thread_stat_states_[other];
+            }
+        }
+
+        // Folds the bias gradients of every thread after the first into slot 0
+        void reduce_thread_bias_state() {
+            const auto num_states = thread_bias_states_.size();
+            for (IndexType other = 1; other < num_states; ++other) {
+                thread_bias_states_[0] += thread_bias_states_[other];
+            }
+        }
+
+        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
+            // Applies the accumulated gradients: biases are updated once here, weights are updated by the pool's workers.
+            const LearnFloatType local_learning_rate =
+                learning_rate * learning_rate_scale_;
+
+            // Only the weight columns of features that appeared in the input are updated, so no momentum
+            // is applied to them; instead the learning rate is scaled by 1 / (1 - momentum_).
+            const LearnFloatType effective_learning_rate =
+                static_cast<LearnFloatType>(local_learning_rate / (1.0 - momentum_));
+
+            reduce_thread_bias_state();  // afterwards slot 0 holds the bias gradients summed over all threads
+
+            auto& main_thread_state = thread_bias_states_[0];
+
+#if defined(USE_BLAS)
+            // biases_ -= local_learning_rate * biases_diff_
+            cblas_saxpy(
+                kHalfDimensions, -local_learning_rate,
+                main_thread_state.biases_diff_, 1, biases_, 1
+            );
+
+#else
+            // biases_ -= local_learning_rate * biases_diff_
+            Blas::saxpy(
+                kHalfDimensions, -local_learning_rate,
+                main_thread_state.biases_diff_, 1, biases_, 1
+            );
+
+#endif
+
+            thread_pool.execute_with_workers(
+                [&, num_threads = thread_pool.size()](Thread& th) {
+                    const auto thread_index = th.thread_idx();
+                    // every worker scans the whole batch but applies only the features that fall into its own bucket
+                    for (IndexType b = 0; b < batch_size_; ++b) {
+                        const IndexType batch_offset = kOutputDimensions * b;
+
+                        for (IndexType c = 0; c < 2; ++c) {
+                            const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                            for (const auto& feature : batch_[b].training_features[c]) {
+                                const IndexType feature_index = feature.get_index();
+                                const IndexType weights_offset =
+                                    kHalfDimensions * feature_index;
+#if defined (USE_SSE2)
+                                _mm_prefetch(reinterpret_cast<const char*>(&weights_[weights_offset]), _MM_HINT_T2);
+#endif
+
+                                // We assign each bucket a continuous range of bits at least
+                                // of cache line size to prevent false sharing.
+                                // For HalfKP this is enough to saturate about 80 threads.
+                                const IndexType thread_bucket =
+                                    (feature_index / BitsetType::best_concurrent_access_stride)
+                                    % num_threads;
+
+                                if (thread_bucket != thread_index)
+                                    continue;
+
+                                // This operation can be performed safely because
+                                // each thread accesses a different memory location
+                                // (even a different cache line)
+                                observed_features.set(feature_index);
+                                // the step for this feature is the effective learning rate divided by feature.get_count()
+                                const auto scale = static_cast<LearnFloatType>(
+                                    effective_learning_rate / feature.get_count());
+
+#if defined (USE_BLAS)
+                                // weights column of this feature -= scale * gradients_ of the current half
+                                cblas_saxpy(
+                                    kHalfDimensions, -scale,
+                                    &gradients_[output_offset], 1,
+                                    &weights_[weights_offset], 1
+                                );
+
+#else
+                                // weights column of this feature -= scale * gradients_ of the current half
+                                Blas::saxpy(
+                                    kHalfDimensions, -scale,
+                                    &gradients_[output_offset],
+                                    &weights_[weights_offset]
+                                );
+
+#endif
+                            }
+                        }
+                    }
+                }
+            );
+
+            thread_pool.wait_for_workers_finished();
+        }
+
+    private:
+        // constructor: value-initializes every scalar and array member, then loads the parameters from target_layer
+        Trainer(LayerType* target_layer) :
+            batch_(nullptr),
+            batch_size_(0),
+            target_layer_(target_layer),
+            num_total_(0),  // previously left uninitialized; listed in declaration order
+            biases_(),
+            weights_(),
+            momentum_(0.2),
+            learning_rate_scale_(1.0) {
+            dequantize_parameters();
+        }
+
+        // Writes the float parameters into target_layer_ as integers: biases scaled by kBiasScale, weights by kWeightScale, both converted with round<>()
+        void quantize_parameters() {
+            for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                target_layer_->biases_[i] =
+                    round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
+            }
+
+            std::vector<TrainingFeature> training_features;
+            // one work item per raw feature j in [0, RawFeatures::kDimensions)
+            Threads.for_each_index_with_workers(
+                0, RawFeatures::kDimensions,
+                [this, training_features](Thread&, int j) mutable {  // NOTE(review): the vector is captured by value in a mutable lambda; safe only if each worker runs its own copy of the functor - confirm
+                    training_features.clear();
+                    Features::Factorizer<RawFeatures>::append_training_features(
+                        j, &training_features);
+                    // the quantized weight of raw feature j is the sum of the trained weights of all training features appended for j
+                    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                        double sum = 0.0;
+                        for (const auto& feature : training_features) {
+                            sum += weights_[kHalfDimensions * feature.get_index() + i];
+                        }
+
+                        target_layer_->weights_[kHalfDimensions * j + i] =
+                            round<typename LayerType::WeightType>(sum * kWeightScale);
+                    }
+                }
+            );
+            Threads.wait_for_workers_finished();
+        }
+
+        // Calls reset() on the statistics of every thread
+        void reset_stats() {
+            std::for_each(thread_stat_states_.begin(), thread_stat_states_.end(), [](auto& stats) { stats.reset(); });
+        }
+
+        // Loads the float parameters from the integers stored in target_layer_ (divided by the bias/weight scale), then clears all per-thread state
+        void dequantize_parameters() {
+            for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                biases_[i] = static_cast<LearnFloatType>(
+                    target_layer_->biases_[i] / kBiasScale);
+            }
+            // start from all-zero weights; the loop below overwrites only the first kHalfDimensions * RawFeatures::kDimensions of them
+            std::fill(std::begin(weights_), std::end(weights_), +kZero);
+
+            for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {
+                weights_[i] = static_cast<LearnFloatType>(
+                    target_layer_->weights_[i] / kWeightScale);
+            }
+
+            reset_stats();
+            // also drop any bias gradients accumulated so far
+            for (auto& state : thread_bias_states_)
+                state.reset();
+        }
+
+        // Zero the weights of every feature that was never marked in observed_features, then re-quantize
+        void clear_unobserved_feature_weights() {
+            for (IndexType feature = 0; feature < kInputDimensions; ++feature) {
+                if (observed_features.test(feature))
+                    continue;  // appeared in the training data: keep its weights
+
+                std::fill_n(std::begin(weights_) + kHalfDimensions * feature, kHalfDimensions, +kZero);
+            }
+
+            quantize_parameters();
+        }
+
+        // Check if there are any problems with learning: merges the per-thread statistics, prints a report and resets the statistics
+        void check_health() {
+            // reference limit printed next to the pre-activation range: WeightType's maximum divided by kWeightScale
+            constexpr LearnFloatType kPreActivationLimit =
+                std::numeric_limits<typename LayerType::WeightType>::max() /
+                kWeightScale;
+
+            reduce_thread_stat_state();  // afterwards slot 0 holds the statistics merged over all threads
+
+            auto& main_thread_state = thread_stat_states_[0];
+            // largest of the per-slot minima and smallest of the per-slot maxima
+            const auto largest_min_activation = *std::max_element(
+                std::begin(main_thread_state.min_activations_), std::end(main_thread_state.min_activations_));
+            const auto smallest_max_activation = *std::min_element(
+                std::begin(main_thread_state.max_activations_), std::end(main_thread_state.max_activations_));
+
+            double abs_bias_sum = 0.0;
+            double abs_weight_sum = 0.0;
+
+            for(auto b : biases_)
+                abs_bias_sum += std::abs(b);
+
+            for(auto w : weights_)
+                abs_weight_sum += std::abs(w);
+            // the report is written through a sync_region_cout region; presumably this keeps it from interleaving with other output - confirm
+            auto out = sync_region_cout.new_region();
+
+            out << "INFO (check_health):"
+                << " layer " << LayerType::kLayerIndex
+                << " - " << LayerType::get_name()
+                << std::endl;
+
+            out << "  - observed " << observed_features.count()
+                << " (out of " << kInputDimensions << ") features"
+                << std::endl;
+
+            out << "  - (min, max) of pre-activations = "
+                << main_thread_state.min_pre_activation_ << ", "
+                << main_thread_state.max_pre_activation_ << " (limit = "
+                << kPreActivationLimit << ")"
+                << std::endl;
+
+            out << "  - largest min activation = " << largest_min_activation
+                << " , smallest max activation = " << smallest_max_activation
+                << std::endl;
+
+            out << "  - avg_abs_bias   = " << abs_bias_sum / std::size(biases_) << std::endl;
+            out << "  - avg_abs_weight = " << abs_weight_sum / std::size(weights_) << std::endl;
+            // NOTE(review): this is 0/0 (NaN) when num_total_ is still 0, i.e. nothing was counted since the last reset
+            out << "  - clipped " << static_cast<double>(main_thread_state.num_clipped_) / main_thread_state.num_total_ * 100.0 << "% of outputs"
+                << std::endl;
+
+            out.unlock();
+
+            reset_stats();
+        }
+
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions =
+            Features::Factorizer<RawFeatures>::get_dimensions();
+        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+        static constexpr IndexType kHalfDimensions = LayerType::kHalfDimensions;
+
+        // Coefficient used for parameterization (the int8_t maximum, 127)
+        static constexpr LearnFloatType kActivationScale =
+            std::numeric_limits<std::int8_t>::max();
+        static constexpr LearnFloatType kBiasScale = kActivationScale;
+        static constexpr LearnFloatType kWeightScale = kActivationScale;
+
+        // LearnFloatType constant
+        static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
+        static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
+
+        // mini batch
+        const Example* batch_;
+        IndexType batch_size_;
+
+        // layer to learn
+        LayerType* const target_layer_;
+
+        IndexType num_total_;  // NOTE(review): not referenced in the visible part of this class (ThreadStatState::num_total_ is the counter that gets updated) - confirm it is still needed
+
+        // parameter: float copies of the layer's biases and weights (kHalfDimensions weights per input feature)
+        alignas(kCacheLineSize) LearnFloatType biases_[kHalfDimensions];
+        alignas(kCacheLineSize)
+            LearnFloatType weights_[kHalfDimensions * kInputDimensions];
+
+        // Buffer used for updating parameters
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
+
+        // Forward propagation buffer
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
+
+        // Features that appeared in the training data
+        using BitsetType = LargeBitset<kInputDimensions>;
+        BitsetType observed_features;
+
+        // hyper parameter
+        LearnFloatType momentum_;
+        LearnFloatType learning_rate_scale_;
+
+        // Statistics collected by one thread; merged with operator+= and cleared with reset()
+        struct alignas(kCacheLineSize) ThreadStatState
+        {
+            alignas(kCacheLineSize) LearnFloatType min_activations_[kHalfDimensions];  // smallest clipped output seen per slot
+            alignas(kCacheLineSize) LearnFloatType max_activations_[kHalfDimensions];  // largest clipped output seen per slot
+            LearnFloatType min_pre_activation_;  // smallest output seen before clipping
+            LearnFloatType max_pre_activation_;  // largest output seen before clipping
+            IndexType num_clipped_;  // outputs that sat on a clipping bound during backpropagation
+            IndexType num_total_;    // outputs inspected during backpropagation
+
+            ThreadStatState() { reset(); }
+
+            // Merges the statistics of `other` into this state: minima/maxima are combined, counters are added
+            ThreadStatState& operator+=(const ThreadStatState& other)
+            {
+                for (IndexType i = 0; i < kHalfDimensions; ++i)
+                {
+                    min_activations_[i] = std::min(min_activations_[i], other.min_activations_[i]);
+                    max_activations_[i] = std::max(max_activations_[i], other.max_activations_[i]);
+                }
+
+                min_pre_activation_ = std::min(min_pre_activation_, other.min_pre_activation_);
+                max_pre_activation_ = std::max(max_pre_activation_, other.max_pre_activation_);
+
+                num_clipped_ += other.num_clipped_;
+                num_total_ += other.num_total_;
+                return *this;
+            }
+
+            // Restores the neutral state: minima start at the largest and maxima at the lowest LearnFloatType value
+            void reset()
+            {
+                constexpr LearnFloatType kLargest = std::numeric_limits<LearnFloatType>::max();
+                constexpr LearnFloatType kLowest = std::numeric_limits<LearnFloatType>::lowest();
+                std::fill(std::begin(min_activations_), std::end(min_activations_), kLargest);
+                std::fill(std::begin(max_activations_), std::end(max_activations_), kLowest);
+                min_pre_activation_ = kLargest;
+                max_pre_activation_ = kLowest;
+                num_clipped_ = 0;
+                num_total_ = 0;
+            }
+        };
+
+        // Bias gradients accumulated by one thread
+        struct alignas(kCacheLineSize) ThreadBiasState {
+            alignas(kCacheLineSize) LearnFloatType biases_diff_[kHalfDimensions];
+
+            ThreadBiasState() { reset(); }
+
+            // Adds the gradients of `other` element by element
+            ThreadBiasState& operator+=(const ThreadBiasState& other) {
+                const LearnFloatType* src = other.biases_diff_;
+                for (LearnFloatType& dst : biases_diff_) {
+                    dst += *src++;
+                }
+
+                return *this;
+            }
+
+            // Clears the accumulated gradients
+            void reset() {
+                std::fill_n(biases_diff_, kHalfDimensions, 0.0f);
+            }
+        };
+
+        std::vector<ThreadStatState, CacheLineAlignedAllocator<ThreadStatState>> thread_stat_states_;  // indexed by Thread::thread_idx(); slot 0 receives the merged statistics
+        std::vector<ThreadBiasState, CacheLineAlignedAllocator<ThreadBiasState>> thread_bias_states_;  // indexed by Thread::thread_idx(); slot 0 receives the summed bias gradients
+    };
+
+}  // namespace Eval::NNUE
 
 #endif
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index 33e39244..62a761a7 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -1,251 +1,377 @@
-﻿// Specialization of NNUE evaluation function learning class template for InputSlice
-
-#ifndef _NNUE_TRAINER_INPUT_SLICE_H_
+﻿#ifndef _NNUE_TRAINER_INPUT_SLICE_H_
 #define _NNUE_TRAINER_INPUT_SLICE_H_
 
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
-
-#include "../../learn/learn.h"
-#include "../layers/input_slice.h"
 #include "trainer.h"
 
-namespace Eval {
+#include "extra/stockfish_blas.h"
 
-namespace NNUE {
+#include "learn/learn.h"
 
-// Learning: Input layer
-class SharedInputTrainer {
- public:
-  // factory function
-  static std::shared_ptr<SharedInputTrainer> Create(
-      FeatureTransformer* feature_transformer) {
-    static std::shared_ptr<SharedInputTrainer> instance;
-    if (!instance) {
-      instance.reset(new SharedInputTrainer(feature_transformer));
-    }
-    ++instance->num_referrers_;
-    return instance;
-  }
+#include "nnue/layers/input_slice.h"
 
-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    if (num_calls_ == 0) {
-      current_operation_ = Operation::kSendMessage;
-      feature_transformer_trainer_->SendMessage(message);
-    }
-    assert(current_operation_ == Operation::kSendMessage);
-    if (++num_calls_ == num_referrers_) {
-      num_calls_ = 0;
-      current_operation_ = Operation::kNone;
-    }
-  }
+#include "thread.h"
 
-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    if (num_calls_ == 0) {
-      current_operation_ = Operation::kInitialize;
-      feature_transformer_trainer_->Initialize(rng);
-    }
-    assert(current_operation_ == Operation::kInitialize);
-    if (++num_calls_ == num_referrers_) {
-      num_calls_ = 0;
-      current_operation_ = Operation::kNone;
-    }
-  }
+// Specialization of NNUE evaluation function learning class template for InputSlice
+namespace Eval::NNUE {
 
-  // forward propagation
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    if (gradients_.size() < kInputDimensions * batch.size()) {
-      gradients_.resize(kInputDimensions * batch.size());
-    }
-    batch_size_ = static_cast<IndexType>(batch.size());
-    if (num_calls_ == 0) {
-      current_operation_ = Operation::kPropagate;
-      output_ = feature_transformer_trainer_->Propagate(batch);
-    }
-    assert(current_operation_ == Operation::kPropagate);
-    if (++num_calls_ == num_referrers_) {
-      num_calls_ = 0;
-      current_operation_ = Operation::kNone;
-    }
-    return output_;
-  }
+    // Learning: Input layer
+    // This is tricky. It exists because when there's more than one trainer
+    // on top of a single feature transformer we want to only call propagate/backpropagate
+    // on the feature transformer once. This is straightforward in the old
+    // multithreading case, because propagate/backpropagate is called just once from the
+    // main thread. But with the current implementation of coarser multithreading
+    // we end up calling each method from each thread. Therefore we have to keep
+    // the num_calls and current_operation per thread basis, each thread must work
+    // on its designated batch slice, and the only synchronization points are
+    // step_start and step_end - for which we use state of the first thread.
+    // Each thread requires their own bookkeeping because it's possible that
+    // one thread is still in propagate of some batch slice while the other thread
+    // is doing backpropagate of some other slice. We also ensure the thread state
+    // isn't susceptible to false sharing by using a full cache line for the state.
+    class SharedInputTrainer {
+    public:
+        // factory function: returns the single shared instance (created on the first call) and counts the caller as a referrer
+        static std::shared_ptr<SharedInputTrainer> create(
+            FeatureTransformer* ft) {
 
-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    if (num_referrers_ == 1) {
-      feature_transformer_trainer_->Backpropagate(gradients, learning_rate);
-      return;
-    }
-    if (num_calls_ == 0) {
-      current_operation_ = Operation::kBackPropagate;
-      for (IndexType b = 0; b < batch_size_; ++b) {
-        const IndexType batch_offset = kInputDimensions * b;
-        for (IndexType i = 0; i < kInputDimensions; ++i) {
-          gradients_[batch_offset + i] = static_cast<LearnFloatType>(0.0);
+            static std::shared_ptr<SharedInputTrainer> instance;  // function-local static: `ft` is only used by the call that creates it
+
+            if (!instance) {
+                instance.reset(new SharedInputTrainer(ft));
+            }
+
+            ++instance->num_referrers_;  // the other methods compare their call counters against this value
+
+            return instance;
         }
-      }
-    }
-    assert(current_operation_ == Operation::kBackPropagate);
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kInputDimensions * b;
-      for (IndexType i = 0; i < kInputDimensions; ++i) {
-        gradients_[batch_offset + i] += gradients[batch_offset + i];
-      }
-    }
-    if (++num_calls_ == num_referrers_) {
-      feature_transformer_trainer_->Backpropagate(
-          gradients_.data(), learning_rate);
-      num_calls_ = 0;
-      current_operation_ = Operation::kNone;
-    }
-  }
 
- private:
-  // constructor
-  SharedInputTrainer(FeatureTransformer* feature_transformer) :
-      batch_size_(0),
-      num_referrers_(0),
-      num_calls_(0),
-      current_operation_(Operation::kNone),
-      feature_transformer_trainer_(Trainer<FeatureTransformer>::Create(
-          feature_transformer)),
-      output_(nullptr) {
-  }
+        // Set options such as hyperparameters; forwarded to the feature transformer trainer once per round of referrer calls
+        void send_message(Message* message) {
+            auto& thread_state = thread_states_[0];  // this operation is tracked in the first thread's slot
 
-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions =
-      FeatureTransformer::kOutputDimensions;
+            if (thread_state.num_calls == 0) {  // first referrer of this round does the actual forwarding
+                thread_state.current_operation = Operation::kSendMessage;
+                feature_transformer_trainer_->send_message(message);
+            }
 
-  // type of processing
-  enum class Operation {
-    kNone,
-    kSendMessage,
-    kInitialize,
-    kPropagate,
-    kBackPropagate,
-  };
+            assert(thread_state.current_operation == Operation::kSendMessage);
 
-  // number of samples in mini-batch
-  IndexType batch_size_;
+            if (++thread_state.num_calls == num_referrers_) {  // last referrer of this round: rearm for the next operation
+                thread_state.num_calls = 0;
+                thread_state.current_operation = Operation::kNone;
+            }
+        }
 
-  // number of layers sharing this layer as input
-  std::uint32_t num_referrers_;
+        // Initialize the parameters with random numbers; forwarded to the feature transformer trainer once per round of referrer calls
+        template <typename RNG>
+        void initialize(RNG& rng) {
+            auto& thread_state = thread_states_[0];  // this operation is tracked in the first thread's slot
 
-  // Number of times the current process has been called
-  std::uint32_t num_calls_;
+            if (thread_state.num_calls == 0) {  // first referrer of this round does the actual forwarding
+                thread_state.current_operation = Operation::kInitialize;
+                feature_transformer_trainer_->initialize(rng);
+            }
 
-  // current processing type
-  Operation current_operation_;
+            assert(thread_state.current_operation == Operation::kInitialize);
 
-  // Trainer of input feature converter
-  const std::shared_ptr<Trainer<FeatureTransformer>>
-      feature_transformer_trainer_;
+            if (++thread_state.num_calls == num_referrers_) {  // last referrer of this round: rearm for the next operation
+                thread_state.num_calls = 0;
+                thread_state.current_operation = Operation::kNone;
+            }
+        }
 
-  // pointer to output shared for forward propagation
-  const LearnFloatType* output_;
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
+        {  // Sizes the gradient buffer and the per-thread slots for the batch, forwards to the feature transformer once per round and returns its output buffer
+            const auto size = batch_end - batch_begin;  // iterator difference type (signed)
 
-  // buffer for back propagation
-  std::vector<LearnFloatType> gradients_;
-};
+            if (static_cast<decltype(size)>(gradients_.size()) < static_cast<decltype(size)>(kInputDimensions) * size) {  // compared in the difference type: `long` is only 32 bits wide on LLP64 platforms
+                gradients_.resize(static_cast<decltype(size)>(kInputDimensions) * size);
+            }
 
-// Learning: Input layer
-template <IndexType OutputDimensions, IndexType Offset>
-class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
- private:
-  // Type of layer to learn
-  using LayerType = Layers::InputSlice<OutputDimensions, Offset>;
+            if (thread_states_.size() < thread_pool.size())  // one bookkeeping slot per pool thread; the vector never shrinks
+            {
+                thread_states_.resize(thread_pool.size());
+            }
 
- public:
-  // factory function
-  static std::shared_ptr<Trainer> Create(
-      LayerType* /*target_layer*/, FeatureTransformer* feature_transformer) {
-    return std::shared_ptr<Trainer>(new Trainer(feature_transformer));
-  }
+            batch_size_ = static_cast<IndexType>(size);  // explicit narrowing to IndexType
 
-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    shared_input_trainer_->SendMessage(message);
-  }
+            auto& thread_state = thread_states_[0];  // this operation is tracked in the first thread's slot
 
-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    shared_input_trainer_->Initialize(rng);
-  }
+            if (thread_state.num_calls == 0) {  // first referrer of this round does the actual forwarding
+                thread_state.current_operation = Operation::kStepStart;
+                output_ = feature_transformer_trainer_->step_start(thread_pool, batch_begin, batch_end);
+            }
+
+            assert(thread_state.current_operation == Operation::kStepStart);
+
+            if (++thread_state.num_calls == num_referrers_) {  // last referrer of this round: rearm for the next operation
+                thread_state.num_calls = 0;
+                thread_state.current_operation = Operation::kNone;
+            }
+
+            return output_;
+        }
+
+        // forward propagation of batch entries [offset, offset + count) on thread th
+        void propagate(Thread& th, uint64_t offset, uint64_t count) {
+            auto& state = thread_states_[th.thread_idx()];
+
+            // Only the first referrer's call on this thread forwards to the feature transformer.
+            if (state.num_calls == 0) {
+                state.current_operation = Operation::kPropagate;
+                feature_transformer_trainer_->propagate(th, offset, count);
+            }
+
+            assert(state.current_operation == Operation::kPropagate);
+
+            // After the last referrer has called in, rearm this thread's slot for the next operation.
+            if (++state.num_calls == num_referrers_) {
+                state.num_calls = 0;
+                state.current_operation = Operation::kNone;
+            }
+        }
+
+        // backpropagation: sums the gradients of all referrers for batch entries [offset, offset + count) and forwards the sum once
+        void backpropagate(Thread& th,
+                           const LearnFloatType* gradients,
+                           uint64_t offset,
+                           uint64_t count) {
+
+            const auto thread_id = th.thread_idx();
+
+            auto& thread_state = thread_states_[thread_id];
+            // with a single referrer there is nothing to sum: pass the gradients straight through
+            if (num_referrers_ == 1) {
+                feature_transformer_trainer_->backpropagate(th, gradients, offset, count);
+                return;
+            }
+            // first referrer's call on this thread: clear this slice of the accumulation buffer
+            if (thread_state.num_calls == 0) {
+                thread_state.current_operation = Operation::kBackPropagate;
+                for (IndexType b = offset; b < offset + count; ++b) {
+                    const IndexType batch_offset = kInputDimensions * b;
+                    for (IndexType i = 0; i < kInputDimensions; ++i) {
+                        gradients_[batch_offset + i] = static_cast<LearnFloatType>(0.0);
+                    }
+                }
+            }
+
+            assert(thread_state.current_operation == Operation::kBackPropagate);
+            // add this referrer's gradients for the slice
+            for (IndexType b = offset; b < offset + count; ++b) {
+                const IndexType batch_offset = kInputDimensions * b;
+                for (IndexType i = 0; i < kInputDimensions; ++i) {
+                    gradients_[batch_offset + i] += gradients[batch_offset + i];
+                }
+            }
+            // last referrer's call on this thread: forward the summed gradients and rearm the slot
+            if (++thread_state.num_calls == num_referrers_) {
+                feature_transformer_trainer_->backpropagate(
+                    th, gradients_.data(), offset, count);
+                thread_state.num_calls = 0;
+                thread_state.current_operation = Operation::kNone;
+            }
+        }
+
+        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
+            auto& thread_state = thread_states_[0];  // step_end always tracks its calls in slot 0
+            // Called once per referrer; only the first call of a round is forwarded to the wrapped trainer.
+            if (thread_state.num_calls == 0) {
+                thread_state.current_operation = Operation::kStepEnd;
+                feature_transformer_trainer_->step_end(thread_pool, learning_rate);
+            }
+
+            assert(thread_state.current_operation == Operation::kStepEnd);
+            // After num_referrers_ calls the round is complete; reset the slot for the next one.
+            if (++thread_state.num_calls == num_referrers_) {
+                thread_state.num_calls = 0;
+                thread_state.current_operation = Operation::kNone;
+            }
+        }
+
+    private:
+        // Private constructor: starts with no referrers and one thread-state slot, and creates the trainer for ft.
+        SharedInputTrainer(FeatureTransformer* ft) :
+            batch_size_(0),
+            num_referrers_(0),
+            thread_states_(1),
+            feature_transformer_trainer_(Trainer<FeatureTransformer>::create(
+                ft)),
+            output_(nullptr) {
+        }
+
+        // Values per sample handled here: the feature transformer's output width.
+        static constexpr IndexType kInputDimensions =
+            FeatureTransformer::kOutputDimensions;
+
+        // Kind of call a ThreadState is currently counting (kNone between rounds)
+        enum class Operation {
+            kNone,
+            kSendMessage,
+            kInitialize,
+            kStepStart,
+            kPropagate,
+            kBackPropagate,
+            kStepEnd,
+        };
+
+        // number of samples in mini-batch
+        IndexType batch_size_;
+
+        // Number of layers sharing this layer as input; a round of calls is complete after this many calls.
+        std::uint32_t num_referrers_;
+
+        struct alignas(kCacheLineSize) ThreadState  // every slot is aligned to kCacheLineSize
+        {
+            std::uint32_t num_calls{0};  // calls received so far in the current round
+
+            // Operation the current round belongs to
+            Operation current_operation = Operation::kNone;
+        };
+
+        // Call-tracking slots: backpropagate() picks one by th.thread_idx(), step_end() always uses slot 0.
+        std::vector<ThreadState, CacheLineAlignedAllocator<ThreadState>> thread_states_;
+
+        // Trainer of input feature converter
+        const std::shared_ptr<Trainer<FeatureTransformer>>
+            feature_transformer_trainer_;
+
+        // pointer to output shared for forward propagation
+        const LearnFloatType* output_;
+
+        // Accumulation buffer for back propagation, kInputDimensions values per sample
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
+    };
+
+    // Learning: Input layer
+    template <IndexType OutputDimensions, IndexType Offset>
+    class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
+    private:
+        // Type of layer to learn
+        using LayerType = Layers::InputSlice<OutputDimensions, Offset>;
+
+    public:
+        // Factory function: the layer pointer is unused, only the feature transformer is needed.
+        static std::shared_ptr<Trainer> create(
+            LayerType* /*target_layer*/, FeatureTransformer* ft) {
+            // The constructor is private, so trainers are created through this function.
+            return std::shared_ptr<Trainer>(new Trainer(ft));
+        }
+
+        // Set options such as hyperparameters: the message is passed on unchanged to the shared input trainer.
+        void send_message(Message* message) {
+            shared_input_trainer_->send_message(message);
+        }
+
+        // Initialize the parameters with random numbers: delegates to the shared input trainer, passing rng along.
+        template <typename RNG>
+        void initialize(RNG& rng) {
+            shared_input_trainer_->initialize(rng);
+        }
+
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
+        {  // Sizes the buffers for the batch [batch_begin, batch_end) and starts the step on the shared input trainer.
+            const auto size = batch_end - batch_begin;
+            // Buffers only grow; gradients_ is resized whenever output_ is.
+            if ((long)output_.size() < (long)kOutputDimensions * size) {
+              output_.resize(kOutputDimensions * size);
+              gradients_.resize(kInputDimensions * size);
+            }
+
+            batch_size_ = size;
+            // Keep the pointer returned by the shared trainer; propagate() reads its rows from it.
+            input_ = shared_input_trainer_->step_start(thread_pool, batch_begin, batch_end);
+            // Returns the start of output_, which propagate() fills later.
+            return output_.data();
+        }
+
+        // Forward propagation for rows [offset, offset + count): copies this layer's slice of each input row to output_.
+        void propagate(Thread& th, uint64_t offset, uint64_t count) {
+            // Run the shared trainer for the same rows first; input_ (obtained in step_start) is read below.
+            shared_input_trainer_->propagate(th, offset, count);
+            // Per row: copy kOutputDimensions values, starting at column Offset, into the matching output row.
+            for (IndexType b = offset; b < offset + count; ++b) {
+                const IndexType input_offset = kInputDimensions * b;
+                const IndexType output_offset = kOutputDimensions * b;
 
-  // forward propagation
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    if (output_.size() < kOutputDimensions * batch.size()) {
-      output_.resize(kOutputDimensions * batch.size());
-      gradients_.resize(kInputDimensions * batch.size());
-    }
-    batch_size_ = static_cast<IndexType>(batch.size());
-    const auto input = shared_input_trainer_->Propagate(batch);
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType input_offset = kInputDimensions * b;
-      const IndexType output_offset = kOutputDimensions * b;
 #if defined(USE_BLAS)
-      cblas_scopy(kOutputDimensions, &input[input_offset + Offset], 1,
-                  &output_[output_offset], 1);
+                // BLAS build: the copy is done by cblas_scopy with unit strides.
+                cblas_scopy(
+                    kOutputDimensions, &input_[input_offset + Offset], 1,
+                    &output_[output_offset], 1
+                );
 #else
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        output_[output_offset + i] = input[input_offset + Offset + i];
-      }
-#endif
-    }
-    return output_.data();
-  }
 
-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType input_offset = kInputDimensions * b;
-      const IndexType output_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kInputDimensions; ++i) {
-        if (i < Offset || i >= Offset + kOutputDimensions) {
-          gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
-        } else {
-          gradients_[input_offset + i] = gradients[output_offset + i - Offset];
+                Blas::scopy(  // non-BLAS build: same arguments, handled by the project's Blas::scopy
+                    kOutputDimensions, &input_[input_offset + Offset], 1,
+                    &output_[output_offset], 1
+                );
+
+#endif
+            }
         }
-      }
-    }
-    shared_input_trainer_->Backpropagate(gradients_.data(), learning_rate);
-  }
 
- private:
-  // constructor
-  Trainer(FeatureTransformer* feature_transformer):
-      batch_size_(0),
-      shared_input_trainer_(SharedInputTrainer::Create(feature_transformer)) {
-  }
+        // Backpropagation for rows [offset, offset + count): widens the slice gradients to full input rows, zero elsewhere.
+        void backpropagate(Thread& th,
+                           const LearnFloatType* gradients,
+                           uint64_t offset,
+                           uint64_t count) {
 
-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions =
-      FeatureTransformer::kOutputDimensions;
-  static constexpr IndexType kOutputDimensions = OutputDimensions;
-  static_assert(Offset + kOutputDimensions <= kInputDimensions, "");
+            for (IndexType b = offset; b < offset + count; ++b)
+            {
+                const IndexType input_offset = kInputDimensions * b;
+                const IndexType output_offset = kOutputDimensions * b;
 
-  // number of samples in mini-batch
-  IndexType batch_size_;
+                IndexType i = 0;  // one running column index shared by the three loops below
+                for (; i < Offset; ++i) {  // columns before the slice get a zero gradient
+                    gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
+                }
 
-  // Trainer of shared input layer
-  const std::shared_ptr<SharedInputTrainer> shared_input_trainer_;
+                for (; i < Offset + kOutputDimensions; ++i) {  // columns inside the slice take the incoming gradient
+                    gradients_[input_offset + i] = gradients[output_offset + i - Offset];
+                }
 
-  // Forward propagation buffer
-  std::vector<LearnFloatType> output_;
+                for (; i < kInputDimensions; ++i)  // columns after the slice get a zero gradient
+                {
+                    gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
+                }
+            }
 
-  // buffer for back propagation
-  std::vector<LearnFloatType> gradients_;
-};
+            shared_input_trainer_->backpropagate(th, gradients_.data(), offset, count);  // pass the widened rows on
+        }
 
-}  // namespace NNUE
+        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
+            shared_input_trainer_->step_end(thread_pool, learning_rate);  // forwarded unchanged; nothing else is done here
+        }
 
-}  // namespace Eval
+    private:
+        // Constructor: obtains the shared input trainer for ft via SharedInputTrainer::create(); input_ is not set here.
+        Trainer(FeatureTransformer* ft) :
+            batch_size_(0),
+            shared_input_trainer_(SharedInputTrainer::create(ft)) {
+        }
 
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+        // Width of a full input row (the feature transformer's output) and of this slice; the slice must fit in the row.
+        static constexpr IndexType kInputDimensions =
+            FeatureTransformer::kOutputDimensions;
+        static constexpr IndexType kOutputDimensions = OutputDimensions;
+        static_assert(Offset + kOutputDimensions <= kInputDimensions, "");
+
+        // number of samples in mini-batch
+        IndexType batch_size_;
+        // Pointer returned by the shared trainer's step_start(). NOTE(review): not initialized by the constructor.
+        const LearnFloatType* input_;
+
+        // Trainer of shared input layer
+        const std::shared_ptr<SharedInputTrainer> shared_input_trainer_;
+
+        // Forward propagation buffer, kOutputDimensions values per sample
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
+
+        // Buffer for back propagation, kInputDimensions values per sample
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
+    };
+
+}  // namespace Eval::NNUE
 
 #endif
diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h
index fb5b1532..88ff302c 100644
--- a/src/nnue/trainer/trainer_sum.h
+++ b/src/nnue/trainer/trainer_sum.h
@@ -1,190 +1,201 @@
-﻿// Specialization of NNUE evaluation function learning class template for Sum
-
-#ifndef _NNUE_TRAINER_SUM_H_
+﻿#ifndef _NNUE_TRAINER_SUM_H_
 #define _NNUE_TRAINER_SUM_H_
 
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
-
-#include "../../learn/learn.h"
-#include "../layers/sum.h"
 #include "trainer.h"
 
-namespace Eval {
+#include "extra/stockfish_blas.h"
 
-namespace NNUE {
+#include "learn/learn.h"
 
-// Learning: A layer that sums the outputs of multiple layers
-template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
-class Trainer<Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>> :
-      Trainer<Layers::Sum<RemainingPreviousLayers...>> {
- private:
-  // Type of layer to learn
-  using LayerType = Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>;
-  using Tail = Trainer<Layers::Sum<RemainingPreviousLayers...>>;
+#include "nnue/layers/sum.h"
 
- public:
-  // factory function
-  static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* feature_transformer) {
-    return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, feature_transformer));
-  }
+#include "thread.h"
 
-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    // The results of other member functions do not depend on the processing order, so
-    // Tail is processed first for the purpose of simplifying the implementation, but
-    // SendMessage processes Head first to make it easier to understand subscript correspondence
-    previous_layer_trainer_->SendMessage(message);
-    Tail::SendMessage(message);
-  }
+// Specialization of NNUE evaluation function learning class template for Sum
+namespace Eval::NNUE {
 
-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    Tail::Initialize(rng);
-    previous_layer_trainer_->Initialize(rng);
-  }
+    // Learning: A layer that sums the outputs of multiple layers
+    template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
+    class Trainer<Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>> :
+          Trainer<Layers::Sum<RemainingPreviousLayers...>> {
+    private:
+        // Type of layer to learn
+        using LayerType = Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>;
+        using Tail = Trainer<Layers::Sum<RemainingPreviousLayers...>>;
+
+    public:
+        // factory function
+        static std::shared_ptr<Trainer> create(
+            LayerType* target_layer, FeatureTransformer* ft) {
+
+            return std::shared_ptr<Trainer>(
+                new Trainer(target_layer, ft));
+        }
+
+        // Set options such as hyperparameters
+        void send_message(Message* message) {
+            // The results of the other member functions do not depend on the processing order, so
+            // they process Tail first to keep the implementation simple; send_message() processes
+            // the head layer first so that subscripts are easier to match up with the layers.
+            previous_layer_trainer_->send_message(message);
+            Tail::send_message(message);
+        }
+
+        // Initialize the parameters with random numbers
+        template <typename RNG>
+        void initialize(RNG& rng) {
+            Tail::initialize(rng);
+            previous_layer_trainer_->initialize(rng);
+        }
+
+        // forward propagation
+        /*const*/ LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
+            batch_size_ = static_cast<IndexType>(batch.size());
+            auto output = Tail::propagate(thread_pool, batch);
+            const auto head_output = previous_layer_trainer_->propagate(thread_pool, batch);
 
-  // forward propagation
-  /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    batch_size_ = static_cast<IndexType>(batch.size());
-    auto output = Tail::Propagate(batch);
-    const auto head_output = previous_layer_trainer_->Propagate(batch);
 #if defined(USE_BLAS)
-    cblas_saxpy(kOutputDimensions * batch_size_, 1.0,
-                head_output, 1, output, 1);
+
+            cblas_saxpy(
+                kOutputDimensions * batch_size_, 1.0,
+                head_output, 1, output, 1
+            );
+
 #else
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        output[batch_offset + i] += head_output[batch_offset + i];
-      }
-    }
+
+            Blas::saxpy(
+                thread_pool,
+                kOutputDimensions * batch_size_, 1.0,
+                head_output, 1, output, 1
+            );
+
 #endif
-    return output;
-  }
+            return output;
+        }
 
-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    Tail::Backpropagate(gradients, learning_rate);
-    previous_layer_trainer_->Backpropagate(gradients, learning_rate);
-  }
+        // backpropagation
+        void backpropagate(ThreadPool& thread_pool,
+                           const LearnFloatType* gradients,
+                           LearnFloatType learning_rate) {
 
- private:
-  // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer):
-      Tail(target_layer, feature_transformer),
-      batch_size_(0),
-      previous_layer_trainer_(Trainer<FirstPreviousLayer>::Create(
-          &target_layer->previous_layer_, feature_transformer)),
-      target_layer_(target_layer) {
-  }
+            Tail::backpropagate(thread_pool, gradients, learning_rate);
+            previous_layer_trainer_->backpropagate(thread_pool, gradients, learning_rate);
+        }
 
-  // number of input/output dimensions
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+    private:
+        // constructor
+        Trainer(LayerType* target_layer, FeatureTransformer* ft):
+            Tail(target_layer, ft),
+            batch_size_(0),
+            previous_layer_trainer_(Trainer<FirstPreviousLayer>::create(
+                &target_layer->previous_layer_, ft)),
+            target_layer_(target_layer) {
+        }
 
-  // make subclass friend
-  template <typename SumLayer>
-  friend class Trainer;
+        // number of input/output dimensions
+        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
 
-  // number of samples in mini-batch
-  IndexType batch_size_;
+        // make subclass friend
+        template <typename SumLayer>
+        friend class Trainer;
 
-  // Trainer of the previous layer
-  const std::shared_ptr<Trainer<FirstPreviousLayer>> previous_layer_trainer_;
+        // number of samples in mini-batch
+        IndexType batch_size_;
 
-  // layer to learn
-  LayerType* const target_layer_;
-};
+        // Trainer of the previous layer
+        const std::shared_ptr<Trainer<FirstPreviousLayer>> previous_layer_trainer_;
+
+        // layer to learn
+        LayerType* const target_layer_;
+    };
 
 
-// Learning: Layer that takes the sum of the outputs of multiple layers (when there is one template argument)
-template <typename PreviousLayer>
-class Trainer<Layers::Sum<PreviousLayer>> {
- private:
-  // Type of layer to learn
-  using LayerType = Layers::Sum<PreviousLayer>;
+    // Learning: Layer that takes the sum of the outputs of multiple layers (when there is one template argument)
+    template <typename PreviousLayer>
+    class Trainer<Layers::Sum<PreviousLayer>> {
+    private:
+        // Type of layer to learn
+        using LayerType = Layers::Sum<PreviousLayer>;
 
- public:
-  // factory function
-  static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* feature_transformer) {
-    return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, feature_transformer));
-  }
+    public:
+        // factory function
+        static std::shared_ptr<Trainer> create(
+            LayerType* target_layer, FeatureTransformer* ft) {
 
-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    previous_layer_trainer_->SendMessage(message);
-  }
+            return std::shared_ptr<Trainer>(
+                new Trainer(target_layer, ft));
+        }
 
-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    previous_layer_trainer_->Initialize(rng);
-  }
+        // Set options such as hyperparameters
+        void send_message(Message* message) {
+            previous_layer_trainer_->send_message(message);
+        }
+
+        // Initialize the parameters with random numbers
+        template <typename RNG>
+        void initialize(RNG& rng) {
+            previous_layer_trainer_->initialize(rng);
+        }
+
+        // forward propagation
+        /*const*/ LearnFloatType* propagate(const std::vector<Example>& batch) {
+            if (output_.size() < kOutputDimensions * batch.size()) {
+                output_.resize(kOutputDimensions * batch.size());
+            }
+
+            batch_size_ = static_cast<IndexType>(batch.size());
+            const auto output = previous_layer_trainer_->propagate(batch);
 
-  // forward propagation
-  /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    if (output_.size() < kOutputDimensions * batch.size()) {
-      output_.resize(kOutputDimensions * batch.size());
-    }
-    batch_size_ = static_cast<IndexType>(batch.size());
-    const auto output = previous_layer_trainer_->Propagate(batch);
 #if defined(USE_BLAS)
-    cblas_scopy(kOutputDimensions * batch_size_, output, 1, &output_[0], 1);
+            cblas_scopy(kOutputDimensions * batch_size_, output, 1, &output_[0], 1);
 #else
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        output_[batch_offset + i] = output[batch_offset + i];
-      }
-    }
-#endif
-    return output_.data();
-  }
-
-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    previous_layer_trainer_->Backpropagate(gradients, learning_rate);
-  }
-
- private:
-  // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
-      batch_size_(0),
-      previous_layer_trainer_(Trainer<PreviousLayer>::Create(
-          &target_layer->previous_layer_, feature_transformer)),
-      target_layer_(target_layer) {
-  }
-
-  // number of input/output dimensions
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
-
-  // make subclass friend
-  template <typename SumLayer>
-  friend class Trainer;
-
-  // number of samples in mini-batch
-  IndexType batch_size_;
-
-  // Trainer of the previous layer
-  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
-
-  // layer to learn
-  LayerType* const target_layer_;
-
-  // Forward propagation buffer
-  std::vector<LearnFloatType> output_;
-};
-
-}  // namespace NNUE
-
-}  // namespace Eval
-
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    output_[batch_offset + i] = output[batch_offset + i];
+                }
+            }
+
+#endif
+            return output_.data();
+        }
+
+        // Backpropagation: forwards the gradients and learning rate unchanged to the previous layer's trainer.
+        void backpropagate(const LearnFloatType* gradients,
+                           LearnFloatType learning_rate) {
+            // NOTE(review): the variadic Sum trainer calls Tail::backpropagate(thread_pool, gradients, learning_rate); this signature takes no ThreadPool - confirm the two agree.
+            previous_layer_trainer_->backpropagate(gradients, learning_rate);
+        }
+
+    private:
+        // Constructor: creates the trainer of the previous layer and remembers the layer being trained.
+        Trainer(LayerType* target_layer, FeatureTransformer* ft) :
+            batch_size_(0),
+            previous_layer_trainer_(Trainer<PreviousLayer>::create(
+                &target_layer->previous_layer_, ft)),
+            target_layer_(target_layer) {
+        }
+
+        // Number of output dimensions, taken from the layer type
+        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+
+        // Befriend every Trainer specialization; the variadic Sum trainer uses a Sum trainer as its base and calls that base's private constructor.
+        template <typename SumLayer>
+        friend class Trainer;
+
+        // number of samples in mini-batch
+        IndexType batch_size_;
+
+        // Trainer of the previous layer
+        const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+
+        // layer to learn
+        LayerType* const target_layer_;
+
+        // Forward propagation buffer: propagate() copies the previous layer's output into it
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
+    };
+
+}  // namespace Eval::NNUE
 
 #endif
diff --git a/src/pawns.cpp b/src/pawns.cpp
index af0f6618..68aaf331 100644
--- a/src/pawns.cpp
+++ b/src/pawns.cpp
@@ -30,29 +30,29 @@ namespace {
   #define S(mg, eg) make_score(mg, eg)
 
   // Pawn penalties
-  constexpr Score Backward      = S( 8, 27);
-  constexpr Score Doubled       = S(11, 55);
-  constexpr Score Isolated      = S( 5, 17);
-  constexpr Score WeakLever     = S( 2, 54);
-  constexpr Score WeakUnopposed = S(15, 25);
+  constexpr Score Backward      = S( 8, 25);
+  constexpr Score Doubled       = S(10, 55);
+  constexpr Score Isolated      = S( 3, 15);
+  constexpr Score WeakLever     = S( 3, 55);
+  constexpr Score WeakUnopposed = S(13, 25);
 
   // Bonus for blocked pawns at 5th or 6th rank
-  constexpr Score BlockedPawn[2] = { S(-13, -4), S(-4, 3) };
+  constexpr Score BlockedPawn[2] = { S(-13, -4), S(-5, 2) };
 
   constexpr Score BlockedStorm[RANK_NB] = {
     S(0, 0), S(0, 0), S(76, 78), S(-10, 15), S(-7, 10), S(-4, 6), S(-1, 2)
   };
 
   // Connected pawn bonus
-  constexpr int Connected[RANK_NB] = { 0, 7, 8, 11, 24, 45, 85 };
+  constexpr int Connected[RANK_NB] = { 0, 5, 7, 11, 24, 48, 86 };
 
   // Strength of pawn shelter for our king by [distance from edge][rank].
   // RANK_1 = 0 is used for files where we have no pawn, or pawn is behind our king.
   constexpr Value ShelterStrength[int(FILE_NB) / 2][RANK_NB] = {
-    { V( -6), V( 81), V( 93), V( 58), V( 39), V( 18), V(  25) },
-    { V(-43), V( 61), V( 35), V(-49), V(-29), V(-11), V( -63) },
-    { V(-10), V( 75), V( 23), V( -2), V( 32), V(  3), V( -45) },
-    { V(-39), V(-13), V(-29), V(-52), V(-48), V(-67), V(-166) }
+    { V( -5), V( 82), V( 92), V( 54), V( 36), V( 22), V(  28) },
+    { V(-44), V( 63), V( 33), V(-50), V(-30), V(-12), V( -62) },
+    { V(-11), V( 77), V( 22), V( -6), V( 31), V(  8), V( -45) },
+    { V(-39), V(-12), V(-29), V(-50), V(-43), V(-68), V(-164) }
   };
 
   // Danger of enemy pawns moving toward our king by [distance from edge][rank].
@@ -60,12 +60,17 @@ namespace {
   // is behind our king. Note that UnblockedStorm[0][1-2] accommodate opponent pawn
   // on edge, likely blocked by our king.
   constexpr Value UnblockedStorm[int(FILE_NB) / 2][RANK_NB] = {
-    { V( 85), V(-289), V(-166), V(97), V(50), V( 45), V( 50) },
-    { V( 46), V( -25), V( 122), V(45), V(37), V(-10), V( 20) },
-    { V( -6), V(  51), V( 168), V(34), V(-2), V(-22), V(-14) },
-    { V(-15), V( -11), V( 101), V( 4), V(11), V(-15), V(-29) }
+    { V( 87), V(-288), V(-168), V( 96), V( 47), V( 44), V( 46) },
+    { V( 42), V( -25), V( 120), V( 45), V( 34), V( -9), V( 24) },
+    { V( -8), V(  51), V( 167), V( 35), V( -4), V(-16), V(-12) },
+    { V(-17), V( -13), V( 100), V(  4), V(  9), V(-16), V(-31) }
   };
 
+  // KingOnFile[semi-open Us][semi-open Them] contains bonuses/penalties
+  // for king when the king is on a semi-open or open file.
+  constexpr Score KingOnFile[2][2] = {{ S(-19,12), S(-6, 7)  },
+                                     {  S(  0, 2), S( 6,-5) }};
+
   #undef S
   #undef V
 
@@ -147,7 +152,7 @@ namespace {
         if (support | phalanx)
         {
             int v =  Connected[r] * (2 + bool(phalanx) - bool(opposed))
-                   + 21 * popcount(support);
+                   + 22 * popcount(support);
 
             score += make_score(v, v * (r - 2) / 4);
         }
@@ -171,8 +176,8 @@ namespace {
             score -=  Doubled * doubled
                     + WeakLever * more_than_one(lever);
 
-        if (blocked && r > RANK_4)
-            score += BlockedPawn[r-4];
+        if (blocked && r >= RANK_5)
+            score += BlockedPawn[r - RANK_5];
     }
 
     return score;
@@ -237,6 +242,9 @@ Score Entry::evaluate_shelter(const Position& pos, Square ksq) const {
           bonus -= make_score(UnblockedStorm[d][theirRank], 0);
   }
 
+  // King On File
+  bonus -= KingOnFile[pos.is_on_semiopen_file(Us, ksq)][pos.is_on_semiopen_file(Them, ksq)];
+
   return bonus;
 }
 
diff --git a/src/position.cpp b/src/position.cpp
index fe89b753..934c1403 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -23,6 +23,8 @@
 #include <iomanip>
 #include <sstream>
 
+#include "nnue/evaluate_nnue.h"
+
 #include "bitboard.h"
 #include "misc.h"
 #include "movegen.h"
@@ -32,6 +34,9 @@
 #include "uci.h"
 #include "syzygy/tbprobe.h"
 
+#include "learn/packed_sfen.h"
+#include "learn/sfen_packer.h"
+
 using std::string;
 
 namespace Zobrist {
@@ -77,6 +82,8 @@ std::ostream& operator<<(std::ostream& os, const Position& pos) {
       && !pos.can_castle(ANY_CASTLING))
   {
       StateInfo st;
+      ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize);
+
       Position p;
       p.set(pos.fen(), pos.is_chess960(), &st, pos.this_thread());
       Tablebases::ProbeState s1, s2;
@@ -704,7 +711,6 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
 
   // Used by NNUE
   st->accumulator.computed_accumulation = false;
-  st->accumulator.computed_score = false;
   auto& dp = st->dirtyPiece;
   dp.dirty_num = 1;
 
@@ -755,7 +761,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
       else
           st->nonPawnMaterial[them] -= PieceValue[MG][captured];
 
-      if (Eval::useNNUE)
+      if (Eval::NNUE::useNNUE != Eval::NNUE::UseNNUEMode::False)
       {
           dp.dirty_num = 2;  // 1 piece moved, 1 piece captured
           dp.piece[1] = captured;
@@ -799,7 +805,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   // Move the piece. The tricky Chess960 castling is handled earlier
   if (type_of(m) != CASTLING)
   {
-      if (Eval::useNNUE)
+      if (Eval::NNUE::useNNUE != Eval::NNUE::UseNNUEMode::False)
       {
           dp.piece[0] = pc;
           dp.from[0] = from;
@@ -830,7 +836,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
           remove_piece(to);
           put_piece(promotion, to);
 
-          if (Eval::useNNUE)
+          if (Eval::NNUE::useNNUE != Eval::NNUE::UseNNUEMode::False)
           {
               // Promoting pawn to SQ_NONE, promoted piece from SQ_NONE
               dp.to[0] = SQ_NONE;
@@ -968,7 +974,7 @@ void Position::do_castling(Color us, Square from, Square& to, Square& rfrom, Squ
   rto = relative_square(us, kingSide ? SQ_F1 : SQ_D1);
   to = relative_square(us, kingSide ? SQ_G1 : SQ_C1);
 
-  if (Do && Eval::useNNUE)
+  if (Do && Eval::NNUE::useNNUE != Eval::NNUE::UseNNUEMode::False)
   {
       auto& dp = st->dirtyPiece;
       dp.piece[0] = make_piece(us, KING);
@@ -997,17 +1003,16 @@ void Position::do_null_move(StateInfo& newSt) {
   assert(!checkers());
   assert(&newSt != st);
 
-  if (Eval::useNNUE)
-  {
-      std::memcpy(&newSt, st, sizeof(StateInfo));
-      st->accumulator.computed_score = false;
-  }
-  else
-      std::memcpy(&newSt, st, offsetof(StateInfo, accumulator));
+  std::memcpy(&newSt, st, offsetof(StateInfo, accumulator));
 
   newSt.previous = st;
   st = &newSt;
 
+  // Used by NNUE
+  st->accumulator.computed_accumulation = false;
+  auto& dp = st->dirtyPiece;
+  dp.dirty_num = 0;
+
   if (st->epSquare != SQ_NONE)
   {
       st->key ^= Zobrist::enpassant[file_of(st->epSquare)];
@@ -1317,6 +1322,8 @@ bool Position::pos_is_ok() const {
               assert(0 && "pos_is_ok: Bitboards");
 
   StateInfo si = *st;
+  ASSERT_ALIGNED(&si, Eval::NNUE::kCacheLineSize);
+
   set_state(&si);
   if (std::memcmp(&si, st, sizeof(StateInfo)))
       assert(0 && "pos_is_ok: State");
@@ -1346,3 +1353,17 @@ bool Position::pos_is_ok() const {
 
   return true;
 }
+
+// Sets this position directly from a packed sfen, which is faster than going through a sfen string.
+// Thin wrapper: all of the work is delegated to Learner::set_from_packed_sfen().
+// Returns that function's result; non-zero means there was a problem with the passed position.
+int Position::set_from_packed_sfen(const Learner::PackedSfen& sfen , StateInfo* si, Thread* th)
+{
+  return Learner::set_from_packed_sfen(*this, sfen, si, th);
+}
+
+// Packs this position with Learner::sfen_pack() and stores the result in the caller-supplied buffer `sfen`.
+void Position::sfen_pack(Learner::PackedSfen& sfen)
+{
+  sfen = Learner::sfen_pack(*this);
+}
diff --git a/src/position.h b/src/position.h
index e3f758e0..e7513eb1 100644
--- a/src/position.h
+++ b/src/position.h
@@ -30,6 +30,9 @@
 
 #include "nnue/nnue_accumulator.h"
 
+#include "learn/packed_sfen.h"
+#include "learn/sfen_packer.h"
+
 
 /// StateInfo struct stores information needed to restore a Position object to
 /// its previous state when we retract a move. Whenever a move is made on the
@@ -75,9 +78,6 @@ typedef std::unique_ptr<std::deque<StateInfo>> StateListPtr;
 /// traversing the search tree.
 class Thread;
 
-// packed sfen
-struct PackedSfen { uint8_t data[32]; }; 
-
 class Position {
 public:
   static void init();
@@ -175,25 +175,27 @@ public:
   // Used by NNUE
   StateInfo* state() const;
 
-#if defined(EVAL_LEARN)
   // --sfenization helper
 
+  friend int Learner::set_from_packed_sfen(Position& pos, const Learner::PackedSfen& sfen, StateInfo* si, Thread* th);
+
   // Get the packed sfen. Returns to the buffer specified in the argument.
   // Do not include gamePly in pack.
-  void sfen_pack(PackedSfen& sfen);
+  void sfen_pack(Learner::PackedSfen& sfen);
 
   // It is slow to go through sfen, so I made a function to set packed sfen directly.
   // Equivalent to pos.set(sfen_unpack(data),si,th);.
   // If there is a problem with the passed phase and there is an error, non-zero is returned.
   // PackedSfen does not include gamePly so it cannot be restored. If you want to set it, specify it with an argument.
-  int set_from_packed_sfen(const PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror = false);
+  int set_from_packed_sfen(const Learner::PackedSfen& sfen, StateInfo* si, Thread* th);
+
+  void clear() { std::memset(this, 0, sizeof(Position)); }
 
   // Give the board, hand piece, and turn, and return the sfen.
   //static std::string sfen_from_rawdata(Piece board[81], Hand hands[2], Color turn, int gamePly);
 
   // Returns the position of the ball on the c side.
   Square king_square(Color c) const { return pieceList[make_piece(c, KING)][0]; }
-#endif // EVAL_LEARN
 
 private:
   // Initialization helpers (used while setting up a position)
diff --git a/src/search.cpp b/src/search.cpp
index 2d848bcd..30384868 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -23,6 +23,8 @@
 #include <iostream>
 #include <sstream>
 
+#include "nnue/evaluate_nnue.h"
+
 #include "evaluate.h"
 #include "misc.h"
 #include "movegen.h"
@@ -40,20 +42,12 @@ namespace Search {
   LimitsType Limits;
 }
 
-namespace Tablebases {
-
-  int Cardinality;
-  bool RootInTB;
-  bool UseRule50;
-  Depth ProbeDepth;
-}
-
-namespace TB = Tablebases;
-
 using std::string;
 using Eval::evaluate;
 using namespace Search;
 
+bool Search::prune_at_shallow_depth = true;
+
 namespace {
 
   // Different node types, used as a template parameter
@@ -65,17 +59,15 @@ namespace {
   // Razor and futility margins
   constexpr int RazorMargin = 510;
   Value futility_margin(Depth d, bool improving) {
-    return Value(223 * (d - improving));
+    return Value(234 * (d - improving));
   }
 
-  bool training;
-
   // Reductions lookup table, initialized at startup
   int Reductions[MAX_MOVES]; // [depth or moveNumber]
 
   Depth reduction(bool i, Depth d, int mn) {
     int r = Reductions[d] * Reductions[mn];
-    return (r + 509) / 1024 + (!i && r > 894);
+    return (r + 503) / 1024 + (!i && r > 915);
   }
 
   constexpr int futility_move_count(bool improving, Depth depth) {
@@ -166,6 +158,8 @@ namespace {
   uint64_t perft(Position& pos, Depth depth) {
 
     StateInfo st;
+    ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize);
+
     uint64_t cnt, nodes = 0;
     const bool leaf = (depth == 2);
 
@@ -194,9 +188,7 @@ namespace {
 void Search::init() {
 
   for (int i = 1; i < MAX_MOVES; ++i)
-      Reductions[i] = int((22.0 + std::log(Threads.size())) * std::log(i));
-
-  training = Options["Training"];
+      Reductions[i] = int((21.3 + 2 * std::log(Threads.size())) * std::log(i + 0.25 * std::log(i)));
 }
 
 
@@ -229,7 +221,7 @@ void MainThread::search() {
   Time.init(Limits, us, rootPos.game_ply());
   TT.new_search();
 
-  Eval::verify_NNUE();
+  Eval::NNUE::verify_eval_file_loaded();
 
   if (rootMoves.empty())
   {
@@ -412,7 +404,7 @@ void Thread::search() {
               beta  = std::min(prev + delta, VALUE_INFINITE);
 
               // Adjust contempt based on root move's previousScore (dynamic contempt)
-              int dct = ct + (105 - ct / 2) * prev / (abs(prev) + 149);
+              int dct = ct + (113 - ct / 2) * prev / (abs(prev) + 147);
 
               contempt = (us == WHITE ?  make_score(dct, dct / 2)
                                       : -make_score(dct, dct / 2));
@@ -421,7 +413,7 @@ void Thread::search() {
           // Start with a small aspiration window and, in the case of a fail
           // high/low, re-search with a bigger window until we don't fail
           // high/low anymore.
-          int failedHighCnt = 0;
+          failedHighCnt = 0;
           while (true)
           {
               Depth adjustedDepth = std::max(1, rootDepth - failedHighCnt - searchAgainCounter);
@@ -466,10 +458,7 @@ void Thread::search() {
                   ++failedHighCnt;
               }
               else
-              {
-                  ++rootMoves[pvIdx].bestMoveCount;
                   break;
-              }
 
               delta += delta / 4 + 5;
 
@@ -524,12 +513,16 @@ void Thread::search() {
               totBestMoveChanges += th->bestMoveChanges;
               th->bestMoveChanges = 0;
           }
-          double bestMoveInstability = 1 + totBestMoveChanges / Threads.size();
+          double bestMoveInstability = 1 + 2 * totBestMoveChanges / Threads.size();
 
-          double totalTime = rootMoves.size() == 1 ? 0 :
-                             Time.optimum() * fallingEval * reduction * bestMoveInstability;
+          double totalTime = Time.optimum() * fallingEval * reduction * bestMoveInstability;
 
-          // Stop the search if we have exceeded the totalTime, at least 1ms search
+          // Cap used time in case of a single legal move for a better viewer experience in tournaments
+          // yielding correct scores and sufficiently fast moves.
+          if (rootMoves.size() == 1)
+              totalTime = std::min(500.0, totalTime);
+
+          // Stop the search if we have exceeded the totalTime
           if (Time.elapsed() > totalTime)
           {
               // If we are allowed to ponder do not stop the search now but
@@ -572,6 +565,7 @@ namespace {
 
     constexpr bool PvNode = NT == PV;
     const bool rootNode = PvNode && ss->ply == 0;
+    const Depth maxNextDepth = rootNode ? depth : depth + 1;
 
     // Check if we have an upcoming move which draws by repetition, or
     // if the opponent had an alternative move earlier to this position.
@@ -596,12 +590,14 @@ namespace {
 
     Move pv[MAX_PLY+1], capturesSearched[32], quietsSearched[64];
     StateInfo st;
+    ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize);
+
     TTEntry* tte;
     Key posKey;
     Move ttMove, move, excludedMove, bestMove;
     Depth extension, newDepth;
     Value bestValue, value, ttValue, eval, maxValue, probCutBeta;
-    bool ttHit, ttPv, formerPv, givesCheck, improving, didLMR, priorCapture;
+    bool formerPv, givesCheck, improving, didLMR, priorCapture;
     bool captureOrPromotion, doFullDepthSearch, moveCountPruning,
          ttCapture, singularQuietLMR;
     Piece movedPiece;
@@ -648,6 +644,7 @@ namespace {
     assert(0 <= ss->ply && ss->ply < MAX_PLY);
 
     (ss+1)->ply = ss->ply + 1;
+    (ss+1)->ttPv = false;
     (ss+1)->excludedMove = bestMove = MOVE_NONE;
     (ss+2)->killers[0] = (ss+2)->killers[1] = MOVE_NONE;
     Square prevSq = to_sq((ss-1)->currentMove);
@@ -657,9 +654,7 @@ namespace {
     // starts with statScore = 0. Later grandchildren start with the last calculated
     // statScore of the previous grandchild. This influences the reduction rules in
     // LMR which are based on the statScore of parent position.
-    if (rootNode)
-        (ss+4)->statScore = 0;
-    else
+    if (!rootNode)
         (ss+2)->statScore = 0;
 
     // Step 4. Transposition table lookup. We don't want the score of a partial
@@ -667,14 +662,15 @@ namespace {
     // position key in case of an excluded move.
     excludedMove = ss->excludedMove;
     posKey = excludedMove == MOVE_NONE ? pos.key() : pos.key() ^ make_key(excludedMove);
-    tte = TT.probe(posKey, ttHit);
-    ttValue = ttHit ? value_from_tt(tte->value(), ss->ply, pos.rule50_count()) : VALUE_NONE;
+    tte = TT.probe(posKey, ss->ttHit);
+    ttValue = ss->ttHit ? value_from_tt(tte->value(), ss->ply, pos.rule50_count()) : VALUE_NONE;
     ttMove =  rootNode ? thisThread->rootMoves[thisThread->pvIdx].pv[0]
-            : ttHit    ? tte->move() : MOVE_NONE;
-    ttPv = PvNode || (ttHit && tte->is_pv());
-    formerPv = ttPv && !PvNode;
+            : ss->ttHit    ? tte->move() : MOVE_NONE;
+    if (!excludedMove)
+        ss->ttPv = PvNode || (ss->ttHit && tte->is_pv());
+    formerPv = ss->ttPv && !PvNode;
 
-    if (   ttPv
+    if (   ss->ttPv
         && depth > 12
         && ss->ply - 1 < MAX_LPH
         && !priorCapture
@@ -683,11 +679,11 @@ namespace {
 
     // thisThread->ttHitAverage can be used to approximate the running average of ttHit
     thisThread->ttHitAverage =   (TtHitAverageWindow - 1) * thisThread->ttHitAverage / TtHitAverageWindow
-                                + TtHitAverageResolution * ttHit;
+                                + TtHitAverageResolution * ss->ttHit;
 
     // At non-PV nodes we check for an early TT cutoff
     if (  !PvNode
-        && ttHit
+        && ss->ttHit
         && tte->depth() >= depth
         && ttValue != VALUE_NONE // Possible in case of TT access race
         && (ttValue >= beta ? (tte->bound() & BOUND_LOWER)
@@ -719,27 +715,27 @@ namespace {
     }
 
     // Step 5. Tablebases probe
-    if (!rootNode && TB::Cardinality)
+    if (!rootNode && thisThread->Cardinality)
     {
         int piecesCount = pos.count<ALL_PIECES>();
 
-        if (    piecesCount <= TB::Cardinality
-            && (piecesCount <  TB::Cardinality || depth >= TB::ProbeDepth)
+        if (    piecesCount <= thisThread->Cardinality
+            && (piecesCount <  thisThread->Cardinality || depth >= thisThread->ProbeDepth)
             &&  pos.rule50_count() == 0
             && !pos.can_castle(ANY_CASTLING))
         {
-            TB::ProbeState err;
-            TB::WDLScore wdl = Tablebases::probe_wdl(pos, &err);
+            Tablebases::ProbeState err;
+            Tablebases::WDLScore wdl = Tablebases::probe_wdl(pos, &err);
 
             // Force check of time on the next occasion
             if (thisThread == Threads.main())
                 static_cast<MainThread*>(thisThread)->callsCnt = 0;
 
-            if (err != TB::ProbeState::FAIL)
+            if (err != Tablebases::ProbeState::FAIL)
             {
                 thisThread->tbHits.fetch_add(1, std::memory_order_relaxed);
 
-                int drawScore = TB::UseRule50 ? 1 : 0;
+                int drawScore = thisThread->UseRule50 ? 1 : 0;
 
                 // use the range VALUE_MATE_IN_MAX_PLY to VALUE_TB_WIN_IN_MAX_PLY to score
                 value =  wdl < -drawScore ? VALUE_MATED_IN_MAX_PLY + ss->ply + 1
@@ -752,7 +748,7 @@ namespace {
                 if (    b == BOUND_EXACT
                     || (b == BOUND_LOWER ? value >= beta : value <= alpha))
                 {
-                    tte->save(posKey, value_to_tt(value, ss->ply), ttPv, b,
+                    tte->save(posKey, value_to_tt(value, ss->ply), ss->ttPv, b,
                               std::min(MAX_PLY - 1, depth + 6),
                               MOVE_NONE, VALUE_NONE);
 
@@ -780,7 +776,7 @@ namespace {
         improving = false;
         goto moves_loop;
     }
-    else if (ttHit)
+    else if (ss->ttHit)
     {
         // Never assume anything about values stored in TT
         ss->staticEval = eval = tte->eval();
@@ -802,7 +798,7 @@ namespace {
         else
             ss->staticEval = eval = -(ss-1)->staticEval + 2 * Tempo;
 
-        tte->save(posKey, VALUE_NONE, ttPv, BOUND_NONE, DEPTH_NONE, MOVE_NONE, eval);
+        tte->save(posKey, VALUE_NONE, ss->ttPv, BOUND_NONE, DEPTH_NONE, MOVE_NONE, eval);
     }
 
     // Step 7. Razoring (~1 Elo)
@@ -828,7 +824,7 @@ namespace {
         && (ss-1)->statScore < 22977
         &&  eval >= beta
         &&  eval >= ss->staticEval
-        &&  ss->staticEval >= beta - 30 * depth - 28 * improving + 84 * ttPv + 182
+        &&  ss->staticEval >= beta - 30 * depth - 28 * improving + 84 * ss->ttPv + 168
         && !excludedMove
         &&  pos.non_pawn_material(us)
         && (ss->ply >= thisThread->nmpMinPly || us != thisThread->nmpColor))
@@ -836,7 +832,7 @@ namespace {
         assert(eval - beta >= 0);
 
         // Null move dynamic reduction based on depth and value
-        Depth R = (817 + 71 * depth) / 213 + std::min(int(eval - beta) / 192, 3);
+        Depth R = (1015 + 85 * depth) / 256 + std::min(int(eval - beta) / 191, 3);
 
         ss->currentMove = MOVE_NULL;
         ss->continuationHistory = &thisThread->continuationHistory[0][0][NO_PIECE][0];
@@ -853,7 +849,7 @@ namespace {
             if (nullValue >= VALUE_TB_WIN_IN_MAX_PLY)
                 nullValue = beta;
 
-            if (thisThread->nmpMinPly || (abs(beta) < VALUE_KNOWN_WIN && depth < 13))
+            if (thisThread->nmpMinPly || (abs(beta) < VALUE_KNOWN_WIN && depth < 14))
                 return nullValue;
 
             assert(!thisThread->nmpMinPly); // Recursive verification is not allowed
@@ -872,7 +868,7 @@ namespace {
         }
     }
 
-    probCutBeta = beta + 176 - 49 * improving;
+    probCutBeta = beta + 183 - 49 * improving;
 
     // Step 10. ProbCut (~10 Elo)
     // If we have a good enough capture and a reduced search returns a value
@@ -884,14 +880,14 @@ namespace {
         // there and in further interactions with transposition table cutoff depth is set to depth - 3
         // because probCut search has depth set to depth - 4 but we also do a move before it
         // so effective depth is equal to depth - 3
-        && !(   ttHit
+        && !(   ss->ttHit
              && tte->depth() >= depth - 3
              && ttValue != VALUE_NONE
              && ttValue < probCutBeta))
     {
         // if ttMove is a capture and value from transposition table is good enough produce probCut
         // cutoff without digging into actual probCut search
-        if (   ttHit
+        if (   ss->ttHit
             && tte->depth() >= depth - 3
             && ttValue != VALUE_NONE
             && ttValue >= probCutBeta
@@ -902,6 +898,8 @@ namespace {
         assert(probCutBeta < VALUE_INFINITE);
         MovePicker mp(pos, ttMove, probCutBeta - ss->staticEval, &captureHistory);
         int probCutCount = 0;
+        bool ttPv = ss->ttPv;
+        ss->ttPv = false;
 
         while (   (move = mp.next_move()) != MOVE_NONE
                && probCutCount < 2 + 2 * cutNode)
@@ -933,7 +931,7 @@ namespace {
                 if (value >= probCutBeta)
                 {
                     // if transposition table doesn't have equal or more deep info write probCut data into it
-                    if ( !(ttHit
+                    if ( !(ss->ttHit
                        && tte->depth() >= depth - 3
                        && ttValue != VALUE_NONE))
                         tte->save(posKey, value_to_tt(value, ss->ply), ttPv,
@@ -942,8 +940,15 @@ namespace {
                     return value;
                 }
             }
+         ss->ttPv = ttPv;
     }
 
+    // Step 11. At PV nodes with depth >= 6 and no TT move (e.g. position not in TT), decrease depth by 2
+    if (   PvNode
+        && depth >= 6
+        && !ttMove)
+        depth -= 2;
+
 moves_loop: // When in check, search starts from here
 
     const PieceToHistory* contHist[] = { (ss-1)->continuationHistory, (ss-2)->continuationHistory,
@@ -967,7 +972,7 @@ moves_loop: // When in check, search starts from here
     // Mark this node as being searched
     ThreadHolding th(thisThread, posKey, ss->ply);
 
-    // Step 11. Loop through all pseudo-legal moves until no moves remain
+    // Step 12. Loop through all pseudo-legal moves until no moves remain
     // or a beta cutoff occurs.
     while ((move = mp.next_move(moveCountPruning)) != MOVE_NONE)
     {
@@ -991,9 +996,7 @@ moves_loop: // When in check, search starts from here
       ss->moveCount = ++moveCount;
 
       if (rootNode && thisThread == Threads.main() && Time.elapsed() > 3000
-#if defined(EVAL_LEARN)
           && !Limits.silent
-#endif
           )
           sync_cout << "info depth " << depth
                     << " currmove " << UCI::move(move, pos.is_chess960())
@@ -1009,9 +1012,9 @@ moves_loop: // When in check, search starts from here
       // Calculate new depth for this move
       newDepth = depth - 1;
 
-      // Step 12. Pruning at shallow depth (~200 Elo)
+      // Step 13. Pruning at shallow depth (~200 Elo)
       if (  !rootNode
-          && !(training && PvNode)
+          && (PvNode ? prune_at_shallow_depth : true)
           && pos.non_pawn_material(us)
           && bestValue > VALUE_TB_LOSS_IN_MAX_PLY)
       {
@@ -1033,7 +1036,7 @@ moves_loop: // When in check, search starts from here
               // Futility pruning: parent node (~5 Elo)
               if (   lmrDepth < 7
                   && !ss->inCheck
-                  && ss->staticEval + 283 + 170 * lmrDepth <= alpha
+                  && ss->staticEval + 266 + 170 * lmrDepth <= alpha
                   &&  (*contHist[0])[movedPiece][to_sq(move)]
                     + (*contHist[1])[movedPiece][to_sq(move)]
                     + (*contHist[3])[movedPiece][to_sq(move)]
@@ -1041,7 +1044,7 @@ moves_loop: // When in check, search starts from here
                   continue;
 
               // Prune moves with negative SEE (~20 Elo)
-              if (!pos.see_ge(move, Value(-(29 - std::min(lmrDepth, 18)) * lmrDepth * lmrDepth)))
+              if (!pos.see_ge(move, Value(-(30 - std::min(lmrDepth, 18)) * lmrDepth * lmrDepth)))
                   continue;
           }
           else
@@ -1052,23 +1055,13 @@ moves_loop: // When in check, search starts from here
                   && captureHistory[movedPiece][to_sq(move)][type_of(pos.piece_on(to_sq(move)))] < 0)
                   continue;
 
-              // Futility pruning for captures
-              if (   !givesCheck
-                  && lmrDepth < 6
-                  && !(PvNode && abs(bestValue) < 2)
-                  && PieceValue[MG][type_of(movedPiece)] >= PieceValue[MG][type_of(pos.piece_on(to_sq(move)))]
-                  && !ss->inCheck
-                  && ss->staticEval + 169 + 244 * lmrDepth
-                     + PieceValue[MG][type_of(pos.piece_on(to_sq(move)))] <= alpha)
-                  continue;
-
-              // See based pruning
-              if (!pos.see_ge(move, Value(-221) * depth)) // (~25 Elo)
+              // SEE based pruning
+              if (!pos.see_ge(move, Value(-213) * depth)) // (~25 Elo)
                   continue;
           }
       }
 
-      // Step 13. Extensions (~75 Elo)
+      // Step 14. Extensions (~75 Elo)
 
       // Singular extension search (~70 Elo). If all moves but one fail low on a
       // search of (alpha-s, beta-s), and just one fails high on (alpha, beta),
@@ -1127,11 +1120,6 @@ moves_loop: // When in check, search starts from here
                && pos.non_pawn_material() <= 2 * RookValueMg)
           extension = 1;
 
-      // Castling extension
-      if (   type_of(move) == CASTLING
-          && popcount(pos.pieces(us) & ~pos.pieces(PAWN) & (to_sq(move) & KingSide ? KingSide : QueenSide)) <= 2)
-          extension = 1;
-
       // Late irreversible move extension
       if (   move == ttMove
           && pos.rule50_count() > 80
@@ -1151,41 +1139,37 @@ moves_loop: // When in check, search starts from here
                                                                 [movedPiece]
                                                                 [to_sq(move)];
 
-      // Step 14. Make the move
+      // Step 15. Make the move
       pos.do_move(move, st, givesCheck);
 
-      // Step 15. Reduced depth search (LMR, ~200 Elo). If the move fails high it will be
+      // Step 16. Reduced depth search (LMR, ~200 Elo). If the move fails high it will be
       // re-searched at full depth.
       if (    depth >= 3
-          &&  moveCount > 1 + 2 * rootNode + 2 * (PvNode && abs(bestValue) < 2)
-          && (!rootNode || thisThread->best_move_count(move) == 0)
+          &&  moveCount > 1 + 2 * rootNode
           && (  !captureOrPromotion
               || moveCountPruning
               || ss->staticEval + PieceValue[EG][pos.captured_piece()] <= alpha
               || cutNode
-              || thisThread->ttHitAverage < 427 * TtHitAverageResolution * TtHitAverageWindow / 1024))
+              || thisThread->ttHitAverage < 432 * TtHitAverageResolution * TtHitAverageWindow / 1024))
       {
           Depth r = reduction(improving, depth, moveCount);
 
-          // Decrease reduction at non-check cut nodes for second move at low depths
-          if (   cutNode
-              && depth <= 10
-              && moveCount <= 2
-              && !ss->inCheck)
-              r--;
-
           // Decrease reduction if the ttHit running average is large
-          if (thisThread->ttHitAverage > 509 * TtHitAverageResolution * TtHitAverageWindow / 1024)
+          if (thisThread->ttHitAverage > 537 * TtHitAverageResolution * TtHitAverageWindow / 1024)
               r--;
 
-          // Reduction if other threads are searching this position
+          // Increase reduction if other threads are searching this position
           if (th.marked())
               r++;
 
           // Decrease reduction if position is or has been on the PV (~10 Elo)
-          if (ttPv)
+          if (ss->ttPv)
               r -= 2;
 
+          // Increase reduction at root and non-PV nodes when the best move does not change frequently
+          if ((rootNode || !PvNode) && depth > 10 && thisThread->bestMoveChanges <= 2)
+              r++;
+
           if (moveCountPruning && !formerPv)
               r++;
 
@@ -1195,7 +1179,7 @@ moves_loop: // When in check, search starts from here
 
           // Decrease reduction if ttMove has been singularly extended (~3 Elo)
           if (singularQuietLMR)
-              r -= 1 + formerPv;
+              r--;
 
           if (!captureOrPromotion)
           {
@@ -1203,6 +1187,9 @@ moves_loop: // When in check, search starts from here
               if (ttCapture)
                   r++;
 
+              // Increase reduction at root if failing high
+              r += rootNode ? thisThread->failedHighCnt * thisThread->failedHighCnt * moveCount / 512 : 0;
+
               // Increase reduction for cut nodes (~10 Elo)
               if (cutNode)
                   r += 2;
@@ -1212,7 +1199,7 @@ moves_loop: // When in check, search starts from here
               // hence break make_move(). (~2 Elo)
               else if (    type_of(move) == NORMAL
                        && !pos.see_ge(reverse_move(move)))
-                  r -= 2 + ttPv - (type_of(movedPiece) == PAWN);
+                  r -= 2 + ss->ttPv - (type_of(movedPiece) == PAWN);
 
               ss->statScore =  thisThread->mainHistory[us][from_to(move)]
                              + (*contHist[0])[movedPiece][to_sq(move)]
@@ -1221,10 +1208,10 @@ moves_loop: // When in check, search starts from here
                              - 5287;
 
               // Decrease/increase reduction by comparing opponent's stat score (~10 Elo)
-              if (ss->statScore >= -106 && (ss-1)->statScore < -104)
+              if (ss->statScore >= -105 && (ss-1)->statScore < -103)
                   r--;
 
-              else if ((ss-1)->statScore >= -119 && ss->statScore < -140)
+              else if ((ss-1)->statScore >= -122 && ss->statScore < -129)
                   r++;
 
               // Decrease/increase reduction for moves with a good/bad history (~30 Elo)
@@ -1232,14 +1219,14 @@ moves_loop: // When in check, search starts from here
           }
           else
           {
-            // Increase reduction for captures/promotions if late move and at low depth
-            if (depth < 8 && moveCount > 2)
-                r++;
+              // Increase reduction for captures/promotions if late move and at low depth
+              if (depth < 8 && moveCount > 2)
+                  r++;
 
-            // Unless giving check, this capture is likely bad
-            if (   !givesCheck
-                && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 213 * depth <= alpha)
-                r++;
+              // Unless giving check, this capture is likely bad
+              if (   !givesCheck
+                  && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 210 * depth <= alpha)
+                  r++;
           }
 
           Depth d = std::clamp(newDepth - r, 1, newDepth);
@@ -1257,7 +1244,7 @@ moves_loop: // When in check, search starts from here
           didLMR = false;
       }
 
-      // Step 16. Full depth search when LMR is skipped or fails high
+      // Step 17. Full depth search when LMR is skipped or fails high
       if (doFullDepthSearch)
       {
           value = -search<NonPV>(pos, ss+1, -(alpha+1), -alpha, newDepth, !cutNode);
@@ -1282,15 +1269,16 @@ moves_loop: // When in check, search starts from here
           (ss+1)->pv = pv;
           (ss+1)->pv[0] = MOVE_NONE;
 
-          value = -search<PV>(pos, ss+1, -beta, -alpha, newDepth, false);
+          value = -search<PV>(pos, ss+1, -beta, -alpha,
+                              std::min(maxNextDepth, newDepth), false);
       }
 
-      // Step 17. Undo move
+      // Step 18. Undo move
       pos.undo_move(move);
 
       assert(value > -VALUE_INFINITE && value < VALUE_INFINITE);
 
-      // Step 18. Check for a new best move
+      // Step 19. Check for a new best move
       // Finished searching the move. If a stop occurred, the return value of
       // the search cannot be trusted, and we return immediately without
       // updating best move, PV and TT.
@@ -1367,7 +1355,7 @@ moves_loop: // When in check, search starts from here
         return VALUE_DRAW;
     */
 
-    // Step 19. Check for mate and stalemate
+    // Step 20. Check for mate and stalemate
     // All legal moves have been searched and if there are no legal moves, it
     // must be a mate or a stalemate. If we are in a singular extension search then
     // return a fail low score.
@@ -1390,8 +1378,17 @@ moves_loop: // When in check, search starts from here
     if (PvNode)
         bestValue = std::min(bestValue, maxValue);
 
+    // If no good move is found and the previous position was ttPv, then the previous
+    // opponent move is probably good and the new position is added to the search tree.
+    if (bestValue <= alpha)
+        ss->ttPv = ss->ttPv || ((ss-1)->ttPv && depth > 3);
+    // Otherwise, a counter move has been found and if the position is the last leaf
+    // in the search tree, remove the position from the search tree.
+    else if (depth > 3)
+        ss->ttPv = ss->ttPv && (ss+1)->ttPv;
+
     if (!excludedMove && !(rootNode && thisThread->pvIdx))
-        tte->save(posKey, value_to_tt(bestValue, ss->ply), ttPv,
+        tte->save(posKey, value_to_tt(bestValue, ss->ply), ss->ttPv,
                   bestValue >= beta ? BOUND_LOWER :
                   PvNode && bestMove ? BOUND_EXACT : BOUND_UPPER,
                   depth, bestMove, ss->staticEval);
@@ -1415,12 +1412,14 @@ moves_loop: // When in check, search starts from here
 
     Move pv[MAX_PLY+1];
     StateInfo st;
+    ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize);
+
     TTEntry* tte;
     Key posKey;
     Move ttMove, move, bestMove;
     Depth ttDepth;
     Value bestValue, value, ttValue, futilityValue, futilityBase, oldAlpha;
-    bool ttHit, pvHit, givesCheck, captureOrPromotion;
+    bool pvHit, givesCheck, captureOrPromotion;
     int moveCount;
 
     if (PvNode)
@@ -1450,13 +1449,13 @@ moves_loop: // When in check, search starts from here
                                                   : DEPTH_QS_NO_CHECKS;
     // Transposition table lookup
     posKey = pos.key();
-    tte = TT.probe(posKey, ttHit);
-    ttValue = ttHit ? value_from_tt(tte->value(), ss->ply, pos.rule50_count()) : VALUE_NONE;
-    ttMove = ttHit ? tte->move() : MOVE_NONE;
-    pvHit = ttHit && tte->is_pv();
+    tte = TT.probe(posKey, ss->ttHit);
+    ttValue = ss->ttHit ? value_from_tt(tte->value(), ss->ply, pos.rule50_count()) : VALUE_NONE;
+    ttMove = ss->ttHit ? tte->move() : MOVE_NONE;
+    pvHit = ss->ttHit && tte->is_pv();
 
     if (  !PvNode
-        && ttHit
+        && ss->ttHit
         && tte->depth() >= ttDepth
         && ttValue != VALUE_NONE // Only in case of TT access race
         && (ttValue >= beta ? (tte->bound() & BOUND_LOWER)
@@ -1471,7 +1470,7 @@ moves_loop: // When in check, search starts from here
     }
     else
     {
-        if (ttHit)
+        if (ss->ttHit)
         {
             // Never assume anything about values stored in TT
             if ((ss->staticEval = bestValue = tte->eval()) == VALUE_NONE)
@@ -1490,7 +1489,7 @@ moves_loop: // When in check, search starts from here
         // Stand pat. Return immediately if static value is at least beta
         if (bestValue >= beta)
         {
-            if (!ttHit)
+            if (!ss->ttHit)
                 tte->save(posKey, value_to_tt(bestValue, ss->ply), false, BOUND_LOWER,
                           DEPTH_NONE, MOVE_NONE, ss->staticEval);
 
@@ -1500,7 +1499,7 @@ moves_loop: // When in check, search starts from here
         if (PvNode && bestValue > alpha)
             alpha = bestValue;
 
-        futilityBase = bestValue + 145;
+        futilityBase = bestValue + 155;
     }
 
     const PieceToHistory* contHist[] = { (ss-1)->continuationHistory, (ss-2)->continuationHistory,
@@ -1527,7 +1526,7 @@ moves_loop: // When in check, search starts from here
       moveCount++;
 
       // Futility pruning
-      if (   !ss->inCheck
+      if (    bestValue > VALUE_TB_LOSS_IN_MAX_PLY
           && !givesCheck
           &&  futilityBase > -VALUE_KNOWN_WIN
           && !pos.advanced_pawn_push(move))
@@ -1554,20 +1553,16 @@ moves_loop: // When in check, search starts from here
       }
 
       // Do not search moves with negative SEE values
-      if (  !ss->inCheck && !pos.see_ge(move))
+      if (    bestValue > VALUE_TB_LOSS_IN_MAX_PLY
+          && !(givesCheck && pos.is_discovery_check_on_king(~pos.side_to_move(), move))
+          && !pos.see_ge(move))
           continue;
 
       // Speculative prefetch as early as possible
       prefetch(TT.first_entry(pos.key_after(move)));
 
       // Check for legality just before making the move
-      if (
-#if defined(EVAL_LEARN)
-        // HACK: pos.piece_on(from_sq(m)) sometimes will be NO_PIECE during machine learning.
-        !pos.pseudo_legal(move) ||
-#endif // EVAL_LEARN
-        !pos.legal(move)
-        )
+      if (!pos.legal(move))
       {
           moveCount--;
           continue;
@@ -1579,8 +1574,9 @@ moves_loop: // When in check, search starts from here
                                                                 [pos.moved_piece(move)]
                                                                 [to_sq(move)];
 
+      // CounterMove based pruning
       if (  !captureOrPromotion
-          && moveCount >= abs(depth) + 1
+          && bestValue > VALUE_TB_LOSS_IN_MAX_PLY
           && (*contHist[0])[pos.moved_piece(move)][to_sq(move)] < CounterMovePruneThreshold
           && (*contHist[1])[pos.moved_piece(move)][to_sq(move)] < CounterMovePruneThreshold)
           continue;
@@ -1615,7 +1611,11 @@ moves_loop: // When in check, search starts from here
     // All legal moves have been searched. A special case: if we're in check
     // and no legal moves were found, it is checkmate.
     if (ss->inCheck && bestValue == -VALUE_INFINITE)
+    {
+        assert(!MoveList<LEGAL>(pos).size());
+
         return mated_in(ss->ply); // Plies to mate from the root
+    }
 
     tte->save(posKey, value_to_tt(bestValue, ss->ply), pvHit,
               bestValue >= beta ? BOUND_LOWER :
@@ -1712,8 +1712,8 @@ moves_loop: // When in check, search starts from here
     else
         captureHistory[moved_piece][to_sq(bestMove)][captured] << bonus1;
 
-    // Extra penalty for a quiet TT or main killer move in previous ply when it gets refuted
-    if (   ((ss-1)->moveCount == 1 || ((ss-1)->currentMove == (ss-1)->killers[0]))
+    // Extra penalty for a quiet early move that was not a TT move or main killer move in previous ply when it gets refuted
+    if (   ((ss-1)->moveCount == 1 + (ss-1)->ttHit || ((ss-1)->currentMove == (ss-1)->killers[0]))
         && !pos.captured_piece())
             update_continuation_histories(ss-1, pos.piece_on(prevSq), prevSq, -bonus1);
 
@@ -1850,19 +1850,22 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) {
   size_t pvIdx = pos.this_thread()->pvIdx;
   size_t multiPV = std::min((size_t)Options["MultiPV"], rootMoves.size());
   uint64_t nodesSearched = Threads.nodes_searched();
-  uint64_t tbHits = Threads.tb_hits() + (TB::RootInTB ? rootMoves.size() : 0);
+  uint64_t tbHits = Threads.tb_hits() + (pos.this_thread()->rootInTB ? rootMoves.size() : 0);
 
   for (size_t i = 0; i < multiPV; ++i)
   {
       bool updated = rootMoves[i].score != -VALUE_INFINITE;
 
-      if (depth == 1 && !updated)
+      if (depth == 1 && !updated && i > 0)
           continue;
 
-      Depth d = updated ? depth : depth - 1;
+      Depth d = updated ? depth : std::max(1, depth - 1);
       Value v = updated ? rootMoves[i].score : rootMoves[i].previousScore;
 
-      bool tb = TB::RootInTB && abs(v) < VALUE_MATE_IN_MAX_PLY;
+      if (v == -VALUE_INFINITE)
+          v = VALUE_ZERO;
+
+      bool tb = pos.this_thread()->rootInTB && abs(v) < VALUE_MATE_IN_MAX_PLY;
       v = tb ? rootMoves[i].tbScore : v;
 
       if (ss.rdbuf()->in_avail()) // Not at first line
@@ -1906,6 +1909,8 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) {
 bool RootMove::extract_ponder_from_tt(Position& pos) {
 
     StateInfo st;
+    ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize);
+
     bool ttHit;
 
     assert(pv.size() == 1);
@@ -1929,42 +1934,42 @@ bool RootMove::extract_ponder_from_tt(Position& pos) {
 
 void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
 
-    RootInTB = false;
-    UseRule50 = bool(Options["Syzygy50MoveRule"]);
-    ProbeDepth = int(Options["SyzygyProbeDepth"]);
-    Cardinality = int(Options["SyzygyProbeLimit"]);
+    auto& rootInTB = pos.this_thread()->rootInTB;
+    auto& cardinality = pos.this_thread()->Cardinality;
+    auto& probeDepth = pos.this_thread()->ProbeDepth;
+    rootInTB = false;
     bool dtz_available = true;
 
     // Tables with fewer pieces than SyzygyProbeLimit are searched with
     // ProbeDepth == DEPTH_ZERO
-    if (Cardinality > MaxCardinality)
+    if (cardinality > Tablebases::MaxCardinality)
     {
-        Cardinality = MaxCardinality;
-        ProbeDepth = 0;
+        cardinality = Tablebases::MaxCardinality;
+        probeDepth = 0;
     }
 
-    if (Cardinality >= popcount(pos.pieces()) && !pos.can_castle(ANY_CASTLING))
+    if (cardinality >= popcount(pos.pieces()) && !pos.can_castle(ANY_CASTLING))
     {
         // Rank moves using DTZ tables
-        RootInTB = root_probe(pos, rootMoves);
+        rootInTB = root_probe(pos, rootMoves);
 
-        if (!RootInTB)
+        if (!rootInTB)
         {
             // DTZ tables are missing; try to rank moves using WDL tables
             dtz_available = false;
-            RootInTB = root_probe_wdl(pos, rootMoves);
+            rootInTB = root_probe_wdl(pos, rootMoves);
         }
     }
 
-    if (RootInTB)
+    if (rootInTB)
     {
         // Sort moves according to TB rank
-        std::sort(rootMoves.begin(), rootMoves.end(),
+        std::stable_sort(rootMoves.begin(), rootMoves.end(),
                   [](const RootMove &a, const RootMove &b) { return a.tbRank > b.tbRank; } );
 
         // Probe during search only if DTZ is not available and we are winning
         if (dtz_available || rootMoves[0].tbScore <= VALUE_DRAW)
-            Cardinality = 0;
+            cardinality = 0;
     }
     else
     {
@@ -1972,13 +1977,11 @@ void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
         for (auto& m : rootMoves)
             m.tbRank = 0;
     }
+
 }
 
 // --- expose the functions such as fixed depth search used for learning to the outside
-
-#if defined (EVAL_LEARN)
-
-namespace Learner
+namespace Search
 {
   // For learning, prepare a stub that can call search,qsearch() from one thread.
   // From now on, it is better to have a Searcher and prepare a substitution table for each thread like Apery.
@@ -1986,7 +1989,7 @@ namespace Learner
 
   // Initialization for learning.
   // Called from Learner::search(),Learner::qsearch().
-  void init_for_search(Position& pos, Stack* ss)
+  static bool init_for_search(Position& pos, Stack* ss)
   {
 
     // RootNode requires ss->ply == 0.
@@ -1994,39 +1997,6 @@ namespace Learner
 
     std::memset(ss - 7, 0, 10 * sizeof(Stack));
 
-    // About Search::Limits
-    // Be careful because this member variable is global and affects other threads.
-    {
-      auto& limits = Search::Limits;
-
-      // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
-      limits.infinite = true;
-
-      // Since PV is an obstacle when displayed, erase it.
-      limits.silent = true;
-
-      // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
-      limits.nodes = 0;
-
-      // depth is also processed by the one passed as an argument of Learner::search().
-      limits.depth = 0;
-
-      // Set a large value to prevent the draw value from being returned due to the number of moves near the draw.
-      //limits.max_game_ply = 1 << 16;
-
-      // If you do not include the ball entry rule, it will be a draw and it will be difficult to settle.
-      //limits.enteringKingRule = EnteringKingRule::EKR_27_POINT;
-    }
-
-    // Set DrawValue
-    {
-      // Because it is not prepared for each thread
-      // May be overwritten by another thread. There is no help for it.
-      // If that happens, I think it should be 0.
-      //drawValueTable[REPETITION_DRAW][BLACK] = VALUE_ZERO;
-      //drawValueTable[REPETITION_DRAW][WHITE] = VALUE_ZERO;
-    }
-
     // Regarding this_thread.
 
     {
@@ -2035,8 +2005,10 @@ namespace Learner
       th->completedDepth = 0;
       th->selDepth = 0;
       th->rootDepth = 0;
+      th->nmpMinPly = th->bestMoveChanges = th->failedHighCnt = 0;
+      th->ttHitAverage = TtHitAverageWindow * TtHitAverageResolution / 2;
 
-	  // Zero initialization of the number of search nodes
+      // Zero initialization of the number of search nodes
       th->nodes = 0;
 
       // Clear all history types. This initialization takes a little time, and the accuracy of the search is rather low, so the good and bad are not well understood.
@@ -2060,30 +2032,35 @@ namespace Learner
       for (int i = 7; i > 0; i--)
           (ss - i)->continuationHistory = &th->continuationHistory[0][0][NO_PIECE][0]; // Use as a sentinel
 
- // set rootMoves
+      // set rootMoves
       auto& rootMoves = th->rootMoves;
 
       rootMoves.clear();
       for (auto m: MoveList<LEGAL>(pos))
         rootMoves.push_back(Search::RootMove(m));
 
-      assert(!rootMoves.empty());
+      // Check if we're at a terminal node. Otherwise we end up returning
+      // malformed PV later on.
+      if (rootMoves.empty())
+        return false;
 
-      //#if defined(USE_GLOBAL_OPTIONS)
-      // Since the generation of the substitution table for each search thread should be managed,
-      // Increase the generation of the substitution table for this thread because it is a new search.
-            //TT.new_search(th->thread_id());
+      th->UseRule50 = bool(Options["Syzygy50MoveRule"]);
+      th->ProbeDepth = int(Options["SyzygyProbeDepth"]);
+      th->Cardinality = int(Options["SyzygyProbeLimit"]);
 
-            // ª If you call new_search here, it may be a loss because you can't use the previous search result.
-            // Do not do this here, but caller should do TT.new_search(th->thread_id()) for each station ...
+      // Tables with fewer pieces than SyzygyProbeLimit are searched with
+      // ProbeDepth == DEPTH_ZERO
+      if (th->Cardinality > Tablebases::MaxCardinality)
+      {
+          th->Cardinality = Tablebases::MaxCardinality;
+          th->ProbeDepth = 0;
+      }
 
-            // ¨Because we want to avoid reaching the same final diagram, use the substitution table commonly for all threads when generating teachers.
-      //#endif
+      Tablebases::rank_root_moves(pos, rootMoves);
     }
-  }
 
-  // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
-  typedef std::pair<Value, std::vector<Move> > ValueAndPV;
+    return true;
+  }
 
   // Stationary search.
   //
@@ -2099,10 +2076,12 @@ namespace Learner
   // As it has a bad effect, I decided to stop allowing the window range to be specified.
   ValueAndPV qsearch(Position& pos)
   {
-    Stack stack[MAX_PLY + 10], * ss = stack + 7;
-    Move pv[MAX_PLY + 1];
+    Stack stack[MAX_PLY+10], *ss = stack+7;
+    Move  pv[MAX_PLY+1];
+
+    if (!init_for_search(pos, ss))
+      return {};
 
-    init_for_search(pos, ss);
     ss->pv = pv; // For the time being, it must be a dummy and somewhere with a buffer.
 
     if (pos.is_draw(0)) {
@@ -2119,7 +2098,7 @@ namespace Learner
 
     auto bestValue = ::qsearch<PV>(pos, ss, -VALUE_INFINITE, VALUE_INFINITE, 0);
 
-  // Returns the PV obtained.
+    // Returns the PV obtained.
     std::vector<Move> pvs;
     for (Move* p = &ss->pv[0]; is_ok(*p); ++p)
       pvs.push_back(*p);
@@ -2157,7 +2136,8 @@ namespace Learner
     Stack stack[MAX_PLY + 10], * ss = stack + 7;
     Move pv[MAX_PLY + 1];
 
-    init_for_search(pos, ss);
+    if (!init_for_search(pos, ss))
+      return {};
 
 	ss->pv = pv; // For the time being, it must be a dummy and somewhere with a buffer.
 
@@ -2185,7 +2165,7 @@ namespace Learner
     Value bestValue = -VALUE_INFINITE;
 
     while ((rootDepth += 1) <= depth
-	  // exit this loop even if the node limit is exceeded
+      // exit this loop even if the node limit is exceeded
       // The number of search nodes is passed in the argument of this function.
       && !(nodesLimit /* limited nodes */ && th->nodes.load(std::memory_order_relaxed) >= nodesLimit)
       )
@@ -2207,46 +2187,36 @@ namespace Learner
               break;
         }
 
-	    // selDepth output with USI info for each depth and PV line
+        // selDepth output with USI info for each depth and PV line
         selDepth = 0;
 
         // Switch to aspiration search for depth 5 and above.
-        if (rootDepth >= 5 * 1)
+        if (rootDepth >= 4)
         {
-          delta = Value(20);
-
-          Value p = rootMoves[pvIdx].previousScore;
-
-          alpha = std::max(p - delta, -VALUE_INFINITE);
-          beta = std::min(p + delta, VALUE_INFINITE);
+            Value prev = rootMoves[pvIdx].previousScore;
+            delta = Value(17);
+            alpha = std::max(prev - delta,-VALUE_INFINITE);
+            beta  = std::min(prev + delta, VALUE_INFINITE);
         }
 
-        // aspiration search
-        int failedHighCnt = 0;
         while (true)
         {
-          Depth adjustedDepth = std::max(1, rootDepth - failedHighCnt * 1);
+          Depth adjustedDepth = std::max(1, rootDepth);
           bestValue = ::search<PV>(pos, ss, alpha, beta, adjustedDepth, false);
 
           stable_sort(rootMoves.begin() + pvIdx, rootMoves.end());
           //my_stable_sort(pos.this_thread()->thread_id(),&rootMoves[0] + pvIdx, rootMoves.size() - pvIdx);
 
-		  // Expand aspiration window for fail low/high.
+          // Expand aspiration window for fail low/high.
           // However, if it is the value specified by the argument, it will be treated as fail low/high and break.
           if (bestValue <= alpha)
           {
             beta = (alpha + beta) / 2;
             alpha = std::max(bestValue - delta, -VALUE_INFINITE);
-
-            failedHighCnt = 0;
-            //if (mainThread)
-            //    mainThread->stopOnPonderhit = false;
-
           }
           else if (bestValue >= beta)
           {
             beta = std::min(bestValue + delta, VALUE_INFINITE);
-            ++failedHighCnt;
           }
           else
             break;
@@ -2267,7 +2237,6 @@ namespace Learner
     }
 
     // Pass PV_is(ok) to eliminate this PV, there may be NULL_MOVE in the middle.
-    // ¨ PV should not be NULL_MOVE because it is PV
     // MOVE_WIN has never been thrust. (For now)
     for (Move move : rootMoves[0].pv)
     {
@@ -2285,4 +2254,3 @@ namespace Learner
   }
 
 }
-#endif
diff --git a/src/search.h b/src/search.h
index 01d8a4c1..13123323 100644
--- a/src/search.h
+++ b/src/search.h
@@ -24,6 +24,7 @@
 #include "misc.h"
 #include "movepick.h"
 #include "types.h"
+#include "uci.h"
 
 class Position;
 
@@ -32,6 +33,7 @@ namespace Search {
 /// Threshold used for countermoves based pruning
 constexpr int CounterMovePruneThreshold = 0;
 
+extern bool prune_at_shallow_depth;
 
 /// Stack struct keeps track of the information we need to remember from nodes
 /// shallower and deeper in the tree during the search. Each search thread has
@@ -48,6 +50,8 @@ struct Stack {
   int statScore;
   int moveCount;
   bool inCheck;
+  bool ttPv;
+  bool ttHit;
 };
 
 
@@ -69,7 +73,6 @@ struct RootMove {
   Value previousScore = -VALUE_INFINITE;
   int selDepth = 0;
   int tbRank = 0;
-  int bestMoveCount = 0;
   Value tbScore;
   std::vector<Move> pv;
 };
@@ -86,9 +89,7 @@ struct LimitsType {
     time[WHITE] = time[BLACK] = inc[WHITE] = inc[BLACK] = npmsec = movetime = TimePoint(0);
     movestogo = depth = mate = perft = infinite = 0;
     nodes = 0;
-#if defined (EVAL_LEARN)
     silent = false;
-#endif
   }
 
   bool use_time_management() const {
@@ -99,11 +100,9 @@ struct LimitsType {
   TimePoint time[COLOR_NB], inc[COLOR_NB], npmsec, movetime, startTime;
   int movestogo, depth, mate, perft, infinite;
   int64_t nodes;
-#if defined (EVAL_LEARN)
   // Silent mode that does not output to the screen (for continuous self-play in process)
   // Do not output PV at this time.
   bool silent;
-#endif
 };
 
 extern LimitsType Limits;
@@ -111,6 +110,12 @@ extern LimitsType Limits;
 void init();
 void clear();
 
-} // namespace Search
+// A pair of evaluation value and PV. Returned by Search::search() and Search::qsearch().
+using ValueAndPV = std::pair<Value, std::vector<Move>>;
+
+ValueAndPV qsearch(Position& pos);
+ValueAndPV search(Position& pos, int depth_, size_t multiPV = 1, uint64_t nodesLimit = 0);
+
+}
 
 #endif // #ifndef SEARCH_H_INCLUDED
diff --git a/src/syzygy/tbprobe.cpp b/src/syzygy/tbprobe.cpp
index 20215b96..191986da 100644
--- a/src/syzygy/tbprobe.cpp
+++ b/src/syzygy/tbprobe.cpp
@@ -28,12 +28,12 @@
 #include <type_traits>
 #include <mutex>
 
-#include "../bitboard.h"
-#include "../movegen.h"
-#include "../position.h"
-#include "../search.h"
-#include "../types.h"
-#include "../uci.h"
+#include "bitboard.h"
+#include "movegen.h"
+#include "position.h"
+#include "search.h"
+#include "types.h"
+#include "uci.h"
 
 #include "tbprobe.h"
 
@@ -52,7 +52,7 @@
 
 using namespace Tablebases;
 
-int Tablebases::MaxCardinality;
+int Tablebases::MaxCardinality = 0;
 
 namespace {
 
@@ -223,7 +223,9 @@ public:
 
         *mapping = statbuf.st_size;
         *baseAddress = mmap(nullptr, statbuf.st_size, PROT_READ, MAP_SHARED, fd, 0);
+#if defined(MADV_RANDOM)
         madvise(*baseAddress, statbuf.st_size, MADV_RANDOM);
+#endif
         ::close(fd);
 
         if (*baseAddress == MAP_FAILED)
@@ -758,7 +760,7 @@ Ret do_probe_table(const Position& pos, T* entry, WDLScore wdl, ProbeState* resu
     if (entry->hasPawns) {
         idx = LeadPawnIdx[leadPawnsCnt][squares[0]];
 
-        std::sort(squares + 1, squares + leadPawnsCnt, pawns_comp);
+        std::stable_sort(squares + 1, squares + leadPawnsCnt, pawns_comp);
 
         for (int i = 1; i < leadPawnsCnt; ++i)
             idx += Binomial[i][MapPawns[squares[i]]];
@@ -859,7 +861,7 @@ encode_remaining:
 
     while (d->groupLen[++next])
     {
-        std::sort(groupSq, groupSq + d->groupLen[next]);
+        std::stable_sort(groupSq, groupSq + d->groupLen[next]);
         uint64_t n = 0;
 
         // Map down a square if "comes later" than a square in the previous
diff --git a/src/syzygy/tbprobe.h b/src/syzygy/tbprobe.h
index b998989b..efc4b6b7 100644
--- a/src/syzygy/tbprobe.h
+++ b/src/syzygy/tbprobe.h
@@ -21,7 +21,7 @@
 
 #include <ostream>
 
-#include "../search.h"
+#include "search.h"
 
 namespace Tablebases {
 
diff --git a/src/thread.cpp b/src/thread.cpp
index 1aa66a81..f035186b 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -35,6 +35,7 @@ ThreadPool Threads; // Global object
 Thread::Thread(size_t n) : idx(n), stdThread(&Thread::idle_loop, this) {
 
   wait_for_search_finished();
+  wait_for_worker_finished();
 }
 
 
@@ -51,17 +52,6 @@ Thread::~Thread() {
 }
 
 
-/// Thread::bestMoveCount(Move move) return best move counter for the given root move
-
-int Thread::best_move_count(Move move) const {
-
-  auto rm = std::find(rootMoves.begin() + pvIdx,
-                      rootMoves.begin() + pvLast, move);
-
-  return rm != rootMoves.begin() + pvLast ? rm->bestMoveCount : 0;
-}
-
-
 /// Thread::clear() reset histories, usually before a new game
 
 void Thread::clear() {
@@ -91,6 +81,14 @@ void Thread::start_searching() {
   cv.notify_one(); // Wake up the thread in idle_loop()
 }
 
+void Thread::execute_with_worker(std::function<void(Thread&)> t)
+{
+  std::lock_guard<std::mutex> lk(mutex);
+  worker = std::move(t);
+  searching = true;
+  cv.notify_one(); // Wake up the thread in idle_loop()
+}
+
 
 /// Thread::wait_for_search_finished() blocks on the condition variable
 /// until the thread has finished searching.
@@ -102,6 +100,12 @@ void Thread::wait_for_search_finished() {
 }
 
 
+void Thread::wait_for_worker_finished() {
+
+  std::unique_lock<std::mutex> lk(mutex);
+  cv.wait(lk, [&]{ return !searching; });
+}
+
 /// Thread::idle_loop() is where the thread is parked, blocked on the
 /// condition variable, when it has no work to do.
 
@@ -119,15 +123,25 @@ void Thread::idle_loop() {
   {
       std::unique_lock<std::mutex> lk(mutex);
       searching = false;
+      worker = nullptr;
       cv.notify_one(); // Wake up anyone waiting for search finished
       cv.wait(lk, [&]{ return searching; });
 
       if (exit)
           return;
 
+      auto wrk = std::move(worker);
+
       lk.unlock();
 
-      search();
+      if (wrk)
+      {
+        wrk(*this);
+      }
+      else
+      {
+        search();
+      }
   }
 }
 
@@ -172,6 +186,13 @@ void ThreadPool::clear() {
   main()->previousTimeReduction = 1.0;
 }
 
+void ThreadPool::execute_with_workers(const std::function<void(Thread&)>& worker)
+{
+  for(Thread* th : *this)
+  {
+    th->execute_with_worker(worker);
+  }
+}
 
 /// ThreadPool::start_thinking() wakes up main thread waiting in idle_loop() and
 /// returns immediately. Main thread will wake up other threads and start the search.
@@ -192,9 +213,6 @@ void ThreadPool::start_thinking(Position& pos, StateListPtr& states,
           || std::count(limits.searchmoves.begin(), limits.searchmoves.end(), m))
           rootMoves.emplace_back(m);
 
-  if (!rootMoves.empty())
-      Tablebases::rank_root_moves(pos, rootMoves);
-
   // After ownership transfer 'states' becomes empty, so if we stop the search
   // and call 'go' again without setting a new position states.get() == NULL.
   assert(states.get() || setupStates.get());
@@ -214,6 +232,24 @@ void ThreadPool::start_thinking(Position& pos, StateListPtr& states,
       th->rootMoves = rootMoves;
       th->rootPos.set(pos.fen(), pos.is_chess960(), &th->rootState, th);
       th->rootState = setupStates->back();
+      // rank_root_moves() also resets rootInTB, but it is skipped when there
+      // are no legal moves, so clear the flag here unconditionally.
+      th->rootInTB = false;
+      th->UseRule50 = bool(Options["Syzygy50MoveRule"]);
+      th->ProbeDepth = int(Options["SyzygyProbeDepth"]);
+      th->Cardinality = int(Options["SyzygyProbeLimit"]);
+
+      // Tables with fewer pieces than SyzygyProbeLimit are searched with
+      // ProbeDepth == DEPTH_ZERO
+      if (th->Cardinality > Tablebases::MaxCardinality)
+      {
+          th->Cardinality = Tablebases::MaxCardinality;
+          th->ProbeDepth = 0;
+      }
+
+      // Rank th's own copy: results are stored via pos.this_thread().
+      if (!th->rootMoves.empty())
+          Tablebases::rank_root_moves(th->rootPos, th->rootMoves);
   }
 
   main()->start_searching();
@@ -235,16 +271,16 @@ Thread* ThreadPool::get_best_thread() const {
         votes[th->rootMoves[0].pv[0]] +=
             (th->rootMoves[0].score - minScore + 14) * int(th->completedDepth);
 
-          if (abs(bestThread->rootMoves[0].score) >= VALUE_TB_WIN_IN_MAX_PLY)
-          {
-              // Make sure we pick the shortest mate / TB conversion or stave off mate the longest
-              if (th->rootMoves[0].score > bestThread->rootMoves[0].score)
-                  bestThread = th;
-          }
-          else if (   th->rootMoves[0].score >= VALUE_TB_WIN_IN_MAX_PLY
-                   || (   th->rootMoves[0].score > VALUE_TB_LOSS_IN_MAX_PLY
-                       && votes[th->rootMoves[0].pv[0]] > votes[bestThread->rootMoves[0].pv[0]]))
-              bestThread = th;
+        if (abs(bestThread->rootMoves[0].score) >= VALUE_TB_WIN_IN_MAX_PLY)
+        {
+            // Make sure we pick the shortest mate / TB conversion or stave off mate the longest
+            if (th->rootMoves[0].score > bestThread->rootMoves[0].score)
+                bestThread = th;
+        }
+        else if (   th->rootMoves[0].score >= VALUE_TB_WIN_IN_MAX_PLY
+                 || (   th->rootMoves[0].score > VALUE_TB_LOSS_IN_MAX_PLY
+                     && votes[th->rootMoves[0].pv[0]] > votes[bestThread->rootMoves[0].pv[0]]))
+            bestThread = th;
     }
 
     return bestThread;
@@ -269,3 +305,10 @@ void ThreadPool::wait_for_search_finished() const {
         if (th != front())
             th->wait_for_search_finished();
 }
+
+
+void ThreadPool::wait_for_workers_finished() const {
+
+    for (Thread* th : *this)
+        th->wait_for_worker_finished();
+}
diff --git a/src/thread.h b/src/thread.h
index 042bc2e9..83ba2f33 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -24,6 +24,7 @@
 #include <mutex>
 #include <thread>
 #include <vector>
+#include <functional>
 
 #include "material.h"
 #include "movepick.h"
@@ -38,23 +39,41 @@
 /// pointer to an entry its life time is unlimited and we don't have
 /// to care about someone changing the entry under our feet.
 
+namespace Detail {
+
+  template <typename T>
+  struct TypeIdentity {
+    using Type = T;
+  };
+
+}
+
 class Thread {
 
   std::mutex mutex;
   std::condition_variable cv;
   size_t idx;
   bool exit = false, searching = true; // Set before starting std::thread
+  std::function<void(Thread&)> worker;
   NativeThread stdThread;
 
 public:
   explicit Thread(size_t);
   virtual ~Thread();
   virtual void search();
+
+  // The function object to be executed is taken by value to remove
+  // the need for separate lvalue and rvalue overloads.
+  // The worker thread needs to have ownership of the task
+  // to be executed because otherwise there's no way to manage its lifetime.
+  virtual void execute_with_worker(std::function<void(Thread&)> t);
+
   void clear();
   void idle_loop();
   void start_searching();
   void wait_for_search_finished();
-  int best_move_count(Move move) const;
+  void wait_for_worker_finished();
+  size_t thread_idx() const { return idx; }
 
   Pawns::Table pawnsTable;
   Material::Table materialTable;
@@ -74,6 +93,11 @@ public:
   CapturePieceToHistory captureHistory;
   ContinuationHistory continuationHistory[2][2];
   Score contempt;
+  int failedHighCnt;
+  bool rootInTB;
+  int Cardinality;
+  bool UseRule50;
+  Depth ProbeDepth;
 };
 
 
@@ -101,6 +125,61 @@ struct MainThread : public Thread {
 
 struct ThreadPool : public std::vector<Thread*> {
 
+  // Each thread gets its own copy of the `worker` function object.
+  // This means that each worker thread will have exclusive access
+  // to the state of the `worker` function object.
+  void execute_with_workers(const std::function<void(Thread&)>& worker);
+
+  template <typename IndexT, typename FuncT>
+  void for_each_index_with_workers(
+    IndexT begin,
+    typename Detail::TypeIdentity<IndexT>::Type end,
+    FuncT func)
+  {
+    // Shared next-index counter. It must outlive this call because the
+    // workers keep using it after we return, hence static. That is only
+    // safe because this function is neither reentrant nor thread safe:
+    // a second concurrent call would reset the counter under the workers.
+    static std::atomic<IndexT> i_atomic;
+    i_atomic.store(begin);
+
+    execute_with_workers(
+      [end, func](Thread& th) mutable {
+        for(;;) {
+          const auto i = i_atomic.fetch_add(1);
+          if (i >= end)
+            break;
+
+          func(th, i);
+        }
+      });
+  }
+
+  template <typename IndexT, typename FuncT>
+  void for_each_index_chunk_with_workers(
+    IndexT begin,
+    typename Detail::TypeIdentity<IndexT>::Type end,
+    FuncT func)
+  {
+    // Splits [begin, end) into one contiguous chunk per thread and has
+    // each worker call func(th, offset, count) for its own chunk, where
+    // offset is the absolute first index. Returns without waiting; use
+    // wait_for_workers_finished() to join.
+    const IndexT size = end - begin;
+    const IndexT chunk_size = (size + this->size()) / this->size();
+
+    execute_with_workers(
+      [begin, chunk_size, end, func](Thread& th) mutable {
+        const IndexT thread_id = th.thread_idx();
+        const IndexT offset = begin + chunk_size * thread_id;
+        if (offset >= end)
+          return;
+
+        const IndexT count = offset + chunk_size > end ? end - offset : chunk_size;
+        func(th, offset, count);
+      });
+  }
+
   void start_thinking(Position&, StateListPtr&, const Search::LimitsType&, bool = false);
   void clear();
   void set(size_t);
@@ -111,6 +190,7 @@ struct ThreadPool : public std::vector<Thread*> {
   Thread* get_best_thread() const;
   void start_searching();
   void wait_for_search_finished() const;
+  void wait_for_workers_finished() const;
 
   std::atomic_bool stop, increaseDepth;
 
diff --git a/src/timeman.cpp b/src/timeman.cpp
index 6d9c95ef..da08f12d 100644
--- a/src/timeman.cpp
+++ b/src/timeman.cpp
@@ -75,7 +75,7 @@ void TimeManagement::init(Search::LimitsType& limits, Color us, int ply) {
   // game time for the current move, so also cap to 20% of available game time.
   if (limits.movestogo == 0)
   {
-      optScale = std::min(0.008 + std::pow(ply + 3.0, 0.5) / 250.0,
+      optScale = std::min(0.0084 + std::pow(ply + 3.0, 0.5) * 0.0042,
                            0.2 * limits.time[us] / double(timeLeft));
       maxScale = std::min(7.0, 4.0 + ply / 12.0);
   }
diff --git a/src/tt.cpp b/src/tt.cpp
index 60a3a5f1..718587a8 100644
--- a/src/tt.cpp
+++ b/src/tt.cpp
@@ -28,11 +28,16 @@
 
 TranspositionTable TT; // Our global transposition table
 
+bool TranspositionTable::enable_transposition_table = true;
+
 /// TTEntry::save() populates the TTEntry with a new node's data, possibly
 /// overwriting an old position. Update is not atomic and can be racy.
 
 void TTEntry::save(Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev) {
 
+  if (!TranspositionTable::enable_transposition_table) {
+      return;
+  }
   // Preserve any existing move for the same position
   if (m || (uint16_t)k != key16)
       move16 = (uint16_t)m;
@@ -62,11 +67,12 @@ void TranspositionTable::resize(size_t mbSize) {
 
   Threads.main()->wait_for_search_finished();
 
-  aligned_ttmem_free(mem);
+  aligned_large_pages_free(table);
 
   clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
-  table = static_cast<Cluster*>(aligned_ttmem_alloc(clusterCount * sizeof(Cluster), mem));
-  if (!mem)
+
+  table = static_cast<Cluster*>(aligned_large_pages_alloc(clusterCount * sizeof(Cluster)));
+  if (!table)
   {
       std::cerr << "Failed to allocate " << mbSize
                 << "MB for transposition table." << std::endl;
@@ -116,6 +122,11 @@ void TranspositionTable::clear() {
 
 TTEntry* TranspositionTable::probe(const Key key, bool& found) const {
 
+  if (!enable_transposition_table) {
+      found = false;
+      return first_entry(0);
+  }
+
   TTEntry* const tte = first_entry(key);
   const uint16_t key16 = (uint16_t)key;  // Use the low 16 bits as key inside the cluster
 
diff --git a/src/tt.h b/src/tt.h
index fdfd6769..d817f26d 100644
--- a/src/tt.h
+++ b/src/tt.h
@@ -73,7 +73,7 @@ class TranspositionTable {
   static_assert(sizeof(Cluster) == 32, "Unexpected Cluster size");
 
 public:
- ~TranspositionTable() { aligned_ttmem_free(mem); }
+ ~TranspositionTable() { aligned_large_pages_free(table); }
   void new_search() { generation8 += 8; } // Lower 3 bits are used by PV flag and Bound
   TTEntry* probe(const Key key, bool& found) const;
   int hashfull() const;
@@ -84,12 +84,13 @@ public:
     return &table[mul_hi64(key, clusterCount)].entry[0];
   }
 
+  static bool enable_transposition_table;
+
 private:
   friend struct TTEntry;
 
   size_t clusterCount;
   Cluster* table;
-  void* mem;
   uint8_t generation8; // Size must be not bigger than TTEntry::genBound8
 };
 
diff --git a/src/types.h b/src/types.h
index bcc4f77f..4918e8ff 100644
--- a/src/types.h
+++ b/src/types.h
@@ -57,6 +57,12 @@
 /// _WIN32             Building on Windows (any)
 /// _WIN64             Building on Windows 64 bit
 
+#if defined(__GNUC__ ) && (__GNUC__ < 9 || (__GNUC__ == 9 && __GNUC_MINOR__ <= 2)) && defined(_WIN32) && !defined(__clang__)
+#define ALIGNAS_ON_STACK_VARIABLES_BROKEN
+#endif
+// Asserts that ptr is aligned to a multiple of alignment bytes.
+#define ASSERT_ALIGNED(ptr, alignment) assert(reinterpret_cast<uintptr_t>(ptr) % (alignment) == 0)
+
 #if defined(_WIN64) && defined(_MSC_VER) // No Makefile used
 #  include <intrin.h> // Microsoft header for _BitScanForward64()
 #  define IS_64BIT
@@ -198,8 +204,8 @@ enum PieceType {
 
 enum Piece {
   NO_PIECE,
-  W_PAWN = 1, W_KNIGHT, W_BISHOP, W_ROOK, W_QUEEN, W_KING,
-  B_PAWN = 9, B_KNIGHT, B_BISHOP, B_ROOK, B_QUEEN, B_KING,
+  W_PAWN = PAWN,     W_KNIGHT, W_BISHOP, W_ROOK, W_QUEEN, W_KING,
+  B_PAWN = PAWN + 8, B_KNIGHT, B_BISHOP, B_ROOK, B_QUEEN, B_KING,
   PIECE_NB = 16
 };
 
diff --git a/src/uci.cpp b/src/uci.cpp
index d6745d19..8e64da6b 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -22,19 +22,23 @@
 #include <sstream>
 #include <string>
 
+#include "extra/stockfish_blas.h"
+#include "nnue/evaluate_nnue.h"
 #include "evaluate.h"
 #include "movegen.h"
+#include "nnue/nnue_test_command.h"
 #include "position.h"
 #include "search.h"
+#include "syzygy/tbprobe.h"
 #include "thread.h"
 #include "timeman.h"
 #include "tt.h"
 #include "uci.h"
-#include "syzygy/tbprobe.h"
 
-#if defined(EVAL_NNUE) && defined(ENABLE_TEST_CMD)
-#include "nnue/nnue_test_command.h"
-#endif
+#include "learn/gensfen.h"
+#include "learn/learn.h"
+#include "learn/convert.h"
+#include "learn/transform.h"
 
 using namespace std;
 
@@ -43,42 +47,16 @@ extern vector<string> setup_bench(const Position&, istream&);
 // FEN string of the initial position, normal chess
 const char* StartFEN = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1";
 
-// Command to automatically generate a game record
-#if defined (EVAL_LEARN)
-namespace Learner
-{
-  // Automatic generation of teacher position
-  void gen_sfen(Position& pos, istringstream& is);
-
-  // Learning from the generated game record
-  void learn(Position& pos, istringstream& is);
-
-#if defined(GENSFEN2019)
-  // Automatic generation command of teacher phase under development
-  void gen_sfen2019(Position& pos, istringstream& is);
-#endif
-
-  // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
-  typedef std::pair<Value, std::vector<Move> > ValueAndPV;
-
-  ValueAndPV qsearch(Position& pos);
-  ValueAndPV search(Position& pos, int depth_, size_t multiPV = 1, uint64_t nodesLimit = 0);
-
-}
-#endif
-
-#if defined(EVAL_NNUE) && defined(ENABLE_TEST_CMD)
 void test_cmd(Position& pos, istringstream& is)
 {
     // Initialize as it may be searched.
-    Eval::init_NNUE();
+    Eval::NNUE::init();
 
     std::string param;
     is >> param;
 
-    if (param == "nnue") Eval::NNUE::TestCommand(pos, is);
+    if (param == "nnue") Eval::NNUE::test_command(pos, is);
 }
-#endif
 
 namespace {
 
@@ -125,7 +103,7 @@ namespace {
     Position p;
     p.set(pos.fen(), Options["UCI_Chess960"], &states->back(), Threads.main());
 
-    Eval::verify_NNUE();
+    Eval::NNUE::verify_eval_file_loaded();
 
     sync_cout << "\n" << Eval::trace(p) << sync_endl;
   }
@@ -134,7 +112,7 @@ namespace {
   // setoption() is called when engine receives the "setoption" UCI command. The
   // function updates the UCI option ("name") to the given value ("value").
 
-  void setoption(istringstream& is) {
+  void setoption_from_stream(istringstream& is) {
 
     string token, name, value;
 
@@ -148,10 +126,7 @@ namespace {
     while (is >> token)
         value += (value.empty() ? "" : " ") + token;
 
-    if (Options.count(name))
-        Options[name] = value;
-    else
-        sync_cout << "No such option: " << name << sync_endl;
+    UCI::setoption(name, value);
   }
 
 
@@ -210,7 +185,7 @@ namespace {
 
         if (token == "go" || token == "eval")
         {
-            cerr << "\nPosition: " << cnt++ << '/' << num << endl;
+            cerr << "\nPosition: " << cnt++ << '/' << num << " (" << pos.fen() << ")" << endl;
             if (token == "go")
             {
                go(pos, is, states);
@@ -220,7 +195,7 @@ namespace {
             else
                trace_eval(pos);
         }
-        else if (token == "setoption")  setoption(is);
+        else if (token == "setoption")  setoption_from_stream(is);
         else if (token == "position")   position(pos, is, states);
         else if (token == "ucinewgame") { Search::clear(); elapsed = now(); } // Search::clear() may take some while
     }
@@ -235,15 +210,23 @@ namespace {
          << "\nNodes/second    : " << 1000 * nodes / elapsed << endl;
   }
 
-  // The win rate model returns the probability (per mille) of winning given an eval
-  // and a game-ply. The model fits rather accurately the LTC fishtest statistics.
-  int win_rate_model(Value v, int ply) {
-     // Return win rate in per mille (rounded to nearest)
-     return int(0.5 + UCI::win_rate_model_double(v, ply));
-  }
-
 } // namespace
 
+// Assigns `value` to the UCI option `name`; an unknown name is only reported.
+void UCI::setoption(const std::string& name, const std::string& value) {
+    if (!Options.count(name))
+        sync_cout << "No such option: " << name << sync_endl;
+    else
+        Options[name] = value;
+}
+
+// Integer front end of win_rate_model_double(): the win probability in per
+// mille for eval v at game-ply ply, rounded to the nearest integer.
+int UCI::win_rate_model(Value v, int ply) {
+   const double perMille = win_rate_model_double(v, ply);
+   return int(0.5 + perMille);
+}
+
 // The win rate model returns the probability (per mille) of winning given an eval
 // and a game-ply. The model fits rather accurately the LTC fishtest statistics.
 double UCI::win_rate_model_double(double v, int ply) {
@@ -270,11 +253,10 @@ double UCI::win_rate_model_double(double v, int ply) {
 // Call qsearch(),search() directly for testing
 // --------------------
 
-#if defined(EVAL_LEARN)
 void qsearch_cmd(Position& pos)
 {
   cout << "qsearch : ";
-  auto pv = Learner::qsearch(pos);
+  auto pv = Search::qsearch(pos);
   cout << "Value = " << pv.first << " , " << UCI::value(pv.first) << " , PV = ";
   for (auto m : pv.second)
     cout << UCI::move(m, false) << " ";
@@ -295,15 +277,13 @@ void search_cmd(Position& pos, istringstream& is)
   }
 
   cout << "search depth = " << depth << " , multi_pv = " << multi_pv << " : ";
-  auto pv = Learner::search(pos, depth, multi_pv);
+  auto pv = Search::search(pos, depth, multi_pv);
   cout << "Value = " << pv.first << " , " << UCI::value(pv.first) << " , PV = ";
   for (auto m : pv.second)
     cout << UCI::move(m, false) << " ";
   cout << endl;
 }
 
-#endif
-
 /// UCI::loop() waits for a command from stdin, parses it and calls the appropriate
 /// function. Also intercepts EOF from stdin to ensure gracefully exiting if the
 /// GUI dies unexpectedly. When called with some command line arguments, e.g. to
@@ -346,7 +326,7 @@ void UCI::loop(int argc, char* argv[]) {
                     << "\n"       << Options
                     << "\nuciok"  << sync_endl;
 
-      else if (token == "setoption")  setoption(is);
+      else if (token == "setoption")  setoption_from_stream(is);
       else if (token == "go")         go(pos, is, states);
       else if (token == "position")   position(pos, is, states);
       else if (token == "ucinewgame") Search::clear();
@@ -359,24 +339,35 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "d")        sync_cout << pos << sync_endl;
       else if (token == "eval")     trace_eval(pos);
       else if (token == "compiler") sync_cout << compiler_info() << sync_endl;
-#if defined (EVAL_LEARN)
-      else if (token == "gensfen") Learner::gen_sfen(pos, is);
-      else if (token == "learn") Learner::learn(pos, is);
 
-#if defined (GENSFEN2019)
-	  // Command to generate teacher phase under development
-      else if (token == "gensfen2019") Learner::gen_sfen2019(pos, is);
-#endif
+      else if (token == "gensfen") Learner::gensfen(is);
+      else if (token == "learn") Learner::learn(is);
+      else if (token == "convert") Learner::convert(is);
+      else if (token == "convert_bin") Learner::convert_bin(is);
+      else if (token == "convert_plain") Learner::convert_plain(is);
+      else if (token == "convert_bin_from_pgn_extract") Learner::convert_bin_from_pgn_extract(is);
+      else if (token == "transform") Learner::transform(is);
+
       // Command to call qsearch(),search() directly for testing
       else if (token == "qsearch") qsearch_cmd(pos);
       else if (token == "search") search_cmd(pos, is);
+      else if (token == "tasktest")
+      {
+        // Each worker prints its index; sync_cout serializes the writes so
+        // lines coming from different threads cannot interleave.
+        Threads.execute_with_workers([](auto& th) { sync_cout << th.thread_idx() << sync_endl; });
+      }
+      else if (token == "blastest")
+      {
+        Blas::test(Threads);
+      }
+      else if (token == "blasbench")
+      {
+        Blas::bench(Threads);
+      }
 
-#endif
-
-#if defined(EVAL_NNUE) && defined(ENABLE_TEST_CMD)
       // test command
       else if (token == "test") test_cmd(pos, is);
-#endif
       else
           sync_cout << "Unknown command: " << cmd << sync_endl;
 
diff --git a/src/uci.h b/src/uci.h
index c0e8372f..192963cb 100644
--- a/src/uci.h
+++ b/src/uci.h
@@ -72,8 +72,10 @@ std::string square(Square s);
 std::string move(Move m, bool chess960);
 std::string pv(const Position& pos, Depth depth, Value alpha, Value beta);
 std::string wdl(Value v, int ply);
+int win_rate_model(Value v, int ply);
 double win_rate_model_double(double v, int ply);
 Move to_move(const Position& pos, std::string& str);
+void setoption(const std::string& name, const std::string& value);
 
 } // namespace UCI
 
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 519160cf..bdb1c6b1 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -21,6 +21,8 @@
 #include <ostream>
 #include <sstream>
 
+#include "nnue/evaluate_nnue.h"
+#include "evaluate.h"
 #include "misc.h"
 #include "search.h"
 #include "thread.h"
@@ -40,8 +42,14 @@ void on_hash_size(const Option& o) { TT.resize(size_t(o)); }
 void on_logger(const Option& o) { start_logger(o); }
 void on_threads(const Option& o) { Threads.set(size_t(o)); }
 void on_tb_path(const Option& o) { Tablebases::init(o); }
-void on_use_NNUE(const Option& ) { Eval::init_NNUE(); }
-void on_eval_file(const Option& ) { Eval::init_NNUE(); }
+void on_use_NNUE(const Option& ) { Eval::NNUE::init(); }
+void on_eval_file(const Option& ) { Eval::NNUE::init(); }
+// Callback registered for the "PruneAtShallowDepth" option: copies the option
+// value into Search::prune_at_shallow_depth.
+void on_prune_at_shallow_depth(const Option& o) { Search::prune_at_shallow_depth = o; }
+// Callback registered for the "EnableTranspositionTable" option: copies the
+// option value into the static TranspositionTable::enable_transposition_table.
+void on_enable_transposition_table(const Option& o) { TranspositionTable::enable_transposition_table = o; }
 
 /// Our case insensitive less() function as required by UCI protocol
 bool CaseInsensitiveLess::operator() (const string& s1, const string& s2) const {
@@ -69,7 +77,6 @@ void init(OptionsMap& o) {
   o["Move Overhead"]         << Option(10, 0, 5000);
   o["Slow Mover"]            << Option(100, 10, 1000);
   o["nodestime"]             << Option(0, 0, 10000);
-  o["Training"]              << Option(false);
   o["UCI_Chess960"]          << Option(false);
   o["UCI_AnalyseMode"]       << Option(false);
   o["UCI_LimitStrength"]     << Option(false);
@@ -79,26 +86,22 @@ void init(OptionsMap& o) {
   o["SyzygyProbeDepth"]      << Option(1, 1, 100);
   o["Syzygy50MoveRule"]      << Option(true);
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
-  o["Use NNUE"]              << Option(true, on_use_NNUE);
-  // The default must follow the format nn-[SHA256 first 12 digits].nnue
-  // for the build process (profile-build and fishtest) to work.
-  o["EvalFile"]              << Option("nn-82215d0fd0df.nnue", on_eval_file);
-#ifdef EVAL_NNUE
+  o["Use NNUE"]              << Option("true var true var false var pure", "true", on_use_NNUE);
+  o["EvalFile"]              << Option(EvalFileDefaultName, on_eval_file);
   // When the evaluation function is loaded at the ucinewgame timing, it is necessary to convert the new evaluation function.
   // I want to hit the test eval convert command, but there is no new evaluation function
   // It ends abnormally before executing this command.
   // Therefore, with this hidden option, you can suppress the loading of the evaluation function when ucinewgame,
   // Hit the test eval convert command.
   o["SkipLoadingEval"]       << Option(false);
-  // how many moves to use a fixed move
-  // o["BookMoves"] << Option(16, 0, 10000);
-#endif
-#if defined(EVAL_LEARN)
   // When learning the evaluation function, you can change the folder to save the evaluation function.
   // Evalsave by default. This folder shall be prepared in advance.
-  // Automatically dig a folder under this folder like "0/", "1/", ... and save the evaluation function file there.
+  // Automatically create a folder under this folder like "0/", "1/", ... and save the evaluation function file there.
   o["EvalSaveDir"] << Option("evalsave");
-#endif
+  // Prune at shallow depth on PV nodes. False is recommended when using fixed depth search.
+  o["PruneAtShallowDepth"] << Option(true, on_prune_at_shallow_depth);
+  // Enable transposition table.
+  o["EnableTranspositionTable"] << Option(true, on_enable_transposition_table);
 }
 
 
@@ -152,7 +155,7 @@ Option::operator double() const {
 }
 
 Option::operator std::string() const {
-  assert(type == "string");
+  assert(type == "check" || type == "spin" || type == "combo" || type == "button" || type == "string");
   return currentValue;
 }
 
diff --git a/tests/instrumented.sh b/tests/instrumented.sh
index 03ded74a..dffc257a 100755
--- a/tests/instrumented.sh
+++ b/tests/instrumented.sh
@@ -16,13 +16,19 @@ case $1 in
     exeprefix='valgrind --error-exitcode=42'
     postfix='1>/dev/null'
     threads="1"
+    bench_depth=5
+    go_depth=10
+    tt_size=16
   ;;
   --valgrind-thread)
     echo "valgrind-thread testing started"
     prefix=''
-    exeprefix='valgrind --error-exitcode=42'
-    postfix='1>/dev/null'
+    exeprefix='valgrind --fair-sched=try --error-exitcode=42'
+    postfix=''
     threads="2"
+    bench_depth=5
+    go_depth=10
+    tt_size=16
   ;;
   --sanitizer-undefined)
     echo "sanitizer-undefined testing started"
@@ -30,6 +36,9 @@ case $1 in
     exeprefix=''
     postfix='2>&1 | grep -A50 "runtime error:"'
     threads="1"
+    bench_depth=8
+    go_depth=20
+    tt_size=128
   ;;
   --sanitizer-thread)
     echo "sanitizer-thread testing started"
@@ -37,6 +46,9 @@ case $1 in
     exeprefix=''
     postfix='2>&1 | grep -A50 "WARNING: ThreadSanitizer:"'
     threads="2"
+    bench_depth=8
+    go_depth=20
+    tt_size=128
 
 cat << EOF > tsan.supp
 race:TTEntry::move
@@ -70,7 +82,7 @@ for args in "eval" \
             "go depth 10" \
             "go movetime 1000" \
             "go wtime 8000 btime 8000 winc 500 binc 500" \
-            "bench 128 $threads 8 default depth"
+            "bench $tt_size $threads $bench_depth default depth"
 do
 
    echo "$prefix $exeprefix ./stockfish $args $postfix"
@@ -98,7 +110,7 @@ cat << EOF > game.exp
  expect "bestmove"
 
  send "position fen 5rk1/1K4p1/8/8/3B4/8/8/8 b - - 0 1\n"
- send "go depth 20\n"
+ send "go depth $go_depth\n"
  expect "bestmove"
 
  send "quit\n"
@@ -121,7 +133,7 @@ cat << EOF > syzygy.exp
  send "uci\n"
  send "setoption name SyzygyPath value ../tests/syzygy/\n"
  expect "info string Found 35 tablebases" {} timeout {exit 1}
- send "bench 128 1 8 default depth\n"
+ send "bench $tt_size 1 $bench_depth default depth\n"
  send "quit\n"
  expect eof
 
@@ -130,7 +142,7 @@ cat << EOF > syzygy.exp
  exit \$value
 EOF
 
-for exp in game.exp syzygy.exp
+for exp in game.exp
 do
 
   echo "$prefix expect $exp $postfix"
diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
new file mode 100755
index 00000000..9109e78b
--- /dev/null
+++ b/tests/instrumented_learn.sh
@@ -0,0 +1,155 @@
+#!/bin/bash
+# check for errors under valgrind or sanitizers.
+
+error()
+{
+  echo "instrumented testing failed on line $1"
+  exit 1
+}
+trap 'error ${LINENO}' ERR
+
+# define suitable post and prefixes for testing options
+case $1 in
+  --valgrind)
+    echo "valgrind testing started"
+    prefix=''
+    exeprefix='valgrind --error-exitcode=42'
+    postfix='1>/dev/null'
+    threads="1"
+  ;;
+  --valgrind-thread)
+    echo "valgrind-thread testing started"
+    prefix=''
+    exeprefix='valgrind --fair-sched=try --error-exitcode=42'
+    postfix='1>/dev/null'
+    threads="2"
+  ;;
+  --sanitizer-undefined)
+    echo "sanitizer-undefined testing started"
+    prefix='!'
+    exeprefix=''
+    postfix='2>&1 | grep -A50 "runtime error:"'
+    threads="1"
+  ;;
+  --sanitizer-thread)
+    echo "sanitizer-thread testing started"
+    prefix='!'
+    exeprefix=''
+    postfix='2>&1 | grep -A50 "WARNING: ThreadSanitizer:"'
+    threads="2"
+
+cat << EOF > tsan.supp
+race:TTEntry::move
+race:TTEntry::depth
+race:TTEntry::bound
+race:TTEntry::save
+race:TTEntry::value
+race:TTEntry::eval
+race:TTEntry::is_pv
+
+race:TranspositionTable::probe
+race:TranspositionTable::hashfull
+
+EOF
+
+    export TSAN_OPTIONS="suppressions=./tsan.supp"
+
+  ;;
+  *)
+    echo "unknown testing started"
+    prefix=''
+    exeprefix=''
+    postfix=''
+    threads="1"
+  ;;
+esac
+
+mkdir -p training_data
+mkdir -p validation_data
+
+# gensfen testing 01
+cat << EOF > gensfen01.exp
+ set timeout 240
+ spawn $exeprefix ./stockfish
+
+ send "uci\n"
+ expect "uciok"
+
+ send "setoption name Threads value $threads\n"
+ send "setoption name Use NNUE value false\n"
+ send "isready\n"
+ send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.bin sfen_format bin\n"
+ expect "INFO: Gensfen finished."
+ send "convert_plain targetfile training_data/training_data.bin output_file_name training_data.txt\n"
+ expect "all done"
+ send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.binpack sfen_format binpack\n"
+ expect "INFO: Gensfen finished."
+
+ send "quit\n"
+ expect eof
+
+ # return error code of the spawned program, useful for valgrind
+ lassign [wait] pid spawnid os_error_flag value
+ exit \$value
+EOF
+
+# gensfen testing 02
+cat << EOF > gensfen02.exp
+ set timeout 240
+ spawn $exeprefix ./stockfish
+
+ send "uci\n"
+ expect "uciok"
+
+ send "setoption name Threads value $threads\n"
+ send "setoption name Use NNUE value true\n"
+ send "isready\n"
+ send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/validation_data.bin sfen_format bin\n"
+ expect "INFO: Gensfen finished."
+ send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/validation_data.binpack sfen_format binpack\n"
+ expect "INFO: Gensfen finished."
+
+ send "quit\n"
+ expect eof
+
+ # return error code of the spawned program, useful for valgrind
+ lassign [wait] pid spawnid os_error_flag value
+ exit \$value
+EOF
+
+# simple learning
+cat << EOF > learn01.exp
+ set timeout 240
+ spawn $exeprefix ./stockfish
+
+ send "uci\n"
+ send "setoption name SkipLoadingEval value true\n"
+ send "setoption name Use NNUE value true\n"
+ send "setoption name Threads value $threads\n"
+ send "isready\n"
+ send "learn targetdir training_data epochs 1 sfen_read_size 100 thread_buffer_size 10 batchsize 100 use_draw_in_training 1 use_draw_in_validation 1 lr 1 eval_limit 32000 nn_batch_size 30 newbob_decay 0.5 eval_save_interval 30 loss_output_interval 10 validation_set_file_name validation_data/validation_data.bin\n"
+
+ expect "INFO (save_eval): Finished saving evaluation file in evalsave/final"
+
+ send "quit\n"
+ expect eof
+
+ # return error code of the spawned program, useful for valgrind
+ lassign [wait] pid spawnid os_error_flag value
+ exit \$value
+
+EOF
+
+for exp in gensfen01.exp gensfen02.exp learn01.exp
+do
+
+  echo "$prefix expect $exp $postfix"
+  eval "$prefix expect $exp $postfix"
+
+  rm $exp
+
+done
+
+rm -f tsan.supp
+
+echo "instrumented learn testing OK"