Merge branch 'master' into stockfish-nnue-2020-08-30-macos

2026-05-20 05:07:46 +00:00 · 2020-12-08 22:49:11 +08:00
parent bb26ce5aa1 3a1bd1185f
commit 055f907315
121 changed files with 23203 additions and 9127 deletions
@@ -1,5 +1,5 @@
 language: cpp
-dist: bionic
+dist: focal

 matrix:
  include:
@@ -7,33 +7,33 @@ matrix:
      compiler: gcc
      addons:
        apt:
-          packages: ['g++-8', 'g++-8-multilib', 'g++-multilib', 'valgrind', 'expect', 'curl']
-      env:
-        - COMPILER=g++-8
-        - COMP=gcc
-
-    - os: linux
-      compiler: clang
-      addons:
-        apt:
-          packages: ['clang-10', 'llvm-10-dev', 'g++-multilib', 'valgrind', 'expect', 'curl']
-      env:
-        - COMPILER=clang++-10
-        - COMP=clang
-
-    - os: osx
-      osx_image: xcode12
-      compiler: gcc
+          packages: ['g++-multilib', 'valgrind', 'expect', 'curl', 'libopenblas-dev']
      env:
        - COMPILER=g++
        - COMP=gcc

-    - os: osx
-      osx_image: xcode12
-      compiler: clang
-      env:
-        - COMPILER=clang++
-        - COMP=clang
+#    - os: linux
+#      compiler: clang
+#      addons:
+#        apt:
+#          packages: ['clang-10', 'llvm-10-dev', 'g++-multilib', 'valgrind', 'expect', 'curl', 'openblas']
+#      env:
+#        - COMPILER=clang++-10
+#        - COMP=clang
+#
+#    - os: osx
+#      osx_image: xcode12
+#      compiler: gcc
+#      env:
+#        - COMPILER=g++
+#        - COMP=gcc
+#
+#    - os: osx
+#      osx_image: xcode12
+#      compiler: clang
+#      env:
+#        - COMPILER=clang++
+#        - COMP=clang

 branches:
  only:
@@ -65,16 +65,13 @@ script:
  - make clean && make -j2 ARCH=x86-64-ssse3 build && ../tests/signature.sh $benchref
  - make clean && make -j2 ARCH=x86-64-sse3-popcnt build && ../tests/signature.sh $benchref
  - make clean && make -j2 ARCH=x86-64 build && ../tests/signature.sh $benchref
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-64 build && ../tests/signature.sh $benchref; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 optimize=no debug=yes build && ../tests/signature.sh $benchref; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-sse41-popcnt build && ../tests/signature.sh $benchref; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-sse2 build && ../tests/signature.sh $benchref; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 build && ../tests/signature.sh $benchref; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-32 build && ../tests/signature.sh $benchref; fi
-  # workaround: exclude a custom version of llvm+clang, which doesn't find llvm-profdata on ubuntu
-  - if [[ "$TRAVIS_OS_NAME" != "linux" || "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi
+  # TODO avoid _mm_malloc
+  # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-64 build && ../tests/signature.sh $benchref; fi
+  - make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref

  # compile only for some more advanced architectures (might not run in travis)
+  - make clean && make -j2 ARCH=x86-64-avx2 blas=yes build
+
  - make clean && make -j2 ARCH=x86-64-avx2 build
  - make clean && make -j2 ARCH=x86-64-bmi2 build
  - make clean && make -j2 ARCH=x86-64-avx512 build
@@ -91,11 +88,16 @@ script:
  # Valgrind
  #
  - export CXXFLAGS="-O1 -fno-inline"
-  - if [ -x "$(command -v valgrind )" ]; then make clean && make -j2 ARCH=x86-64-modern debug=yes optimize=no build > /dev/null && ../tests/instrumented.sh --valgrind; fi
-  - if [ -x "$(command -v valgrind )" ]; then ../tests/instrumented.sh --valgrind-thread; fi
+  - make clean && make -j2 ARCH=x86-64-modern debug=yes optimize=no build > /dev/null && ../tests/instrumented.sh --valgrind
+  - ../tests/instrumented.sh --valgrind-thread

  #
  # Sanitizer
  #
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-undefined; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=thread    optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-thread; fi
+  - make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-undefined
+  - make clean && make -j2 ARCH=x86-64-modern sanitize=thread    optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-thread
+
+  # NNUE testing
+  - export CXXFLAGS="-O1 -fno-inline"
+  - make clean && make -j2 ARCH=x86-64-modern debug=no optimize=no build > /dev/null && ../tests/instrumented_learn.sh --valgrind
+  - make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined
@@ -19,6 +19,7 @@ Alain Savard (Rocky640)
 Alayan Feh (Alayan-stk-2)
 Alexander Kure
 Alexander Pagel (Lolligerhans)
+Alfredo Menezes (lonfom169)
 Ali AlZhrani (Cooffe)
 Andrew Grant (AndyGrant)
 Andrey Neporada (nepal)
@@ -36,12 +37,14 @@ Bryan Cross (crossbr)
 candirufish
 Chess13234
 Chris Cain (ceebo)
+Dale Weiler (graphitemaster)
 Dan Schmidt (dfannius)
 Daniel Axtens (daxtens)
 Daniel Dugovic (ddugovic)
-Dariusz Orzechowski
+Dariusz Orzechowski (dorzechowski)
 David Zar
 Daylen Yang (daylen)
+Deshawn Mohan-Smith (GoldenRare)
 DiscanX
 Dominik Schlösser (domschl)
 double-beep
@@ -83,7 +86,7 @@ Jekaa
 Jerry Donald Watson (jerrydonaldwatson)
 jjoshua2
 Jonathan Calovski (Mysseno)
-Jonathan Dumale (SFisGOD)
+Jonathan Buladas Dumale (SFisGOD)
 Joost VandeVondele (vondele)
 Jörg Oster (joergoster)
 Joseph Ellis (jhellis3)
@@ -109,6 +112,7 @@ Mark Tenzer (31m059)
 marotear
 Matthew Lai (matthewlai)
 Matthew Sullivan (Matt14916)
+Maxim Molchanov (Maxim)
 Michael An (man)
 Michael Byrne (MichaelB7)
 Michael Chaly (Vizvezdenec)
@@ -5,9 +5,37 @@
 <h1 align="center">Stockfish NNUE</h1>

 ## Overview
+
 Stockfish NNUE is a port of a shogi neural network named NNUE (efficiently updateable neural network backwards) to Stockfish 11. To learn more about the Stockfish chess engine, look [here](stockfish.md) for an overview and [here](https://github.com/official-stockfish/Stockfish) for the official repository.

-## Compilation Instructions for Mac
+
+## Building
+
+To compile:
+```
+make -jN ARCH=... build
+```
+
+To compile with Profile Guided Optimizations (this requires that the computer used for compilation supports the selected `ARCH`):
+```
+make -jN ARCH=... profile-build
+```
+
+`N` is the number of threads to use for compilation.
+
+`ARCH` is one of:
+`x86-64-vnni512`, `x86-64-vnni256`, `x86-64-avx512`, `x86-64-bmi2`, `x86-64-avx2`,
+`x86-64-sse41-popcnt`, `x86-64-modern`, `x86-64-ssse3`, `x86-64-sse3-popcnt`,
+`x86-64`, `x86-32-sse41-popcnt`, `x86-32-sse2`, `x86-32`, `ppc-64`, `ppc-32`,
+`armv7`, `armv7-neon`, `armv8`, `apple-silicon`, `general-64`, `general-32`.
+
+`ARCH` needs to be chosen based on the instruction set of the CPU that will run stockfish. `x86-64-modern` will produce a binary that works on most common processors, but other options may increase performance for specific hardware.
+
+Additional options:
+
+- `blas=[yes/no]` - whether to use an external BLAS library. Default is `no`. Using an external BLAS library may significantly improve learning performance; by default it expects openBLAS to be installed.
+
+### Building Instructions for Mac

 1. Ensure that you have OpenBlas Installed
 ```
@@ -24,62 +52,91 @@ cd src
 make profile-learn ARCH=x86-64 COMP=gcc
 ```

-
 ## Training Guide
+
 ### Generating Training Data
-To generate training data from the classic eval, use the gensfen command with the setting "Use NNUE" set to "false". The given example is generation in its simplest form. There are more commands. 
+
+To generate training data from the classic eval, use the gensfen command with the setting "Use NNUE" set to "false". The given example is generation in its simplest form. There are more commands.
+
 ```
 uci
+setoption name PruneAtShallowDepth value false
 setoption name Use NNUE value false
 setoption name Threads value x
 setoption name Hash value y
 setoption name SyzygyPath value path
 isready
-gensfen depth a loop b use_draw_in_training_data_generation 1 eval_limit 32000 use_raw_nnue_eval 0
+gensfen depth a loop b use_draw_in_training_data_generation 1 eval_limit 32000
 ```
-Specify how many threads and how much memory you would like to use with the x and y values. The option SyzygyPath is not necessary, but if you would like to use it, you must first have Syzygy endgame tablebases on your computer, which you can find [here](http://oics.olympuschess.com/tracker/index.php). You will need to have a torrent client to download these tablebases, as that is probably the fastest way to obtain them. The path is the path to the folder containing those tablebases. It does not have to be surrounded in quotes.

-use_raw_nnue_eval controls if the training data generator or trainer uses raw NNUE eval values.  Don't forget to set use_raw_nnue_eval 0 when initial training data are generated.  Otherwise, the gensfen command will crash.
+- `depth` is the searched depth per move, or how far the engine looks forward. This value is an integer.
+- `loop` is the amount of positions generated. This value is also an integer.
+
+Specify how many threads and how much memory you would like to use with the `x` and `y` values. The option SyzygyPath is not necessary, but if you would like to use it, you must first have Syzygy endgame tablebases on your computer, which you can find [here](http://oics.olympuschess.com/tracker/index.php). You will need to have a torrent client to download these tablebases, as that is probably the fastest way to obtain them. The `path` is the path to the folder containing those tablebases. It does not have to be surrounded in quotes.
+
+This will create a file named "generated_kifu.binpack" in the same folder as the binary containing the generated training data. Once generation is done, you can rename the file to something like "1billiondepth12.binpack" to remember the depth and quantity of the positions and move it to a folder named "trainingdata" in the same directory as the binaries.
+
+You will also need validation data that is used for loss calculation and accuracy computation. Validation data is generated in the same way as training data, but generally at most 1 million positions should be used as there's no need for more and it would just slow the learning process down. It may also be better to slightly increase the depth for validation data. After generation you can rename the validation data file to "val.binpack" and drop it in a folder named "validationdata" in the same directory to make it easier.
+
+More information about gensfen and available options can be found in the [docs](docs/gensfen.md)
+
+### Training a network
+
+#### Training a Completely New Network
+
+Whether a new network is created or not is controlled by the UCI option `SkipLoadingEval`. If set to true then a new network will be created, which allows learning from scratch. If left at its default (false) then a network will be loaded and trained further. The second scenario is described in the reinforcement learning paragraph.
+
+A simple command chain to start with training could look like this:

-This will save a file named "generated_kifu.bin" in the same folder as the binary. Once generation is done, rename the file to something like "1billiondepth12.bin" to remember the depth and quantity of the positions and move it to a folder named "trainingdata" in the same directory as the binaries.
-#### Generation Parameters
- Depth is the searched depth per move, or how far the engine looks forward. This value is an integer.
- Loop is the amount of positions generated. This value is also an integer
-### Generating Validation Data
-The process is the same as the generation of training data, except for the fact that you need to set loop to 1 million, because you don't need a lot of validation data. The depth should be the same as before or slightly higher than the depth of the training data. After generation rename the validation data file to val.bin and drop it in a folder named "validationdata" in the same directory to make it easier. 
-### Training a Completely New Network
-Use the "learn" binary. Create an empty folder named "evalsave" in the same directory as the binaries.
 ```
 uci
+setoption name EnableTranspositionTable value false
+setoption name PruneAtShallowDepth value false
 setoption name SkipLoadingEval value true
-setoption name Use NNUE value true
+setoption name Use NNUE value pure
 setoption name Threads value x
 isready
-learn targetdir trainingdata loop 100 batchsize 1000000 use_draw_in_training 1 use_draw_in_validation 1 eta 1 lambda 1 eval_limit 32000 nn_batch_size 1000 newbob_decay 0.5 eval_save_interval 250000000 loss_output_interval 1000000 mirror_percentage 50 validation_set_file_name validationdata\val.bin
+learn targetdir trainingdata epochs 10000 batchsize 1000000 use_draw_in_training 1 use_draw_in_validation 1 lr 1 lambda 1 eval_limit 32000 nn_batch_size 1000 newbob_decay 0.5 eval_save_interval 250000000 loss_output_interval 1000000 validation_set_file_name validationdata\val.binpack
 ```
-Nets get saved in the "evalsave" folder. 

-#### Training Parameters
- eta is the learning rate
- lambda is the amount of weight it puts to eval of learning data vs win/draw/loss results. 1 puts all weight on eval, lambda 0 puts all weight on WDL results.
+This will utilize training data files in the "trainingdata" directory and validation data from file "validationdata\val.binpack". Produced nets are saved in the "evalsave" folder.

-### Reinforcement Learning
-If you would like to do some reinforcement learning on your original network, you must first generate training data using the learn binaries with the setting `Use NNUE` set to true. Make sure that your previously trained network is in the eval folder. Use the commands specified above. Make sure `SkipLoadingEval` is set to false so that the data generated is using the neural net's eval by typing the command `setoption name SkipLoadingEval value false` before typing the `isready` command. You should aim to generate less positions than the first run, around 1/10 of the number of positions generated in the first run. The depth should be higher as well. You should also do the same for validation data, with the depth being higher than the last run.
+More information about learn and available parameters can be found in the [docs](docs/learn.md)

-After you have generated the training data, you must move it into your training data folder and delete the older data so that the binary does not accidentally train on the same data again. Do the same for the validation data and name it to val-1.bin to make it less confusing. Make sure the evalsave folder is empty. Then, using the same binary, type in the training commands shown above. Do __NOT__ set `SkipLoadingEval` to true, it must be false or you will get a completely new network, instead of a network trained with reinforcement learning. You should also set eval_save_interval to a number that is lower than the amount of positions in your training data, perhaps also 1/10 of the original value. The validation file should be set to the new validation data, not the old data.
+#### Reinforcement Learning

-After training is finished, your new net should be located in the "final" folder under the "evalsave" directory. You should test this new network against the older network to see if there are any improvements.
+If you would like to do some reinforcement learning on your original network, you must first generate training data with the setting `Use NNUE` set to `pure` and using the previous network (either name it "nn.bin" and put it alongside the binary or provide the `EvalFile` UCI option). Use the commands specified above. You should aim to generate fewer positions than the first run, around 1/10 of the number of positions generated in the first run. The depth should be higher as well. You should also do the same for validation data, with the depth being higher than the last run.
+
+After you have generated the training data, you must move it into your training data folder and move the older data so that the binary does not train on the same data again. Do the same for the validation data. Make sure the "evalsave" folder is empty. Then, using the same binary, type in the training commands shown above. Do __NOT__ set `SkipLoadingEval` to true, it must be false or you will get a completely new network, instead of a network trained with reinforcement learning. You should also set `eval_save_interval` to a number that is lower than the amount of positions in your training data, perhaps also 1/10 of the original value.
+
+After training is finished, your new net should be located in the "final" folder under the "evalsave" directory. You should test this new network against the older network to see if there are any improvements. Don't rely on the automatic rejection for network quality; sometimes even rejected nets can be better than the previous ones.

 ## Using Your Trained Net
+
 If you want to use your generated net, copy the net located in the "final" folder under the "evalsave" directory and move it into a new folder named "eval" under the directory with the binaries. You can then use the halfkp_256x2 binaries pertaining to your CPU with a standard chess GUI, such as Cutechess. Refer to the [releases page](https://abrok.eu/stockfish) to find out which binary is best for your CPU.

-If the engine does not load any net file, or shows "Error! *** not found or wrong format", please try to specify the net with the full file path with the "EvalFile" option by typing the command `setoption name EvalFile value path` where path is the full file path. The "Use NNUE" option must be set to true with the command `setoption name Use NNUE value true`.
+If the engine does not load any net file, or shows "Error! *** not found or wrong format", please try to specify the net with the full file path with the `EvalFile` UCI option by typing the command `setoption name EvalFile value path` where path is the full file path. The `Use NNUE` UCI option must be set either to `true` or `pure` with the command `setoption name Use NNUE value true/pure`.
+
+## Training data formats.
+
+Currently there are 3 training data formats. Two of them are supported directly.
+
+- `.bin` - the original training data format. Uses 40 bytes per entry. Is supported directly by the `gensfen` and `learn` commands.
+- `.plain` - a human readable training data format. This one is not supported directly by the `gensfen` and `learn` commands. It should not be used for data exchange because it's less compact than other formats. It is mostly useful for inspection of the data.
+- `.binpack` - a compact binary training data format that exploits position chains to further reduce size. It uses on average between 2 and 3 bytes per entry when generating data with `gensfen`. It is supported directly by the `gensfen` and `learn` commands. It is currently the default for the `gensfen` command. A more in-depth description can be found [here](docs/binpack.md).
+
+### Conversion between formats.
+
+There is a built-in converter that supports all 3 formats described above. Any of them can be converted to any other. For more information and a usage guide see [here](docs/convert.md).

 ## Resources
+
+- [Training NNUE for SF](https://docs.google.com/document/d/1os5GH8GGJbV0nKAfXD-qySBclFzKKtXKHbAnA-un8tA/edit) google document with important information and coding priorities
+- [Gensfen data (vondele)](https://drive.google.com/drive/folders/1mftuzYdl9o6tBaceR3d_VBQIrgKJsFpl) over 2b fens available
 - [Stockfish NNUE Wiki](https://www.qhapaq.org/shogi/shogiwiki/stockfish-nnue/)
 - [Training instructions](https://twitter.com/mktakizawa/status/1273042640280252416) from the creator of the Elmo shogi engine
 - [Original Talkchess thread](http://talkchess.com/forum3/viewtopic.php?t=74059) discussing Stockfish NNUE
- [Guide to Stockfish NNUE](http://yaneuraou.yaneu.com/2020/06/19/stockfish-nnue-the-complete-guide/) 
+- [Guide to Stockfish NNUE](http://yaneuraou.yaneu.com/2020/06/19/stockfish-nnue-the-complete-guide/)
 - [Unofficial Stockfish Discord](https://discord.gg/nv8gDtt)

 A more updated list can be found in the #sf-nnue-resources channel in the Discord.
@@ -1,154 +1,173 @@
-Contributors with >10,000 CPU hours as of January 7, 2020
+Contributors with >10,000 CPU hours as of Sept 2, 2020
 Thank you!

 Username                  CPU Hours   Games played
 --------------------------------------------------
-noobpwnftw                  9305707      695548021
-mlang                        780050       61648867
-dew                          621626       43921547
-mibere                       524702       42238645
-crunchy                      354587       27344275
-cw                           354495       27274181
-fastgm                       332801       22804359
-JojoM                        295750       20437451
-CSU_Dynasty                  262015       21828122
-Fisherman                    232181       18939229
-ctoks                        218866       17622052
-glinscott                    201989       13780820
-tvijlbrief                   201204       15337115
-velislav                     188630       14348485
-gvreuls                      187164       15149976
-bking_US                     180289       11876016
-nordlandia                   172076       13467830
-leszek                       157152       11443978
-Thanar                       148021       12365359
-spams                        141975       10319326
-drabel                       138073       11121749
-vdv                          137850        9394330
-mgrabiak                     133578       10454324
-TueRens                      132485       10878471
-bcross                       129683       11557084
-marrco                       126078        9356740
-sqrt2                        125830        9724586
-robal                        122873        9593418
-vdbergh                      120766        8926915
-malala                       115926        8002293
-CoffeeOne                    114241        5004100
-dsmith                       113189        7570238
-BrunoBanani                  104644        7436849
-Data                          92328        8220352
-mhoram                        89333        6695109
-davar                         87924        7009424
-xoto                          81094        6869316
-ElbertoOne                    80899        7023771
-grandphish2                   78067        6160199
-brabos                        77212        6186135
-psk                           75733        5984901
-BRAVONE                       73875        5054681
-sunu                          70771        5597972
-sterni1971                    70605        5590573
-MaZePallas                    66886        5188978
-Vizvezdenec                   63708        4967313
-nssy                          63462        5259388
-jromang                       61634        4940891
-teddybaer                     61231        5407666
-Pking_cda                     60099        5293873
-solarlight                    57469        5028306
-dv8silencer                   56913        3883992
-tinker                        54936        4086118
-renouve                       49732        3501516
-Freja                         49543        3733019
-robnjr                        46972        4053117
-rap                           46563        3219146
-Bobo1239                      46036        3817196
-ttruscott                     45304        3649765
-racerschmacer                 44881        3975413
-finfish                       44764        3370515
-eva42                         41783        3599691
-biffhero                      40263        3111352
-bigpen0r                      39817        3291647
-mhunt                         38871        2691355
-ronaldjerum                   38820        3240695
-Antihistamine                 38785        2761312
-pb00067                       38038        3086320
-speedycpu                     37591        3003273
-rkl                           37207        3289580
-VoyagerOne                    37050        3441673
-jbwiebe                       35320        2805433
-cuistot                       34191        2146279
-homyur                        33927        2850481
-manap                         32873        2327384
-gri                           32538        2515779
-oryx                          31267        2899051
-EthanOConnor                  30959        2090311
-SC                            30832        2730764
-csnodgrass                    29505        2688994
-jmdana                        29458        2205261
-strelock                      28219        2067805
-jkiiski                       27832        1904470
-Pyafue                        27533        1902349
-Garf                          27515        2747562
-eastorwest                    27421        2317535
-slakovv                       26903        2021889
-Prcuvu                        24835        2170122
-anst                          24714        2190091
-hyperbolic.tom                24319        2017394
-Patrick_G                     23687        1801617
-Sharaf_DG                     22896        1786697
-nabildanial                   22195        1519409
-chriswk                       21931        1868317
-achambord                     21665        1767323
-Zirie                         20887        1472937
-team-oh                       20217        1636708
-Isidor                        20096        1680691
-ncfish1                       19931        1520927
-nesoneg                       19875        1463031
-Spprtr                        19853        1548165
-JanErik                       19849        1703875
-agg177                        19478        1395014
-SFTUser                       19231        1567999
-xor12                         19017        1680165
-sg4032                        18431        1641865
-rstoesser                     18118        1293588
-MazeOfGalious                 17917        1629593
-j3corre                       17743         941444
-cisco2015                     17725        1690126
-ianh2105                      17706        1632562
-dex                           17678        1467203
-jundery                       17194        1115855
-iisiraider                    17019        1101015
-horst.prack                   17012        1465656
-Adrian.Schmidt123             16563        1281436
-purplefishies                 16342        1092533
-wei                           16274        1745989
-ville                         16144        1384026
-eudhan                        15712        1283717
-OuaisBla                      15581         972000
-DragonLord                    15559        1162790
-dju                           14716         875569
-chris                         14479        1487385
-0xB00B1ES                     14079        1001120
-OssumOpossum                  13776        1007129
-enedene                       13460         905279
-bpfliegel                     13346         884523
-Ente                          13198        1156722
-IgorLeMasson                  13087        1147232
-jpulman                       13000         870599
-ako027ako                     12775        1173203
-Nikolay.IT                    12352        1068349
-Andrew Grant                  12327         895539
-joster                        12008         950160
-AdrianSA                      11996         804972
-Nesa92                        11455        1111993
-fatmurphy                     11345         853210
-Dark_wizzie                   11108        1007152
-modolief                      10869         896470
-mschmidt                      10757         803401
-infinity                      10594         727027
-mabichito                     10524         749391
-Thomas A. Anderson            10474         732094
-thijsk                        10431         719357
-Flopzee                       10339         894821
-crocogoat                     10104        1013854
-SapphireBrand                 10104         969604
-stocky                        10017         699440
+noobpwnftw                 19352969     1231459677
+mlang                        957168       61657446
+dew                          949885       56893432
+mibere                       703817       46865007
+crunchy                      427035       27344275
+cw                           416006       27521077
+JojoM                        415904       24479564
+fastgm                       404873       23953472
+CSU_Dynasty                  335774       22850550
+tvijlbrief                   335199       21871270
+Fisherman                    325053       21786603
+gvreuls                      311480       20751516
+ctoks                        275877       18710423
+velislav                     241267       15596372
+glinscott                    217799       13780820
+nordlandia                   211692       13484886
+bcross                       206213       14934233
+bking_US                     198894       11876016
+leszek                       189170       11446821
+mgrabiak                     183896       11778092
+drabel                       181408       12489478
+TueRens                      181349       12192000
+Thanar                       179852       12365359
+vdv                          175171        9881246
+robal                        166948       10702862
+spams                        157128       10319326
+marrco                       149947        9376421
+sqrt2                        147963        9724586
+vdbergh                      137041        8926915
+CoffeeOne                    136294        5004100
+malala                       136182        8002293
+mhoram                       128934        8177193
+davar                        122092        7960001
+dsmith                       122059        7570238
+xoto                         119696        8222144
+grandphish2                  116481        7582197
+Data                         113305        8220352
+BrunoBanani                  112960        7436849
+ElbertoOne                    99028        7023771
+MaZePallas                    98571        6362619
+brabos                        92118        6186135
+psk                           89957        5984901
+sunu                          88463        6007033
+sterni1971                    86948        5613788
+Vizvezdenec                   83752        5343724
+BRAVONE                       81239        5054681
+nssy                          76497        5259388
+teddybaer                     75125        5407666
+Pking_cda                     73776        5293873
+jromang                       70695        4940891
+solarlight                    70517        5028306
+dv8silencer                   70287        3883992
+Bobo1239                      68515        4652287
+racerschmacer                 67468        4935996
+manap                         66273        4121774
+tinker                        63458        4213726
+linrock                       59082        4516053
+robnjr                        57262        4053117
+Freja                         56938        3733019
+ttruscott                     56005        3679485
+renouve                       53811        3501516
+cuistot                       52532        3014920
+finfish                       51360        3370515
+eva42                         51272        3599691
+rkl                           50759        3840947
+rap                           49985        3219146
+pb00067                       49727        3298270
+ronaldjerum                   47654        3240695
+bigpen0r                      47278        3291647
+biffhero                      46564        3111352
+VoyagerOne                    45386        3445881
+speedycpu                     43842        3003273
+jbwiebe                       43305        2805433
+Antihistamine                 41788        2761312
+mhunt                         41735        2691355
+eastorwest                    40387        2812173
+homyur                        39893        2850481
+gri                           39871        2515779
+oryx                          38228        2941656
+0x3C33                        37773        2529097
+SC                            37290        2731014
+csnodgrass                    36207        2688994
+jmdana                        36108        2205261
+strelock                      34716        2074055
+Garf                          33800        2747562
+EthanOConnor                  33370        2090311
+slakovv                       32915        2021889
+Spprtr                        32591        2139601
+Prcuvu                        30377        2170122
+anst                          30301        2190091
+jkiiski                       30136        1904470
+hyperbolic.tom                29840        2017394
+Pyafue                        29650        1902349
+OuaisBla                      27629        1578000
+chriswk                       26902        1868317
+achambord                     26582        1767323
+Patrick_G                     26276        1801617
+yorkman                       26193        1992080
+SFTUser                       25182        1675689
+nabildanial                   24942        1519409
+Sharaf_DG                     24765        1786697
+ncfish1                       24411        1520927
+agg177                        23890        1395014
+JanErik                       23408        1703875
+Isidor                        23388        1680691
+Norabor                       22976        1587862
+cisco2015                     22880        1759669
+Zirie                         22542        1472937
+team-oh                       22272        1636708
+MazeOfGalious                 21978        1629593
+sg4032                        21945        1643065
+ianh2105                      21725        1632562
+xor12                         21628        1680365
+dex                           21612        1467203
+nesoneg                       21494        1463031
+horst.prack                   20878        1465656
+0xB00B1ES                     20590        1208666
+j3corre                       20405         941444
+Adrian.Schmidt123             20316        1281436
+wei                           19973        1745989
+rstoesser                     19569        1293588
+eudhan                        19274        1283717
+Ente                          19070        1373058
+jundery                       18445        1115855
+iisiraider                    18247        1101015
+ville                         17883        1384026
+chris                         17698        1487385
+purplefishies                 17595        1092533
+DragonLord                    17014        1162790
+dju                           16515         929427
+IgorLeMasson                  16064        1147232
+ako027ako                     15671        1173203
+Nikolay.IT                    15154        1068349
+Andrew Grant                  15114         895539
+yurikvelo                     15027        1165616
+OssumOpossum                  14857        1007129
+enedene                       14476         905279
+bpfliegel                     14298         884523
+jpulman                       13982         870599
+joster                        13794         950160
+Nesa92                        13786        1114691
+Dark_wizzie                   13422        1007152
+Hjax                          13350         900887
+Fifis                         13313         965473
+mabichito                     12903         749391
+thijsk                        12886         722107
+crocogoat                     12876        1048802
+AdrianSA                      12860         804972
+Flopzee                       12698         894821
+fatmurphy                     12547         853210
+SapphireBrand                 12416         969604
+modolief                      12386         896470
+scuzzi                        12362         833465
+pgontarz                      12151         848794
+stocky                        11954         699440
+mschmidt                      11941         803401
+infinity                      11470         727027
+torbjo                        11387         728873
+Thomas A. Anderson            11372         732094
+snicolet                      11106         869170
+amicic                        10779         733593
+rpngn                         10712         688203
+d64                           10680         771144
+basepi                        10637         744851
+jjoshua2                      10559         670905
+dzjp                          10343         732529
+ols                           10259         570669
+lbraesch                      10252         647825
@@ -63,7 +63,7 @@ build_script:
  - cmake --build . --config %CONFIGURATION% -- /verbosity:minimal
  - ps: |
      # Download default NNUE net from fishtest
-      $nnuenet = Get-Content -Path src\ucioption.cpp | Select-String -CaseSensitive -Pattern "Option" | Select-String -CaseSensitive -Pattern "nn-[a-z0-9]{12}.nnue"
+      $nnuenet = Get-Content -Path src\evaluate.h | Select-String -CaseSensitive -Pattern "EvalFileDefaultName" | Select-String -CaseSensitive -Pattern "nn-[a-z0-9]{12}.nnue"
      $dummy = $nnuenet -match "(?<nnuenet>nn-[a-z0-9]{12}.nnue)"
      $nnuenet = $Matches.nnuenet
      Write-Host "Default net:" $nnuenet
@@ -0,0 +1,42 @@
+# Binpack
+
+Binpack is a binary training data storage format designed to take advantage of position chains differing by a single move. Therefore it is very good at compactly storing data generated from real games (as opposed to random positions for example sourced from an opening book).
+
+It is currently implemented through a single header library in `extra/nnue_data_binpack_format.h`.
+
+Below follows a rough description of the format in a BNF-like notation.
+
+```
+[[nodiscard]] std::uint16_t signedToUnsigned(std::int16_t a) {
+    std::uint16_t r;
+    std::memcpy(&r, &a, sizeof(std::uint16_t));
+    if (r & 0x8000) r ^= 0x7FFF; // flip value bits if negative
+    r = (r << 1) | (r >> 15); // store sign bit at bit 0
+    return r;
+}
+
+file := <block>*
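+block := BINP<chunk_size><chain>* (chunk_size: size in bytes of the chains that follow in this block, 4 bytes, little endian)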
+chain := <stem><movetext>
+stem := <pos><move><score><ply_and_result><rule50> (32 bytes)
+pos := https://github.com/Sopel97/nnue_data_compress/blob/master/src/chess/Position.h#L1166 (24 bytes)
+move := https://github.com/Sopel97/nnue_data_compress/blob/master/src/chess/Chess.h#L1044 (2 bytes)
+score := signedToUnsigned(score) (2 bytes, big endian)
+ply_and_result := ply bitwise_or (signedToUnsigned(result) << 14) (2 bytes, big endian)
+rule50 := rule_50_counter (2 bytes, big endian)
+    // this is a small defect from old version,
+    I didn't want to break backwards compatibility. Effectively means that there's
+    one byte left for something else in the future because rule50 always fits in one byte.
+
+movetext := <count><move_and_score>*
+count := number of plies in the movetext (2 bytes, big endian). Can be 0.
+move_and_score := <encoded_move><encoded_score> (~2 bytes)
+encoded_move := oof this one is complicated to explain.
+    https://github.com/Sopel97/nnue_data_compress/blob/master/src/compress_file.cpp#L827.
+    https://github.com/Sopel97/chess_pos_db/blob/master/docs/bcgn/variable_length.md
+
+encoded_score := https://en.wikipedia.org/wiki/Variable-width_encoding
+    with block size of 4 bits + 1 bit for extension bit.
+    Encoded value is signedToUnsigned(-prev_score - current_score)
+    (scores are always seen from the perspective of side to move in <pos>, that's why the '-' before prev_score)
+```
@@ -0,0 +1,18 @@
+# Convert
+
+`convert` allows conversion of training data between any of `.plain`, `.bin`, and `.binpack`.
+
+Like all commands in stockfish, `convert` can be invoked either from the command line (as `stockfish.exe convert ...`) or from the interactive prompt.
+
+The syntax of this command is as follows:
+```
+convert from_path to_path [append] [validate]
+```
+
+`from_path` is the path to the file to convert from. The type of the data is deduced based on its extension (one of `.plain`, `.bin`, `.binpack`).
+`to_path` is the path to an output file. The type of the data is deduced from its extension. If the file does not exist it is created.
+
+`append` and `validate` can come in any order and are optional.
+If `append` is not specified then the output file will be truncated prior to any writes. If `append` is specified then the converted training data will be appended to the end of the output file.
+
+If `validate` is specified then the conversion will stop on the first illegal move found and a diagnostic will be shown.
@@ -0,0 +1,67 @@
+# Gensfen
+
+`gensfen` command allows generation of training data from self-play in a manner that suits training better than traditional games. It introduces random moves to diversify openings, and fixed depth evaluation.
+
+Like all commands in stockfish, `gensfen` can be invoked either from the command line (as `stockfish.exe gensfen ...`, but this is not recommended because it's not possible to specify UCI options before `gensfen` executes) or from the interactive prompt.
+
+It is recommended to set the `PruneAtShallowDepth` UCI option to `false` as it will increase the quality of fixed depth searches.
+
+It is recommended to keep the `EnableTranspositionTable` UCI option at the default `true` value as it will make the generation process faster without noticeably harming the uniformity of the data.
+
+`gensfen` takes named parameters in the form of `gensfen param_1_name param_1_value param_2_name param_2_value ...`.
+
+Currently the following options are available:
+
+`set_recommended_uci_options` - this is a modifier not a parameter, no value follows it. If specified then some UCI options are set to recommended values.
+
+`depth` - minimum depth of evaluation of each position. Default: 3.
+
+`depth2` - maximum depth of evaluation of each position. If not specified then the same as `depth`.
+
+`nodes` - the number of nodes to use for evaluation of each position. This number is multiplied by the number of PVs of the current search. This does NOT override the `depth` and `depth2` options. If specified then whichever of depth or nodes limit is reached first applies.
+
+`loop` - the number of training data entries to generate. 1 entry == 1 position. Default: 8000000000 (8B).
+
+`output_file_name` - the name of the file to output to. If the extension is not present or doesn't match the selected training data format the right extension will be appended. Default: generated_kifu
+
+`eval_limit` - evaluations with higher absolute value than this will not be written and will terminate a self-play game. Should not exceed 10000 which is VALUE_KNOWN_WIN, but is only hardcapped at mate in 2 (\~30000). Default: 3000
+
+`random_move_minply` - the minimal ply at which a random move may be executed instead of a move chosen by search. Default: 1.
+
+`random_move_maxply` - the maximal ply at which a random move may be executed instead of a move chosen by search. Default: 24.
+
+`random_move_count` - maximum number of random moves in a single self-play game. Default: 5.
+
+`random_move_like_apery` - either 0 or 1. If 1 then random king moves will be followed by a random king move from the opponent whenever possible with 50% probability. Default: 0.
+
+`random_multi_pv` - the number of PVs used for determining the random move. If not specified then a truly random move will be chosen. If specified then a multiPV search will be performed and the random move will be one of the moves chosen by the search.
+
+`random_multi_pv_diff` - Makes the multiPV random move selection consider only moves that are at most `random_multi_pv_diff` worse than the next best move. Default: 30000 (all multiPV moves).
+
+`random_multi_pv_depth` - the depth to use for multiPV search for random move. Default: `depth2`.
+
+`write_minply` - minimum ply for which the training data entry will be emitted. Default: 16.
+
+`write_maxply` - maximum ply for which the training data entry will be emitted. Default: 400.
+
+`book` - a path to an opening book to use for the starting positions. Currently only .epd format is supported. If not specified then the starting position is always the standard chess starting position.
+
+`save_every` - the number of training data entries per file. If not specified then there will be always one file. If specified there may be more than one file generated (each having at most `save_every` training data entries) and each file will have a unique number attached.
+
+`random_file_name` - if specified then the output filename will be chosen randomly. Overrides `output_file_name`.
+
+`write_out_draw_game_in_training_data_generation` - either 0 or 1. If 1 then training data from drawn games will be emitted too. Default: 1.
+
+`use_draw_in_training_data_generation` - deprecated, alias for `write_out_draw_game_in_training_data_generation`
+
+`detect_draw_by_consecutive_low_score` - either 0 or 1. If 1 then drawn games will be adjudicated when the score remains 0 for at least 8 plies after ply 80. Default: 1.
+
+`use_game_draw_adjudication` - deprecated, alias for `detect_draw_by_consecutive_low_score`
+
+`detect_draw_by_insufficient_mating_material` - either 0 or 1. If 1 then positions with insufficient material will be adjudicated as draws. Default: 1.
+
+`sfen_format` - format of the training data to use. Either `bin` or `binpack`. Default: `binpack`.
+
+`ensure_quiet` - this is a flag option. When specified the positions will be from the qsearch leaf.
+
+`seed` - seed for the PRNG. Can be either a number or a string. If it's a string then its hash will be used. If not specified then the current time will be used.
@@ -0,0 +1,114 @@
+# Learn
+
+`learn` command allows training a network from training data.
+
+Like all commands in stockfish, `learn` can be invoked either from the command line (as `stockfish.exe learn ...`, but this is not recommended because it's not possible to specify UCI options before `learn` executes) or from the interactive prompt.
+
+`learn` takes named parameters in the form of `learn param_1_name param_1_value param_2_name param_2_value ...`. Unrecognized parameters form a list of paths to training data files.
+
+It is recommended to set the `EnableTranspositionTable` UCI option to `false` to reduce the interference between qsearches which are used to provide shallow evaluation. Using TT may cause the shallow evaluation to diverge from the real evaluation of the net, hiding imperfections.
+
+It is recommended to set the `PruneAtShallowDepth` UCI option to `false` as it will provide more accurate shallow evaluation.
+
+It is **required** to set the `Use NNUE` UCI option to `pure` as otherwise the function being optimized will not always match the function being probed, in which case not much can be learned.
+
+Currently the following options are available:
+
+`set_recommended_uci_options` - this is a modifier not a parameter, no value follows it. If specified then some UCI options are set to recommended values.
+
+`bat` - the size of a batch in multiples of 10000. This determines how many entries are read and shuffled at once during training. Default: 100 (meaning batch size of 1000000).
+
+`targetdir` - path to the directory from which training data will be read. All files in this directory are read sequentially. If not specified then only the list of files from positional arguments will be used. If specified then files from the given directory will be used after the explicitly specified files.
+
+`epochs` - the number of weight update cycles (epochs) to train the network for. One such cycle is `batchsize` positions. If not specified then the training will loop forever.
+
+`basedir` - the base directory for the paths. Default: "" (current directory)
+
+`batchsize` - same as `bat` but doesn't scale by 10000. Default: 1000000
+
+`lr` - initial learning rate. Default: 1.
+
+`use_draw_games_in_training` - either 0 or 1. If 1 then draws will be used in training too. Default: 1.
+
+`use_draw_in_training` - deprecated, alias for `use_draw_games_in_training`
+
+`use_draw_games_in_validation` - either 0 or 1. If 1 then draws will be used in validation too. Default: 1.
+
+`use_draw_in_validation` - deprecated, alias for `use_draw_games_in_validation`
+
+`skip_duplicated_positions_in_training` - either 0 or 1. If 1 then a small hashtable will be used to try to eliminate duplicated positions from training. Default: 0.
+
+`use_hash_in_training` - deprecated, alias for `skip_duplicated_positions_in_training`
+
+`winning_probability_coefficient` - some magic value for winning probability. If you need to read this then don't touch it. Default: 1.0 / PawnValueEg / 4.0 * std::log(10.0)
+
+`use_wdl` - either 0 or 1. If 1 then the evaluations will be converted to win/draw/loss percentages prior to learning on them. (Slightly changes the gradient because eval has a different derivative than wdl). Default: 0.
+
+`lambda` - value in range [0..1]. 1 means that only evaluation is used for learning, 0 means that only game result is used. Values in between result in interpolation between the two contributions. See `lambda_limit` for when this is applied. Default: 1.0.
+
+`lambda2` - value in range [0..1]. 1 means that only evaluation is used for learning, 0 means that only game result is used. Values in between result in interpolation between the two contributions. See `lambda_limit` for when this is applied. Default: 1.0.
+
+`lambda_limit` - the maximum absolute score value for which `lambda` is used as opposed to `lambda2`. For positions with absolute evaluation higher than `lambda_limit` `lambda2` will be used. Default: 32000 (so always `lambda`).
+
+`max_grad` - the maximum allowed loss gradient for backpropagation. Effectively a form of gradient clipping. Useful for the first iterations with a randomly generated net as with higher lr backpropagation often overshoots and kills the net. The default value is fairly conservative, values as low as 0.25 could be used with lr of 1.0 without problems. Default: 1.0.
+
+`reduction_gameply` - the minimum ply after which positions won't be skipped. Positions at plies below this value are skipped with a probability that lessens linearly with the ply (reaching 0 at `reduction_gameply`). Default: 1.
+
+`eval_limit` - positions with absolute evaluation higher than this will be skipped. Default: 32000 (nothing is skipped).
+
+`save_only_once` - this is a modifier not a parameter, no value follows it. If specified then there will be only one network file generated.
+
+`no_shuffle` - this is a modifier not a parameter, no value follows it. If specified then data within a batch won't be shuffled.
+
+`nn_batch_size` - minibatch size used for learning. Should be smaller than batch size. Default: 1000.
+
+`newbob_decay` - learning rate will be multiplied by this factor every time a net is rejected (so in other words it controls LR drops). Default: 0.5 (no LR drops)
+
+`assume_quiet` - this is a flag option. When specified learn will not perform qsearch to reach a quiet position.
+
+`smart_fen_skipping` - this is a flag option. When specified some positions that are not good candidates for teaching are skipped. This includes positions where the best move is a capture or promotion, and positions where a king is in check.
+
+`newbob_num_trials` - determines after how many subsequent rejected nets the training process will be terminated. Default: 4.
+
+`auto_lr_drop` - every time this many positions are processed the learning rate is multiplied by `newbob_decay`. In other words this value specifies for how many positions a single learning rate stage lasts. If 0 then doesn't have any effect. Default: 0.
+
+`nn_options` - if you're reading this you don't use it. It passes messages directly to the network evaluation. I don't know what it can do either.
+
+`eval_save_interval` - every `eval_save_interval` positions the network will be saved and either accepted or rejected (in which case an LR drop follows). Default: 100000000 (100M). (generally people use values in 10M-100M range)
+
+`loss_output_interval` - every `loss_output_interval` fitness statistics are displayed. Default: 1000000 (1M)
+
+`validation_set_file_name` - path to the file with training data to be used for validation (loss computation and move accuracy)
+
+`sfen_read_size` - the number of sfens to always keep in the buffer. Default: 10000000 (10M)
+
+`thread_buffer_size` - the number of sfens to copy at once to each thread requesting more sfens for learning. Default: 10000
+
+`seed` - seed for the PRNG. Can be either a number or a string. If it's a string then its hash will be used. If not specified then the current time will be used.
+
+`verbose` - this is a modifier, not a parameter. When used there will be more detailed output during training.
+
+## Legacy subcommands and parameters
+
+### Convert
+
+`convert_plain`
+`convert_bin`
+`interpolate_eval`
+`check_invalid_fen`
+`check_illegal_move`
+`convert_bin_from_pgn-extract`
+`pgn_eval_side_to_move`
+`convert_no_eval_fens_as_score_zero`
+`src_score_min_value`
+`src_score_max_value`
+`dest_score_min_value`
+`dest_score_max_value`
+
+### Shuffle
+
+`shuffle`
+`buffer_size`
+`shuffleq`
+`shufflem`
+`output_file_name`
@@ -0,0 +1,21 @@
+# Transform
+
+`transform` command exposes subcommands that perform some specific transformation over data. The call syntax is `transform <subcommand>`. Currently implemented subcommands are listed and described below.
+
+## `nudged_static`
+
+`transform nudged_static` takes named parameters in the form of `transform nudged_static param_1_name param_1_value param_2_name param_2_value ...` and flag parameters which don't require values.
+
+This command goes through positions in the input files and replaces the scores with new ones - generated from static eval - but slightly adjusted based on the scores in the original input file.
+
+Currently the following options are available:
+
+`input_file` - path to the input file. Supports bin and binpack formats. Default: in.binpack.
+
+`output_file` - path to the output file. Supports bin and binpack formats. Default: out.binpack.
+
+`absolute` - states that the adjustment should be bounded by an absolute value. After this token follows the maximum absolute adjustment. Values are always adjusted towards scores in the input file. This is the default mode. Default maximum adjustment: 5.
+
+`relative` - states that the adjustment should be bounded by a value relative in magnitude to the static eval value. After this token follows the maximum relative change - a floating point value greater than 0. For example a value of 0.1 only allows changing the static eval by at most 10% towards the score from the input file.
+
+`interpolate` - states that the output score should be a value interpolated between static eval and the score from the input file. After this token follows the interpolation constant `t`. `t` of 0 means that only static eval is used. `t` of 1 means that only score from the input file is used. `t` of 0.5 means that the static eval and input score are averaged. It accepts values outside of range `<0, 1>`, but the usefulness is questionable.
@@ -0,0 +1,42 @@
+import sys
+
+ENTRY_SIZE = 40
+NUM_ENTRIES_IN_CHUNK = 1024*1024
+
+# Copy `count` entries of ENTRY_SIZE bytes each from `infile` to `outfile`,
+# reading from infile's current position.
+#   infile  - readable binary file object
+#   outfile - writable binary file object
+#   count   - number of entries to copy
+#   times   - how many times the copied range is written to outfile
+def copy(infile, outfile, count, times):
+    if times > 1:
+        # The whole range is read with a single read() and the resulting bytes
+        # object is repeated `times` times before being written.
+        outfile.write(infile.read(count*ENTRY_SIZE)*times)
+    else:
+        # Any `times` value that is not greater than 1 ends up here and copies
+        # the range exactly once, at most NUM_ENTRIES_IN_CHUNK entries per read.
+        offset = 0
+        while offset < count:
+            # Full chunk while one still fits, otherwise the remaining tail.
+            to_read = NUM_ENTRIES_IN_CHUNK if offset + NUM_ENTRIES_IN_CHUNK <= count else count - offset
+
+            outfile.write(infile.read(to_read*ENTRY_SIZE))
+
+            offset += NUM_ENTRIES_IN_CHUNK
+
+# Extract a range of entries from the file named on the command line.
+#
+# Reads sys.argv: filename, offset (entries to skip), count (entries to
+# extract) and an optional times (repeat count, default 1). The result is
+# written next to the input as `<stem>_<offset>_<count>_<times><ext>`.
+def work():
+    import os  # local import: only needed to split the output path
+
+    filename = sys.argv[1]
+    offset = int(sys.argv[2])
+    count = int(sys.argv[3])
+    times = int(sys.argv[4]) if len(sys.argv) >= 5 else 1
+
+    # splitext only looks at the last path component, so dots in directory
+    # names and extension-less filenames no longer produce a mangled path.
+    # For an ordinary `name.bin` input the result is unchanged.
+    stem, ext = os.path.splitext(filename)
+    out_path = stem + '_' + str(offset) + '_' + str(count) + '_' + str(times) + ext
+
+    with open(filename, 'rb') as infile:
+        # Skip the first `offset` entries.
+        infile.seek(offset * ENTRY_SIZE)
+        with open(out_path, 'wb') as outfile:
+            copy(infile, outfile, count, times)
+
+# Print the command-line usage of this script to stdout.
+def show_help():
+    print('Usage: python extract_bin.py filename offset count [times]')
+    print('filename - the path to the .bin file to process')
+    print('offset - the number of sfens to skip')
+    print('count - the number of sfens to extract')
+    print('times - the number of times to repeat the extracted sfens. Default = 1')
+    print('The result is saved in a new file named `filename.stem`_`offset`_`count`_`times`.bin')
+
+# Script entry point: filename, offset and count are mandatory, so anything
+# shorter than 4 argv items (script name included) only prints the usage.
+if len(sys.argv) < 4:
+    show_help()
+else:
+    work()
@@ -0,0 +1,69 @@
+import struct
+import sys
+import os
+import random
+from pathlib import Path
+
+# Build an index of the chunks in a binpack file.
+#   file - binary file object positioned at the first chunk; it must provide
+#          peek() (presumably an io.BufferedReader as returned by
+#          open(..., 'rb') - confirm against callers)
+# Returns a list of (offset, total_size) tuples, one per chunk, where
+# total_size includes the 8-byte chunk header.
+def index_binpack(file):
+    print('Indexing...')
+    index = []
+    offset = 0
+    # NOTE(review): report_every only seeds prev_mib; the reporting interval
+    # in the comparison below is hard-coded to 100.
+    report_every = 100
+    prev_mib = -report_every
+    # peek() returning an empty result ends the loop.
+    while file.peek():
+        # Chunk header: 4 magic bytes followed by a little-endian unsigned
+        # 32-bit size of the data that follows the header.
+        chunk_header = file.read(8)
+        assert chunk_header[0:4] == b'BINP'
+        size = struct.unpack('<I', chunk_header[4:])[0]
+        # Skip the chunk data; only its location and length are recorded.
+        file.seek(size, os.SEEK_CUR)
+        index.append((offset, size + 8))
+        offset += size + 8
+
+        # Print progress whenever the running total enters a new 100 MiB bucket.
+        mib = offset // 1024 // 1024
+        if mib // 100 != prev_mib // 100:
+            print('Indexed {} MiB'.format(mib))
+            prev_mib = mib
+
+    return index
+
+# Copy chunks from `in_file` to `out_file` in the order given by `index`.
+#   in_file  - seekable binary file object to read chunks from
+#   index    - iterable of (offset, size) tuples; offset is absolute in in_file
+#   out_file - writable binary file object
+def copy_binpack_indexed(in_file, index, out_file):
+    print('Copying...')
+    total_size = 0
+    # NOTE(review): report_every only seeds prev_mib; the reporting interval
+    # in the comparison below is hard-coded to 100.
+    report_every = 100
+    prev_mib = -report_every
+    for offset, size in index:
+        in_file.seek(offset, os.SEEK_SET)
+        data = in_file.read(size)
+        # A short read means the index points past the end of the input.
+        assert len(data) == size
+        out_file.write(data)
+
+        # Print progress whenever the running total enters a new 100 MiB bucket.
+        total_size += size
+        mib = total_size // 1024 // 1024
+        if mib // 100 != prev_mib // 100:
+            print('Copied {} MiB'.format(mib))
+            prev_mib = mib
+
+# Shuffle the chunks of a binpack file into a new file.
+#
+# Reads sys.argv: infile (existing binpack file) and outfile (must not exist
+# yet). Chunks are indexed, the index is shuffled, and the chunks are copied
+# to outfile in the shuffled order.
+def main():
+    if len(sys.argv) < 3:
+        print('Usage: python shuffle_binpack.py infile outfile')
+        return
+
+    in_filename = sys.argv[1]
+    out_filename = sys.argv[2]
+
+    if Path(out_filename).exists():
+        print('Output path already exists. Please specify a path to a file that does not exist.')
+        return
+
+    # Context managers close both files even if indexing or copying raises.
+    with open(in_filename, 'rb') as in_file:
+        index = index_binpack(in_file)
+        print('Shuffling...')
+        random.shuffle(index)
+
+        # The output is created only after indexing succeeded, so a malformed
+        # input does not leave an empty file behind that would block the next
+        # run through the "already exists" check above.
+        with open(out_filename, 'wb') as out_file:
+            copy_binpack_indexed(in_file, index, out_file)
+
+main()
@@ -28,43 +28,49 @@ else
 EXE = stockfish
 endif

-### Installation dir definitions
-PREFIX = /usr/local
-BINDIR = $(PREFIX)/bin
-
-### Built-in benchmark for pgo-builds
-PGOBENCH = ./$(EXE) bench
-PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 100000
-
-### Source and object files
-SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
-	material.cpp misc.cpp movegen.cpp movepick.cpp pawns.cpp position.cpp psqt.cpp \
-	search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
-	nnue/evaluate_nnue.cpp \
-	nnue/evaluate_nnue_learner.cpp \
-	nnue/features/half_kp.cpp \
-	nnue/features/half_relative_kp.cpp \
-	nnue/features/k.cpp \
-	nnue/features/p.cpp \
-	nnue/features/castling_right.cpp \
-	nnue/features/enpassant.cpp \
-	nnue/nnue_test_command.cpp \
-	extra/sfen_packer.cpp \
-	learn/gensfen2019.cpp \
-	learn/learner.cpp \
-	learn/learning_tools.cpp \
-	learn/multi_think.cpp
-
-OBJS = $(notdir $(SRCS:.cpp=.o))
-
-VPATH = syzygy:nnue:nnue/features:eval:extra:learn
-
 ### Establish the operating system name
 KERNEL = $(shell uname -s)
 ifeq ($(KERNEL),Linux)
 	OS = $(shell uname -o)
 endif

+### Installation dir definitions
+PREFIX = /usr/local
+BINDIR = $(PREFIX)/bin
+
+### Built-in benchmark for pgo-builds
+PGO_TRAINING_DATA_FILE = pgo_training_data.bin
+PGOBENCH = ./$(EXE) bench
+PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 sfen_format bin output_file_name $(PGO_TRAINING_DATA_FILE)
+
+### Source and object files
+SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
+	material.cpp misc.cpp movegen.cpp movepick.cpp pawns.cpp position.cpp psqt.cpp \
+	search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
+	extra/stockfish_blas.cpp \
+	nnue/evaluate_nnue.cpp \
+	nnue/evaluate_nnue_learner.cpp \
+	nnue/features/half_kp.cpp \
+	nnue/features/half_ka.cpp \
+	nnue/features/half_relative_kp.cpp \
+	nnue/features/half_relative_ka.cpp \
+	nnue/features/k.cpp \
+	nnue/features/p.cpp \
+	nnue/features/a.cpp \
+	nnue/features/castling_right.cpp \
+	nnue/features/enpassant.cpp \
+	nnue/nnue_test_command.cpp \
+	learn/sfen_packer.cpp \
+	learn/learn.cpp \
+	learn/gensfen.cpp \
+	learn/opening_book.cpp \
+	learn/convert.cpp \
+	learn/transform.cpp
+
+OBJS = $(notdir $(SRCS:.cpp=.o))
+
+VPATH = syzygy:nnue:nnue/features:eval:extra:learn
+
 ### ==========================================================================
 ### Section 2. High-level Configuration
 ### ==========================================================================
@@ -99,17 +105,23 @@ endif

 ### 2.1. General and architecture defaults

+ifeq ($(ARCH),)
+   ARCH = x86-64-modern
+   help_skip_sanity = yes
+endif
 # explicitly check for the list of supported architectures (as listed with make help),
 # the user can override with `make ARCH=x86-32-vnni256 SUPPORTED_ARCH=true`
-ifeq ($(ARCH),$(filter $(ARCH),x86-64-vnni512 x86-64-vnni256 x86-64-avx512 x86-64-bmi2 x86-64-avx2 \
-                               x86-64-sse41-popcnt x86-64-modern x86-64-ssse3 x86-64-sse3-popcnt \
-                               x86-64 x86-32-sse41-popcnt x86-32-sse2 x86-32 ppc-64 ppc-32 \
-                               armv7 armv7-neon armv8 apple-silicon general-64 general-32))
+ifeq ($(ARCH), $(filter $(ARCH), \
+                 x86-64-vnni512 x86-64-vnni256 x86-64-avx512 x86-64-bmi2 x86-64-avx2 \
+                 x86-64-sse41-popcnt x86-64-modern x86-64-ssse3 x86-64-sse3-popcnt \
+                 x86-64 x86-32-sse41-popcnt x86-32-sse2 x86-32 ppc-64 ppc-32 \
+                 armv7 armv7-neon armv8 apple-silicon general-64 general-32))
   SUPPORTED_ARCH=true
 else
   SUPPORTED_ARCH=false
 endif

+blas = no
 optimize = yes
 debug = no
 sanitize = no
@@ -127,7 +139,6 @@ avx512 = no
 vnni256 = no
 vnni512 = no
 neon = no
-ARCH = x86-64-modern
 STRIP = strip

 ### 2.2 Architecture specific
@@ -306,9 +317,9 @@ endif
 ### ==========================================================================

 ### 3.1 Selecting compiler (default = gcc)
-CXXFLAGS += -Wall -Wcast-qual -fno-exceptions -std=c++17 $(EXTRACXXFLAGS) $(LEARNCXXFLAGS)
-DEPENDFLAGS += -std=c++17
-LDFLAGS += $(EXTRALDFLAGS) $(LEARNLDFLAGS)
+CXXFLAGS += -g -Wall -Wcast-qual -fno-exceptions -std=c++17 -fopenmp -I. $(EXTRACXXFLAGS)
+LDFLAGS += -fopenmp $(EXTRALDFLAGS)
+DEPENDFLAGS += -std=c++17 -I.

 ifeq ($(COMP),)
 	COMP=gcc
@@ -391,19 +402,6 @@ ifeq ($(COMP),clang)
 	endif
 endif

-ifeq ($(comp),icc)
-	profile_make = icc-profile-make
-	profile_use = icc-profile-use
-else
-ifeq ($(comp),clang)
-	profile_make = clang-profile-make
-	profile_use = clang-profile-use
-else
-	profile_make = gcc-profile-make
-	profile_use = gcc-profile-use
-endif
-endif
-
 ifeq ($(KERNEL),Darwin)
 	CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.14
 	LDFLAGS += -arch $(arch) -mmacosx-version-min=10.14
@@ -415,20 +413,30 @@ endif
 # Currently we don't know how to make PGO builds with the NDK yet.
 ifeq ($(COMP),ndk)
 	CXXFLAGS += -stdlib=libc++ -fPIE
+	comp=clang
 	ifeq ($(arch),armv7)
-		comp=armv7a-linux-androideabi16-clang
 		CXX=armv7a-linux-androideabi16-clang++
 		CXXFLAGS += -mthumb -march=armv7-a -mfloat-abi=softfp -mfpu=neon
 		STRIP=arm-linux-androideabi-strip
 	endif
 	ifeq ($(arch),armv8)
-		comp=aarch64-linux-android21-clang
 		CXX=aarch64-linux-android21-clang++
 		STRIP=aarch64-linux-android-strip
 	endif
 	LDFLAGS += -static-libstdc++ -pie -lm -latomic
 endif

+ifeq ($(comp),icc)
+	profile_make = icc-profile-make
+	profile_use = icc-profile-use
+else ifeq ($(comp),clang)
+	profile_make = clang-profile-make
+	profile_use = clang-profile-use
+else
+	profile_make = gcc-profile-make
+	profile_use = gcc-profile-use
+endif
+
 ### Travis CI script uses COMPILER to overwrite CXX
 ifdef COMPILER
 	COMPCXX=$(COMPILER)
@@ -463,14 +471,33 @@ ifneq ($(comp),mingw)
 endif
 endif

-### 3.2.1 Debugging
+### 3.2.1 BLAS libraries
+ifeq ($(blas), yes)
+	LDFLAGS += -lopenblas
+
+	ifeq ($(KERNEL),Linux)
+		LDFLAGS +=
+	else
+		CXXFLAGS += -I/mingw64/include/OpenBLAS
+
+		ifeq ($(debug),yes)
+			LDFLAGS += -Wl,-static
+		else
+			LDFLAGS += -Wl,-s -static
+		endif
+	endif
+
+	CXXFLAGS += -DUSE_BLAS
+endif
+
+### 3.2.2 Debugging
 ifeq ($(debug),no)
 	CXXFLAGS += -DNDEBUG
 else
 	CXXFLAGS += -g
 endif

-### 3.2.2 Debugging with undefined behavior sanitizers
+### 3.2.3 Debugging with undefined behavior sanitizers
 ifneq ($(sanitize),no)
        CXXFLAGS += -g3 -fsanitize=$(sanitize)
        LDFLAGS += -fsanitize=$(sanitize)
@@ -600,11 +627,13 @@ endif
 ### needs access to the optimization flags.
 ifeq ($(optimize),yes)
 ifeq ($(debug), no)
-	ifeq ($(COMP),ndk)
-		CXXFLAGS += -flto=thin
-		LDFLAGS += $(CXXFLAGS)
-	else ifeq ($(comp),clang)
+	ifeq ($(comp),clang)
 		CXXFLAGS += -flto=thin
+		ifneq ($(findstring MINGW,$(KERNEL)),)
+			CXXFLAGS += -fuse-ld=lld
+		else ifneq ($(findstring MSYS,$(KERNEL)),)
+			CXXFLAGS += -fuse-ld=lld
+		endif
 		LDFLAGS += $(CXXFLAGS)

 # GCC and CLANG use different methods for parallelizing LTO and CLANG pretends to be
@@ -628,10 +657,12 @@ ifeq ($(debug), no)
 # So, only enable it for a cross from Linux by default.
 	else ifeq ($(comp),mingw)
 	ifeq ($(KERNEL),Linux)
+	ifneq ($(arch),i386)
 		CXXFLAGS += -flto
 		LDFLAGS += $(CXXFLAGS) -flto=jobserver
 	endif
 	endif
+	endif
 endif
 endif

@@ -707,11 +738,12 @@ help:
 	@echo "make -j build ARCH=x86-64-ssse3 COMP=clang"
 	@echo ""
 	@echo "-------------------------------"
-ifeq ($(SUPPORTED_ARCH), true)
+ifeq ($(SUPPORTED_ARCH)$(help_skip_sanity), true)
 	@echo "The selected architecture $(ARCH) will enable the following configuration: "
 	@$(MAKE) ARCH=$(ARCH) COMP=$(COMP) config-sanity
 else
 	@echo "Specify a supported architecture with the ARCH option for more details"
+	@echo ""
 endif


@@ -719,7 +751,7 @@ endif
        config-sanity icc-profile-use icc-profile-make gcc-profile-use gcc-profile-make \
        clang-profile-use clang-profile-make

-build: config-sanity
+build: net config-sanity
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) all

 profile-build: net config-sanity objclean profileclean
@@ -729,6 +761,7 @@ profile-build: net config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 2/4. Running benchmark for pgo-build ..."
 	$(PGOBENCH) > /dev/null
+	$(PGOGENSFEN) > /dev/null
 	@echo ""
 	@echo "Step 3/4. Building optimized executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
@@ -745,12 +778,13 @@ install:
 	-cp $(EXE) $(BINDIR)
 	-strip $(BINDIR)/$(EXE)

-#clean all
+# clean all
 clean: objclean profileclean
 	@rm -f .depend *~ core

+# evaluation network (nnue)
 net:
-	$(eval nnuenet := $(shell grep EvalFile ucioption.cpp | grep Option | sed 's/.*\(nn-[a-z0-9]\{12\}.nnue\).*/\1/'))
+	$(eval nnuenet := $(shell grep EvalFileDefaultName evaluate.h | grep define | sed 's/.*\(nn-[a-z0-9]\{12\}.nnue\).*/\1/'))
 	@echo "Default net: $(nnuenet)"
 	$(eval nnuedownloadurl := https://tests.stockfishchess.org/api/nn/$(nnuenet))
 	$(eval curl_or_wget := $(shell if hash curl 2>/dev/null; then echo "curl -skL"; elif hash wget 2>/dev/null; then echo "wget -qO-"; fi))
@@ -772,7 +806,6 @@ net:
            echo "shasum / sha256sum not found, skipping net validation"; \
        fi

-
 # clean binaries and objects
 objclean:
 	@rm -f $(EXE) *.o ./syzygy/*.o ./nnue/*.o ./nnue/features/*.o ./learn/*.o ./extra/*.o ./eval/*.o
@@ -782,6 +815,7 @@ profileclean:
 	@rm -rf profdir
 	@rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda *.s ./learn/*.gcda ./extra/*.gcda ./eval/*.gcda
 	@rm -f stockfish.profdata *.profraw
+	@rm -f $(PGO_TRAINING_DATA_FILE)

 default:
 	help
@@ -792,7 +826,7 @@ default:

 all: $(EXE) .depend

-config-sanity:
+config-sanity: net
 	@echo ""
 	@echo "Config:"
 	@echo "debug: '$(debug)'"
@@ -913,6 +947,6 @@ profile-learn: config-sanity objclean profileclean
 	rm generated_kifu.bin

 .depend:
-	-@$(CXX) $(DEPENDFLAGS) -MM $(SRCS) > $@ 2> /dev/null
+	-@$(CXX) $(DEPENDFLAGS) -MM $(SRCS) > $@

 -include .depend
@@ -164,5 +164,7 @@ vector<string> setup_bench(const Position& current, istream& is) {
          ++posCounter;
      }

+  list.emplace_back("setoption name Use NNUE value true");
+
  return list;
 }
@@ -1,82 +0,0 @@
-#ifndef _EVALUATE_COMMON_H_
-#define _EVALUATE_COMMON_H_
-
-// A common header-like function for modern evaluation functions (EVAL_KPPT and EVAL_KPP_KKPT).
-
-#if defined(EVAL_NNUE) || defined(EVAL_LEARN)
-#include <functional>
-
-// KK file name
-#define KK_BIN "KK_synthesized.bin"
-
-// KKP file name
-#define KKP_BIN "KKP_synthesized.bin"
-
-// KPP file name
-#define KPP_BIN "KPP_synthesized.bin"
-
-namespace Eval
-{
-
-#if defined(USE_EVAL_HASH)
-	// prefetch function
-	void prefetch_evalhash(const Key key);
-#endif
-
-	// An operator that applies the function f to each parameter of the evaluation function.
-	// Used for parameter analysis etc.
-	// type indicates the survey target.
-	// type = -1 :KK,KKP,KPP all
-	// type = 0: KK only
-	// type = 1: KKP only
-	// type = 2: KPP only
-	void foreach_eval_param(std::function<void(int32_t, int32_t)>f, int type = -1);
-
-	// --------------------------
-	// for learning
-	// --------------------------
-
-#if defined(EVAL_LEARN)
-	// Initialize the gradient array during learning
-	// Pass the learning rate as an argument. If 0.0, the default value is used.
-	// The epoch of update_weights() gradually changes from eta to eta2 until eta_epoch.
-	// After eta2_epoch, gradually change from eta2 to eta3.
-	void init_grad(double eta1, uint64_t eta_epoch, double eta2, uint64_t eta2_epoch, double eta3);
-
-	// Add the gradient difference value to the gradient array for all features that appear in the current phase.
-	// freeze[0]: Flag that kk does not learn
-	// freeze[1]: Flag that kkp does not learn
-	// freeze[2]: Flag that kpp does not learn
-	// freeze[3]: Flag that kppp does not learn
-	void add_grad(Position& pos, Color rootColor, double delt_grad, const std::array<bool, 4>& freeze);
-
-	// Do SGD or AdaGrad or something based on the current gradient.
-	// epoch: Generation counter (starting from 0)
-	// freeze[0]: Flag that kk does not learn
-	// freeze[1]: Flag that kkp does not learn
-	// freeze[2]: Flag that kpp does not learn
-	// freeze[3]: Flag that kppp does not learn
-	void update_weights(uint64_t epoch, const std::array<bool, 4>& freeze);
-
-	// Save the evaluation function parameters to a file.
-	// You can specify the extension added to the end of the file.
-	void save_eval(std::string suffix);
-
-	// Get the current eta.
-	double get_eta();
-
-	// --learning related commands
-
-	// A function that normalizes KK. Note that it is not completely equivalent to the original evaluation function.
-	// By making the values of kkp and kpp as close to zero as possible, the value of the feature factor (which is zero) that did not appear during learning
-	// The idea of ensuring it is valid.
-	void regularize_kk();
-
-#endif
-
-
-}
-
-#endif  // defined(EVAL_NNUE) || defined(EVAL_LEARN)
-
-#endif // _EVALUATE_KPPT_COMMON_H_
@@ -20,61 +20,25 @@
 #include <cassert>
 #include <cstdlib>
 #include <cstring>   // For std::memset
+#include <fstream>
 #include <iomanip>
 #include <sstream>
 #include <iostream>
-#include <set>
+#include <streambuf>
+#include <vector>
+
+#include "nnue/evaluate_nnue.h"

 #include "bitboard.h"
 #include "evaluate.h"
 #include "material.h"
+#include "misc.h"
 #include "pawns.h"
 #include "thread.h"
 #include "uci.h"
+#include "incbin/incbin.h"

-#ifdef EVAL_LEARN
-namespace Learner
-{
-    extern bool use_raw_nnue_eval;
-}
-#endif
-
-namespace Eval {
-
-  bool useNNUE;
-  std::string eval_file_loaded="None";
-
-  void init_NNUE() {
-
-    useNNUE = Options["Use NNUE"];
-    std::string eval_file = std::string(Options["EvalFile"]);
-    if (useNNUE && eval_file_loaded != eval_file)
-        if (Eval::NNUE::load_eval_file(eval_file))
-            eval_file_loaded = eval_file;
-  }
-
-  void verify_NNUE() {
-
-    std::string eval_file = std::string(Options["EvalFile"]);
-    if (useNNUE && eval_file_loaded != eval_file)
-    {
-        UCI::OptionsMap defaults;
-        UCI::init(defaults);
-
-        sync_cout << "info string ERROR: NNUE evaluation used, but the network file " << eval_file << " was not loaded successfully." << sync_endl;
-        sync_cout << "info string ERROR: The UCI option EvalFile might need to specify the full path, including the directory/folder name, to the file." << sync_endl;
-        sync_cout << "info string ERROR: The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/"+std::string(defaults["EvalFile"]) << sync_endl;
-        sync_cout << "info string ERROR: If the UCI option Use NNUE is set to true, network evaluation parameters compatible with the program must be available." << sync_endl;
-        sync_cout << "info string ERROR: The engine will be terminated now." << sync_endl;
-        std::exit(EXIT_FAILURE);
-    }
-
-    if (useNNUE)
-        sync_cout << "info string NNUE evaluation using " << eval_file << " enabled." << sync_endl;
-    else
-        sync_cout << "info string classical evaluation enabled." << sync_endl;
-  }
-}
+using namespace std;

 namespace Trace {

@@ -120,11 +84,11 @@ using namespace Trace;
 namespace {

  // Threshold for lazy and space evaluation
-  constexpr Value LazyThreshold1 =  Value(1400);
-  constexpr Value LazyThreshold2 =  Value(1300);
-  constexpr Value SpaceThreshold = Value(12222);
-  constexpr Value NNUEThreshold1 =   Value(550);
-  constexpr Value NNUEThreshold2 =   Value(150);
+  constexpr Value LazyThreshold1 =  Value(1565);
+  constexpr Value LazyThreshold2 =  Value(1102);
+  constexpr Value SpaceThreshold = Value(11551);
+  constexpr Value NNUEThreshold1 =   Value(682);
+  constexpr Value NNUEThreshold2 =   Value(176);

  // KingAttackWeights[PieceType] contains king attack weights by piece type
  constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 };
@@ -132,7 +96,7 @@ namespace {
  // SafeCheck[PieceType][single/multiple] contains safe check bonus by piece type,
  // higher if multiple safe checks are possible for that piece type.
  constexpr int SafeCheck[][2] = {
-      {}, {}, {792, 1283}, {645, 967}, {1084, 1897}, {772, 1119}
+      {}, {}, {803, 1292}, {639, 974}, {1087, 1878}, {759, 1132}
  };

 #define S(mg, eg) make_score(mg, eg)
@@ -140,19 +104,25 @@ namespace {
  // MobilityBonus[PieceType-2][attacked] contains bonuses for middle and end game,
  // indexed by piece type and number of attacked squares in the mobility area.
  constexpr Score MobilityBonus[][32] = {
-    { S(-62,-81), S(-53,-56), S(-12,-31), S( -4,-16), S(  3,  5), S( 13, 11), // Knight
-      S( 22, 17), S( 28, 20), S( 33, 25) },
-    { S(-48,-59), S(-20,-23), S( 16, -3), S( 26, 13), S( 38, 24), S( 51, 42), // Bishop
-      S( 55, 54), S( 63, 57), S( 63, 65), S( 68, 73), S( 81, 78), S( 81, 86),
-      S( 91, 88), S( 98, 97) },
-    { S(-60,-78), S(-20,-17), S(  2, 23), S(  3, 39), S(  3, 70), S( 11, 99), // Rook
-      S( 22,103), S( 31,121), S( 40,134), S( 40,139), S( 41,158), S( 48,164),
-      S( 57,168), S( 57,169), S( 62,172) },
-    { S(-30,-48), S(-12,-30), S( -8, -7), S( -9, 19), S( 20, 40), S( 23, 55), // Queen
-      S( 23, 59), S( 35, 75), S( 38, 78), S( 53, 96), S( 64, 96), S( 65,100),
-      S( 65,121), S( 66,127), S( 67,131), S( 67,133), S( 72,136), S( 72,141),
-      S( 77,147), S( 79,150), S( 93,151), S(108,168), S(108,168), S(108,171),
-      S(110,182), S(114,182), S(114,192), S(116,219) }
+    { S(-62,-79), S(-53,-57), S(-12,-31), S( -3,-17), S(  3,  7), S( 12, 13), // Knight
+      S( 21, 16), S( 28, 21), S( 37, 26) },
+    { S(-47,-59), S(-20,-25), S( 14, -8), S( 29, 12), S( 39, 21), S( 53, 40), // Bishop
+      S( 53, 56), S( 60, 58), S( 62, 65), S( 69, 72), S( 78, 78), S( 83, 87),
+      S( 91, 88), S( 96, 98) },
+    { S(-60,-82), S(-24,-15), S(  0, 17), S(  3, 43), S(  4, 72), S( 14,100), // Rook
+      S( 20,102), S( 30,122), S( 41,133), S( 41,139), S( 41,153), S( 45,160),
+      S( 57,165), S( 58,170), S( 67,175) },
+    { S(-29,-49), S(-16,-29), S( -8, -8), S( -8, 17), S( 18, 39), S( 25, 54), // Queen
+      S( 23, 59), S( 37, 73), S( 41, 76), S( 54, 95), S( 65, 95), S( 68,101),
+      S( 69,124), S( 70,128), S( 70,132), S( 70,133), S( 71,136), S( 72,140),
+      S( 74,147), S( 76,149), S( 90,153), S(104,169), S(105,171), S(106,171),
+      S(112,178), S(114,185), S(114,187), S(119,221) }
+  };
+
+  // BishopPawns[distance from edge] contains a file-dependent penalty for pawns on
+  // squares of the same color as our bishop.
+  constexpr Score BishopPawns[int(FILE_NB) / 2] = {
+    S(3, 8), S(3, 9), S(1, 8), S(3, 7)
  };

  // KingProtector[knight/bishop] contains penalty for each distance unit to own king
@@ -160,32 +130,31 @@ namespace {

  // Outpost[knight/bishop] contains bonuses for each knight or bishop occupying a
  // pawn protected square on rank 4 to 6 which is also safe from a pawn attack.
-  constexpr Score Outpost[] = { S(56, 36), S(30, 23) };
+  constexpr Score Outpost[] = { S(56, 34), S(31, 23) };

  // PassedRank[Rank] contains a bonus according to the rank of a passed pawn
  constexpr Score PassedRank[RANK_NB] = {
-    S(0, 0), S(10, 28), S(17, 33), S(15, 41), S(62, 72), S(168, 177), S(276, 260)
+    S(0, 0), S(9, 28), S(15, 31), S(17, 39), S(64, 70), S(171, 177), S(277, 260)
  };

  // RookOnFile[semiopen/open] contains bonuses for each rook when there is
  // no (friendly) pawn on the rook file.
-  constexpr Score RookOnFile[] = { S(19, 7), S(48, 29) };
+  constexpr Score RookOnFile[] = { S(19, 7), S(48, 27) };

  // ThreatByMinor/ByRook[attacked PieceType] contains bonuses according to
  // which piece type attacks which one. Attacks on lesser pieces which are
  // pawn-defended are not considered.
  constexpr Score ThreatByMinor[PIECE_TYPE_NB] = {
-    S(0, 0), S(5, 32), S(57, 41), S(77, 56), S(88, 119), S(79, 161)
+    S(0, 0), S(5, 32), S(55, 41), S(77, 56), S(89, 119), S(79, 162)
  };

  constexpr Score ThreatByRook[PIECE_TYPE_NB] = {
-    S(0, 0), S(3, 46), S(37, 68), S(42, 60), S(0, 38), S(58, 41)
+    S(0, 0), S(3, 44), S(37, 68), S(42, 60), S(0, 39), S(58, 43)
  };

  // Assorted bonuses and penalties
  constexpr Score BadOutpost          = S( -7, 36);
  constexpr Score BishopOnKingRing    = S( 24,  0);
-  constexpr Score BishopPawns         = S(  3,  7);
  constexpr Score BishopXRayPawns     = S(  4,  5);
  constexpr Score CorneredBishop      = S( 50, 50);
  constexpr Score FlankAttacks        = S(  8,  0);
@@ -198,7 +167,6 @@ namespace {
  constexpr Score ReachableOutpost    = S( 31, 22);
  constexpr Score RestrictedPiece     = S(  7,  7);
  constexpr Score RookOnKingRing      = S( 16,  0);
-  constexpr Score RookOnQueenFile     = S(  6, 11);
  constexpr Score SliderOnQueen       = S( 60, 18);
  constexpr Score ThreatByKing        = S( 24, 89);
  constexpr Score ThreatByPawnPush    = S( 48, 39);
@@ -387,7 +355,7 @@ namespace {
                // when the bishop is outside the pawn chain.
                Bitboard blocked = pos.pieces(Us, PAWN) & shift<Down>(pos.pieces());

-                score -= BishopPawns * pos.pawns_on_same_color_squares(Us, s)
+                score -= BishopPawns[edge_distance(file_of(s))] * pos.pawns_on_same_color_squares(Us, s)
                                     * (!(attackedBy[Us][PAWN] & s) + popcount(blocked & CenterFiles));

                // Penalty for all enemy pawns x-rayed
@@ -414,10 +382,6 @@ namespace {

        if (Pt == ROOK)
        {
-            // Bonus for rook on the same file as a queen
-            if (file_bb(s) & pos.pieces(QUEEN))
-                score += RookOnQueenFile;
-
            // Bonus for rook on an open or semi-open file
            if (pos.is_on_semiopen_file(Us, s))
                score += RookOnFile[pos.is_on_semiopen_file(Them, s)];
@@ -515,18 +479,18 @@ namespace {
    int kingFlankAttack  = popcount(b1) + popcount(b2);
    int kingFlankDefense = popcount(b3);

-    kingDanger +=        kingAttackersCount[Them] * kingAttackersWeight[Them]
-                 + 185 * popcount(kingRing[Us] & weak)
-                 + 148 * popcount(unsafeChecks)
-                 +  98 * popcount(pos.blockers_for_king(Us))
-                 +  69 * kingAttacksCount[Them]
-                 +   3 * kingFlankAttack * kingFlankAttack / 8
-                 +       mg_value(mobility[Them] - mobility[Us])
-                 - 873 * !pos.count<QUEEN>(Them)
-                 - 100 * bool(attackedBy[Us][KNIGHT] & attackedBy[Us][KING])
-                 -   6 * mg_value(score) / 8
-                 -   4 * kingFlankDefense
-                 +  37;
+    kingDanger +=        kingAttackersCount[Them] * kingAttackersWeight[Them] // (~10 Elo)
+                 + 185 * popcount(kingRing[Us] & weak)                        // (~15 Elo)
+                 + 148 * popcount(unsafeChecks)                               // (~4 Elo)
+                 +  98 * popcount(pos.blockers_for_king(Us))                  // (~2 Elo)
+                 +  69 * kingAttacksCount[Them]                               // (~0.5 Elo)
+                 +   3 * kingFlankAttack * kingFlankAttack / 8                // (~0.5 Elo)
+                 +       mg_value(mobility[Them] - mobility[Us])              // (~0.5 Elo)
+                 - 873 * !pos.count<QUEEN>(Them)                              // (~24 Elo)
+                 - 100 * bool(attackedBy[Us][KNIGHT] & attackedBy[Us][KING])  // (~5 Elo)
+                 -   6 * mg_value(score) / 8                                  // (~8 Elo)
+                 -   4 * kingFlankDefense                                     // (~5 Elo)
+                 +  37;                                                       // (~0.5 Elo)

    // Transform the kingDanger units into a Score, and subtract it from the evaluation
    if (kingDanger > 100)
@@ -843,7 +807,9 @@ namespace {
            sf = 37 + 3 * (pos.count<QUEEN>(WHITE) == 1 ? pos.count<BISHOP>(BLACK) + pos.count<KNIGHT>(BLACK)
                                                        : pos.count<BISHOP>(WHITE) + pos.count<KNIGHT>(WHITE));
        else
-            sf = std::min(sf, 36 + 7 * pos.count<PAWN>(strongSide));
+            sf = std::min(sf, 36 + 7 * pos.count<PAWN>(strongSide)) - 4 * !pawnsOnBothFlanks;
+
+        sf -= 4 * !pawnsOnBothFlanks;
    }

    // Interpolate between the middlegame and (scaled by 'sf') endgame score
@@ -947,19 +913,47 @@ make_v:
 /// evaluation of the position from the point of view of the side to move.

 Value Eval::evaluate(const Position& pos) {
-#ifdef EVAL_LEARN
-  if (Learner::use_raw_nnue_eval) {
-      return NNUE::evaluate(pos);
+
+  Value v;
+
+  if (NNUE::useNNUE == NNUE::UseNNUEMode::Pure) {
+      v = NNUE::evaluate(pos);
+
+      // Guarantee evaluation does not hit the tablebase range
+      v = std::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1);
+
+      return v;
  }
-#endif
+  else if (NNUE::useNNUE == NNUE::UseNNUEMode::False)
+      v = Evaluation<NO_TRACE>(pos).value();
+  else
+  {
+      // Scale and shift NNUE for compatibility with search and classical evaluation
+      auto  adjusted_NNUE = [&](){
+         int mat = pos.non_pawn_material() + PawnValueMg * pos.count<PAWN>();
+         return NNUE::evaluate(pos) * (720 + mat / 32) / 1024 + Tempo;
+      };

-  bool classical = !Eval::useNNUE
-                ||  abs(eg_value(pos.psq_score())) * 16 > NNUEThreshold1 * (16 + pos.rule50_count());
-  Value v = classical ? Evaluation<NO_TRACE>(pos).value()
-                      : NNUE::evaluate(pos) * 5 / 4 + Tempo;
+      // Use classical eval when the PSQ imbalance is large, and with small probability when it is smaller
+      Value psq = Value(abs(eg_value(pos.psq_score())));
+      int   r50 = 16 + pos.rule50_count();
+      bool  largePsq = psq * 16 > (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50;
+      bool  classical = largePsq || (psq > PawnValueMg / 4 && !(pos.this_thread()->nodes & 0xB));

-  if (classical && Eval::useNNUE && abs(v) * 16 < NNUEThreshold2 * (16 + pos.rule50_count()))
-      v = NNUE::evaluate(pos) * 5 / 4 + Tempo;
+      bool strongClassical = pos.non_pawn_material() < 2 * RookValueMg && pos.count<PAWN>() < 2;
+
+      v = classical || strongClassical ? Evaluation<NO_TRACE>(pos).value() : adjusted_NNUE();
+
+      // If the classical eval is small and imbalance large, use NNUE nevertheless.
+      // For the case of opposite colored bishops, switch to NNUE eval with
+      // small probability if the classical eval is less than the threshold.
+      if (   largePsq && !strongClassical
+          && (   abs(v) * 16 < NNUEThreshold2 * r50
+              || (   pos.opposite_bishops()
+                  && abs(v) * 16 < (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50
+                  && !(pos.this_thread()->nodes & 0xB))))
+          v = adjusted_NNUE();
+  }

  // Damp down the evaluation linearly when shuffling
  v = v * (100 - pos.rule50_count()) / 100;
@@ -1015,7 +1009,7 @@ std::string Eval::trace(const Position& pos) {

  ss << "\nClassical evaluation: " << to_cp(v) << " (white side)\n";

-  if (Eval::useNNUE)
+  if (NNUE::useNNUE != NNUE::UseNNUEMode::False)
  {
      v = NNUE::evaluate(pos);
      v = pos.side_to_move() == WHITE ? v : -v;
@@ -26,23 +26,13 @@
 class Position;

 namespace Eval {
-
  std::string trace(const Position& pos);
  Value evaluate(const Position& pos);

-  extern bool useNNUE;
-  extern std::string eval_file_loaded;
-  void init_NNUE();
-  void verify_NNUE();
-
-  namespace NNUE {
-
-    Value evaluate(const Position& pos);
-    Value compute_eval(const Position& pos);
-    void  update_eval(const Position& pos);
-    bool  load_eval_file(const std::string& evalFile);
-
-  } // namespace NNUE
+  // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
+  // for the build process (profile-build and fishtest) to work. Do not change the
+  // name of the macro, as it is used in the Makefile.
+  #define EvalFileDefaultName   "nn-c3ca321c51c9.nnue"

 } // namespace Eval

@@ -1,429 +0,0 @@
-#if defined (EVAL_LEARN)
-
-#include "../misc.h"
-#include "../position.h"
-
-#include <sstream>
-#include <fstream>
-#include <cstring> // std::memset()
-
-using namespace std;
-
-// -----------------------------------
-// stage compression/decompression
-// -----------------------------------
-
-// Class that handles bitstream
-// useful when doing aspect encoding
-struct BitStream
-{
-  // Set the memory to store the data in advance.
-  // Assume that memory is cleared to 0.
-  void  set_data(uint8_t* data_) { data = data_; reset(); }
-
-  // Get the pointer passed in set_data().
-  uint8_t* get_data() const { return data; }
-
-  // Get the cursor.
-  int get_cursor() const { return bit_cursor; }
-
-  // reset the cursor
-  void reset() { bit_cursor = 0; }
-
-  // Write 1bit to the stream.
-  // If b is non-zero, write out 1. If 0, write 0.
-  void write_one_bit(int b)
-  {
-    if (b)
-      data[bit_cursor / 8] |= 1 << (bit_cursor & 7);
-
-    ++bit_cursor;
-  }
-
-  // Get 1 bit from the stream.
-  int read_one_bit()
-  {
-    int b = (data[bit_cursor / 8] >> (bit_cursor & 7)) & 1;
-    ++bit_cursor;
-
-    return b;
-  }
-
-  // write n bits of data
-  // Data shall be written out from the lower order of d.
-  void write_n_bit(int d, int n)
-  {
-    for (int i = 0; i <n; ++i)
-      write_one_bit(d & (1 << i));
-  }
-
-  // read n bits of data
-  // Reverse conversion of write_n_bit().
-  int read_n_bit(int n)
-  {
-    int result = 0;
-    for (int i = 0; i < n; ++i)
-      result |= read_one_bit() ? (1 << i) : 0;
-
-    return result;
-  }
-
-private:
-  // Next bit position to read/write.
-  int bit_cursor;
-
-  // data entity
-  uint8_t* data;
-};
-
-
-// Huffman coding
-// * is simplified from mini encoding to make conversion easier.
-//
-// 1 box on the board (other than NO_PIECE) = 2 to 6 bits (+ 1-bit flag + 1-bit forward and backward)
-// 1 piece of hand piece = 1-5bit (+ 1-bit flag + 1bit ahead and behind)
-//
-// empty xxxxx0 + 0 (none)
-// step xxxx01 + 2 xxxx0 + 2
-// incense xx0011 + 2 xx001 + 2
-// Katsura xx1011 + 2 xx101 + 2
-// silver xx0111 + 2 xx011 + 2
-// Gold x01111 + 1 x0111 + 1 // Gold is valid and has no flags.
-// corner 011111 + 2 01111 + 2
-// Fly 111111 + 2 11111 + 2
-//
-// Assuming all pieces are on the board,
-// Sky 81-40 pieces = 41 boxes = 41bit
-// Walk 4bit*18 pieces = 72bit
-// Incense 6bit*4 pieces = 24bit
-// Katsura 6bit*4 pieces = 24bit
-// Silver 6bit*4 pieces = 24bit
-// Gold 6bit* 4 pieces = 24bit
-// corner 8bit* 2 pieces = 16bit
-// Fly 8bit* 2 pieces = 16bit
-// -------
-// 241bit + 1bit (turn) + 7bit × 2 (King's position after) = 256bit
-//
-// When the piece on the board moves to the hand piece, the piece on the board becomes empty, so the box on the board can be expressed with 1 bit,
-// Since the hand piece can be expressed by 1 bit less than the piece on the board, the total number of bits does not change in the end.
-// Therefore, in this expression, any aspect can be expressed by this bit number.
-// It is a hand piece and no flag is required, but if you include this, the bit number of the piece on the board will be -1
-// Since the total number of bits can be fixed, we will include this as well.
-
-// Huffman Encoding
-//
-// Empty  xxxxxxx0
-// Pawn   xxxxx001 + 1 bit (Side to move)
-// Knight xxxxx011 + 1 bit (Side to move)
-// Bishop xxxxx101 + 1 bit (Side to move)
-// Rook   xxxxx111 + 1 bit (Side to move)
-
-struct HuffmanedPiece
-{
-  int code; // how it will be coded
-  int bits; // How many bits do you have
-};
-
-HuffmanedPiece huffman_table[] =
-{
-  {0b0000,1}, // NO_PIECE
-  {0b0001,4}, // PAWN
-  {0b0011,4}, // KNIGHT
-  {0b0101,4}, // BISHOP
-  {0b0111,4}, // ROOK
-  {0b1001,4}, // QUEEN
-};
-
-// Class for compressing/decompressing sfen
-// sfen can be packed to 256bit (32bytes) by Huffman coding.
-// This is proven by mini. The above is Huffman coding.
-//
-// Internal format = 1-bit turn + 7-bit king position *2 + piece on board (Huffman coding) + hand piece (Huffman coding)
-// Side to move (White = 0, Black = 1) (1bit)
-// White King Position (6 bits)
-// Black King Position (6 bits)
-// Huffman Encoding of the board
-// Castling availability (1 bit x 4)
-// En passant square (1 or 1 + 6 bits)
-// Rule 50 (6 bits)
-// Game play (8 bits)
-//
-// TODO(someone): Rename SFEN to FEN.
-//
-struct SfenPacker
-{
-  // Pack sfen and store in data[32].
-  void pack(const Position& pos)
-  {
-// cout << pos;
-
-    memset(data, 0, 32 /* 256bit */);
-    stream.set_data(data);
-
-    // turn
-    // Side to move.
-    stream.write_one_bit((int)(pos.side_to_move()));
-
-    // 7-bit positions for leading and trailing balls
-    // White king and black king, 6 bits for each.
-    for(auto c: Colors)
-      stream.write_n_bit(pos.king_square(c), 6);
-
-    // Write the pieces on the board other than the kings.
-    for (Rank r = RANK_8; r >= RANK_1; --r)
-    {
-      for (File f = FILE_A; f <= FILE_H; ++f)
-      {
-        Piece pc = pos.piece_on(make_square(f, r));
-        if (type_of(pc) == KING)
-          continue;
-        write_board_piece_to_stream(pc);
-      }
-    }
-
-    // TODO(someone): Support chess960.
-    stream.write_one_bit(pos.can_castle(WHITE_OO));
-    stream.write_one_bit(pos.can_castle(WHITE_OOO));
-    stream.write_one_bit(pos.can_castle(BLACK_OO));
-    stream.write_one_bit(pos.can_castle(BLACK_OOO));
-
-    if (pos.ep_square() == SQ_NONE) {
-      stream.write_one_bit(0);
-    }
-    else {
-      stream.write_one_bit(1);
-      stream.write_n_bit(static_cast<int>(pos.ep_square()), 6);
-    }
-
-    stream.write_n_bit(pos.state()->rule50, 6);
-
-    stream.write_n_bit(1 + (pos.game_ply()-(pos.side_to_move() == BLACK)) / 2, 8);
-
-    assert(stream.get_cursor() <= 256);
-  }
-
-  // sfen packed by pack() (256bit = 32bytes)
-  // Or sfen to decode with unpack()
-  uint8_t *data; // uint8_t[32];
-
-//private:
-  // Position::set_from_packed_sfen(uint8_t data[32]) I want to use these functions, so the line is bad, but I want to keep it public.
-
-  BitStream stream;
-
-  // Output the board pieces to stream.
-  void write_board_piece_to_stream(Piece pc)
-  {
-    // piece type
-    PieceType pr = type_of(pc);
-    auto c = huffman_table[pr];
-    stream.write_n_bit(c.code, c.bits);
- 
-    if (pc == NO_PIECE)
-      return;
-
-    // first and second flag
-    stream.write_one_bit(color_of(pc));
-  }
-
-  // Read one board piece from stream
-  Piece read_board_piece_from_stream()
-  {
-    PieceType pr = NO_PIECE_TYPE;
-    int code = 0, bits = 0;
-    while (true)
-    {
-      code |= stream.read_one_bit() << bits;
-      ++bits;
-
-      assert(bits <= 6);
-
-      for (pr = NO_PIECE_TYPE; pr <KING; ++pr)
-        if (huffman_table[pr].code == code
-          && huffman_table[pr].bits == bits)
-          goto Found;
-    }
-  Found:;
-    if (pr == NO_PIECE_TYPE)
-      return NO_PIECE;
-
-    // first and second flag
-    Color c = (Color)stream.read_one_bit();
-    
-    return make_piece(c, pr);
-  }
-};
-
-
-// -----------------------------------
-// Add to Position class
-// -----------------------------------
-
-// Add a function that directly unpacks for speed. It's pretty tough.
-// Write it by combining packer::unpack() and Position::set().
-// If there is a problem with the passed phase and there is an error, non-zero is returned.
-int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thread* th, bool mirror)
-{
-	SfenPacker packer;
-	auto& stream = packer.stream;
-	stream.set_data((uint8_t*)&sfen);
-
-	std::memset(this, 0, sizeof(Position));
-	std::memset(si, 0, sizeof(StateInfo));
-  std::fill_n(&pieceList[0][0], sizeof(pieceList) / sizeof(Square), SQ_NONE);
-  st = si;
-
-	// Active color
-	sideToMove = (Color)stream.read_one_bit();
-
-  pieceList[W_KING][0] = SQUARE_NB;
-  pieceList[B_KING][0] = SQUARE_NB;
-
-	// First the position of the ball
-	if (mirror)
-	{
-		for (auto c : Colors)
-			board[flip_file((Square)stream.read_n_bit(6))] = make_piece(c, KING);
-	}
-	else
-	{
-		for (auto c : Colors)
-			board[stream.read_n_bit(6)] = make_piece(c, KING);
-	}
-
-  // Piece placement
-  for (Rank r = RANK_8; r >= RANK_1; --r)
-  {
-    for (File f = FILE_A; f <= FILE_H; ++f)
-    {
-      auto sq = make_square(f, r);
-      if (mirror) {
-        sq = flip_file(sq);
-      }
-
-      // it seems there are already balls
-      Piece pc;
-      if (type_of(board[sq]) != KING)
-      {
-        assert(board[sq] == NO_PIECE);
-        pc = packer.read_board_piece_from_stream();
-      }
-      else
-      {
-        pc = board[sq];
-        board[sq] = NO_PIECE; // put_piece() will catch ASSERT unless you remove it all.
-      }
-
-      // There may be no pieces, so skip in that case.
-      if (pc == NO_PIECE)
-        continue;
-
-      put_piece(Piece(pc), sq);
-
-      //cout << sq << ' ' << board[sq] << ' ' << stream.get_cursor() << endl;
-
-      if (stream.get_cursor()> 256)
-        return 1;
-      //assert(stream.get_cursor() <= 256);
-
-    }
-  }
-
-  // Castling availability.
-  // TODO(someone): Support chess960.
-  st->castlingRights = 0;
-  if (stream.read_one_bit()) {
-    Square rsq;
-    for (rsq = relative_square(WHITE, SQ_H1); piece_on(rsq) != W_ROOK; --rsq) {}
-    set_castling_right(WHITE, rsq);
-  }
-  if (stream.read_one_bit()) {
-    Square rsq;
-    for (rsq = relative_square(WHITE, SQ_A1); piece_on(rsq) != W_ROOK; ++rsq) {}
-    set_castling_right(WHITE, rsq);
-  }
-  if (stream.read_one_bit()) {
-    Square rsq;
-    for (rsq = relative_square(BLACK, SQ_H1); piece_on(rsq) != B_ROOK; --rsq) {}
-    set_castling_right(BLACK, rsq);
-  }
-  if (stream.read_one_bit()) {
-    Square rsq;
-    for (rsq = relative_square(BLACK, SQ_A1); piece_on(rsq) != B_ROOK; ++rsq) {}
-    set_castling_right(BLACK, rsq);
-  }
-
-  // En passant square. Ignore if no pawn capture is possible
-  if (stream.read_one_bit()) {
-    Square ep_square = static_cast<Square>(stream.read_n_bit(6));
-    if (mirror) {
-      ep_square = flip_file(ep_square);
-    }
-    st->epSquare = ep_square;
-
-    if (!(attackers_to(st->epSquare) & pieces(sideToMove, PAWN))
-      || !(pieces(~sideToMove, PAWN) & (st->epSquare + pawn_push(~sideToMove))))
-      st->epSquare = SQ_NONE;
-  }
-  else {
-    st->epSquare = SQ_NONE;
-  }
-
-  // Halfmove clock
-  st->rule50 = static_cast<Square>(stream.read_n_bit(6));
-
-  // Fullmove number
-  gamePly = static_cast<Square>(stream.read_n_bit(8));
-  // Convert from fullmove starting from 1 to gamePly starting from 0,
-  // handle also common incorrect FEN with fullmove = 0.
-  gamePly = std::max(2 * (gamePly - 1), 0) + (sideToMove == BLACK);
-
-  assert(stream.get_cursor() <= 256);
-
-  chess960 = false;
-  thisThread = th;
-set_state(st);
-
-  //std::cout << *this << std::endl;
-
-  assert(pos_is_ok());
-
-	return 0;
-}
-
-// Give the board, hand piece, and turn, and return the sfen.
-//std::string Position::sfen_from_rawdata(Piece board[81], Hand hands[2], Color turn, int gamePly_)
-//{
-// // Copy it to an internal structure and call sfen() if the conversion process depends only on it
-// // Maybe it will be converted normally...
-//  Position pos;
-//
-//  memcpy(pos.board, board, sizeof(Piece) * 81);
-//  memcpy(pos.hand, hands, sizeof(Hand) * 2);
-//  pos.sideToMove = turn;
-//  pos.gamePly = gamePly_;
-//
-//  return pos.sfen();
-//
-// // Implementation of ↑ is beautiful, but slow.
-// // This is a bottleneck when learning a large amount of game records, so write a function to unpack directly.
-//}
-
-// Get the packed sfen. Returns to the buffer specified in the argument.
-void Position::sfen_pack(PackedSfen& sfen)
-{
-  SfenPacker sp;
-  sp.data = (uint8_t*)&sfen;
-  sp.pack(*this);
-}
-
-//// Unpack the packed sfen. Returns an sfen string.
-//std::string Position::sfen_unpack(const PackedSfen& sfen)
-//{
-// SfenPacker sp;
-// sp.data = (uint8_t*)&sfen;
-// return sp.unpack();
-//}
-
-
-#endif // USE_SFEN_PACKER
@@ -0,0 +1,140 @@
+#ifndef _STOCKFISH_BLAS_H_
+#define _STOCKFISH_BLAS_H_
+
+struct ThreadPool;
+
+#if defined (_MSC_VER) // SF_BLAS_RESTRICT: the compiler's spelling of the no-alias pointer qualifier.
+#define SF_BLAS_RESTRICT __restrict
+#elif defined (__INTEL_COMPILER)
+#define SF_BLAS_RESTRICT restrict
+#elif defined (__clang__) || defined (__GNUC__)
+#define SF_BLAS_RESTRICT __restrict__
+#else // Unknown compiler: expand to nothing so the declarations below still parse.
+#define SF_BLAS_RESTRICT
+#endif
+
+namespace Blas {
+
+    enum struct MatrixLayout { // Layout selector passed as the `layout` argument of sgemm.
+        RowMajor = 101, // numerically equal to CBLAS's CblasRowMajor
+        ColMajor = 102  // numerically equal to CBLAS's CblasColMajor
+    };
+
+    enum struct MatrixTranspose { // Per-operand selector passed as TransA / TransB of sgemm.
+        NoTrans = 111, // numerically equal to CBLAS's CblasNoTrans
+        Trans = 112    // numerically equal to CBLAS's CblasTrans
+    };
+
+    void scopy( // NOTE(review): named after BLAS SCOPY (Y := X over N floats); definition not in this header - confirm.
+        const int N,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    );
+    // Overload with explicit increments incX / incY for X and Y.
+    void scopy(
+        const int N,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    );
+    // Overload that additionally takes a ThreadPool (presumably to split the work across threads - confirm).
+    void scopy(
+        ThreadPool& thread_pool,
+        const int N,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    );
+    // ThreadPool overload with explicit increments incX / incY.
+    void scopy(
+        ThreadPool& thread_pool,
+        const int N,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    );
+
+    void sscal( // NOTE(review): named after BLAS SSCAL (X := alpha * X over N floats); definition not in this header - confirm.
+        const int N,
+        const float alpha,
+        float * SF_BLAS_RESTRICT X
+    );
+    // Overload with an explicit increment incX for X.
+    void sscal(
+        const int N,
+        const float alpha,
+        float * SF_BLAS_RESTRICT X, const int incX
+    );
+    // Overload that additionally takes a ThreadPool (presumably to split the work across threads - confirm).
+    void sscal(
+        ThreadPool& thread_pool,
+        const int N,
+        const float alpha,
+        float * SF_BLAS_RESTRICT X
+    );
+    // ThreadPool overload with an explicit increment incX.
+    void sscal(
+        ThreadPool& thread_pool,
+        const int N,
+        const float alpha,
+        float * SF_BLAS_RESTRICT X, const int incX
+    );
+
+    void saxpy( // NOTE(review): named after BLAS SAXPY (Y := alpha * X + Y over N floats); definition not in this header - confirm.
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    );
+    // Overload with explicit increments incX / incY for X and Y.
+    void saxpy(
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    );
+    // Overload that additionally takes a ThreadPool (presumably to split the work across threads - confirm).
+    void saxpy(
+        ThreadPool& thread_pool,
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    );
+    // ThreadPool overload with explicit increments incX / incY.
+    void saxpy(
+        ThreadPool& thread_pool,
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    );
+
+    void sgemm( // NOTE(review): parameters mirror CBLAS cblas_sgemm (C := alpha*op(A)*op(B) + beta*C); definition not visible - confirm.
+        ThreadPool& thread_pool,
+        MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    );
+    // Same parameter list as above, minus the ThreadPool argument.
+    void sgemm(
+        MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    );
+
+    void test(
+        ThreadPool& thread_pool
+    );
+
+    void bench(
+        ThreadPool& thread_pool
+    );
+}
+
+#endif
@@ -0,0 +1,26 @@
+The file "incbin.h" is free and unencumbered software released into
+the public domain by Dale Weiler, see:
+   <https://github.com/graphitemaster/incbin>
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org/>
@@ -0,0 +1,368 @@
+/**
+ * @file incbin.h
+ * @author Dale Weiler
+ * @brief Utility for including binary files
+ *
+ * Facilities for including binary files into the current translation unit and
+ * making use of them externally in other translation units.
+ */
+#ifndef INCBIN_HDR
+#define INCBIN_HDR
+#include <limits.h>
+#if   defined(__AVX512BW__) || \
+      defined(__AVX512CD__) || \
+      defined(__AVX512DQ__) || \
+      defined(__AVX512ER__) || \
+      defined(__AVX512PF__) || \
+      defined(__AVX512VL__) || \
+      defined(__AVX512F__)
+# define INCBIN_ALIGNMENT_INDEX 6
+#elif defined(__AVX__)      || \
+      defined(__AVX2__)
+# define INCBIN_ALIGNMENT_INDEX 5
+#elif defined(__SSE__)      || \
+      defined(__SSE2__)     || \
+      defined(__SSE3__)     || \
+      defined(__SSSE3__)    || \
+      defined(__SSE4_1__)   || \
+      defined(__SSE4_2__)   || \
+      defined(__neon__)
+# define INCBIN_ALIGNMENT_INDEX 4
+#elif ULONG_MAX != 0xffffffffu
+# define INCBIN_ALIGNMENT_INDEX 3
+# else
+# define INCBIN_ALIGNMENT_INDEX 2
+#endif
+
+/* Lookup table of (1 << n) where `n' is `INCBIN_ALIGNMENT_INDEX' */
+#define INCBIN_ALIGN_SHIFT_0 1
+#define INCBIN_ALIGN_SHIFT_1 2
+#define INCBIN_ALIGN_SHIFT_2 4
+#define INCBIN_ALIGN_SHIFT_3 8
+#define INCBIN_ALIGN_SHIFT_4 16
+#define INCBIN_ALIGN_SHIFT_5 32
+#define INCBIN_ALIGN_SHIFT_6 64
+
+/* Actual alignment value */
+#define INCBIN_ALIGNMENT \
+    INCBIN_CONCATENATE( \
+        INCBIN_CONCATENATE(INCBIN_ALIGN_SHIFT, _), \
+        INCBIN_ALIGNMENT_INDEX)
+
+/* Stringize */
+#define INCBIN_STR(X) \
+    #X
+#define INCBIN_STRINGIZE(X) \
+    INCBIN_STR(X)
+/* Concatenate */
+#define INCBIN_CAT(X, Y) \
+    X ## Y
+#define INCBIN_CONCATENATE(X, Y) \
+    INCBIN_CAT(X, Y)
+/* Deferred macro expansion */
+#define INCBIN_EVAL(X) \
+    X
+#define INCBIN_INVOKE(N, ...) \
+    INCBIN_EVAL(N(__VA_ARGS__))
+
+/* Green Hills uses a different directive for including binary data */
+#if defined(__ghs__)
+#  if (__ghs_asm == 2)
+#    define INCBIN_MACRO ".file"
+/* Or consider the ".myrawdata" entry in the ld file */
+#  else
+#    define INCBIN_MACRO "\tINCBIN"
+#  endif
+#else
+#  define INCBIN_MACRO ".incbin"
+#endif
+
+#ifndef _MSC_VER
+#  define INCBIN_ALIGN \
+    __attribute__((aligned(INCBIN_ALIGNMENT)))
+#else
+#  define INCBIN_ALIGN __declspec(align(INCBIN_ALIGNMENT))
+#endif
+
+#if defined(__arm__) || /* GNU C and RealView */ \
+    defined(__arm) || /* Diab */ \
+    defined(_ARM) /* ImageCraft */
+#  define INCBIN_ARM
+#endif
+
+#ifdef __GNUC__
+/* Utilize .balign where supported */
+#  define INCBIN_ALIGN_HOST ".balign " INCBIN_STRINGIZE(INCBIN_ALIGNMENT) "\n"
+#  define INCBIN_ALIGN_BYTE ".balign 1\n"
+#elif defined(INCBIN_ARM)
+/*
+ * On arm assemblers, the alignment value is calculated as (1 << n) where `n' is
+ * the shift count. This is the value passed to `.align'
+ */
+#  define INCBIN_ALIGN_HOST ".align " INCBIN_STRINGIZE(INCBIN_ALIGNMENT_INDEX) "\n"
+#  define INCBIN_ALIGN_BYTE ".align 0\n"
+#else
+/* We assume other inline assemblers treat `.align' as `.balign' */
+#  define INCBIN_ALIGN_HOST ".align " INCBIN_STRINGIZE(INCBIN_ALIGNMENT) "\n"
+#  define INCBIN_ALIGN_BYTE ".align 1\n"
+#endif
+
+/* INCBIN_CONST is used by incbin.c generated files */
+#if defined(__cplusplus)
+#  define INCBIN_EXTERNAL extern "C"
+#  define INCBIN_CONST    extern const
+#else
+#  define INCBIN_EXTERNAL extern
+#  define INCBIN_CONST    const
+#endif
+
+/**
+ * @brief Optionally override the linker section into which data is emitted.
+ *
+ * @warning If you use this facility, you'll have to deal with platform-specific linker output
+ * section naming on your own
+ *
+ * Overriding the default linker output section, e.g for esp8266/Arduino:
+ * @code
+ * #define INCBIN_OUTPUT_SECTION ".irom.text"
+ * #include "incbin.h"
+ * INCBIN(Foo, "foo.txt");
+ * // Data is emitted into program memory that never gets copied to RAM
+ * @endcode
+ */
+#if !defined(INCBIN_OUTPUT_SECTION)
+#  if defined(__APPLE__)
+#    define INCBIN_OUTPUT_SECTION         ".const_data"
+#  else
+#    define INCBIN_OUTPUT_SECTION         ".rodata"
+#  endif
+#endif
+
+#if defined(__APPLE__)
+/* The directives are different for Apple branded compilers */
+#  define INCBIN_SECTION         INCBIN_OUTPUT_SECTION "\n"
+#  define INCBIN_GLOBAL(NAME)    ".globl " INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME "\n"
+#  define INCBIN_INT             ".long "
+#  define INCBIN_MANGLE          "_"
+#  define INCBIN_BYTE            ".byte "
+#  define INCBIN_TYPE(...)
+#else
+#  define INCBIN_SECTION         ".section " INCBIN_OUTPUT_SECTION "\n"
+#  define INCBIN_GLOBAL(NAME)    ".global " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME "\n"
+#  if defined(__ghs__)
+#    define INCBIN_INT           ".word "
+#  else
+#    define INCBIN_INT           ".int "
+#  endif
+#  if defined(__USER_LABEL_PREFIX__)
+#    define INCBIN_MANGLE        INCBIN_STRINGIZE(__USER_LABEL_PREFIX__)
+#  else
+#    define INCBIN_MANGLE        ""
+#  endif
+#  if defined(INCBIN_ARM)
+/* On arm assemblers, `@' is used as a line comment token */
+#    define INCBIN_TYPE(NAME)    ".type " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME ", %object\n"
+#  elif defined(__MINGW32__) || defined(__MINGW64__)
+/* Mingw doesn't support this directive either */
+#    define INCBIN_TYPE(NAME)
+#  else
+/* It's safe to use `@' on other architectures */
+#    define INCBIN_TYPE(NAME)    ".type " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME ", @object\n"
+#  endif
+#  define INCBIN_BYTE            ".byte "
+#endif
+
+/* List of style types used for symbol names */
+#define INCBIN_STYLE_CAMEL 0
+#define INCBIN_STYLE_SNAKE 1
+
+/**
+ * @brief Specify the prefix to use for symbol names.
+ *
+ * By default this is `g', producing symbols of the form:
+ * @code
+ * #include "incbin.h"
+ * INCBIN(Foo, "foo.txt");
+ *
+ * // Now you have the following symbols:
+ * // const unsigned char gFooData[];
+ * // const unsigned char *const gFooEnd;
+ * // const unsigned int gFooSize;
+ * @endcode
+ *
+ * If however you specify a prefix before including: e.g:
+ * @code
+ * #define INCBIN_PREFIX incbin
+ * #include "incbin.h"
+ * INCBIN(Foo, "foo.txt");
+ *
+ * // Now you have the following symbols instead:
+ * // const unsigned char incbinFooData[];
+ * // const unsigned char *const incbinFooEnd;
+ * // const unsigned int incbinFooSize;
+ * @endcode
+ */
+#if !defined(INCBIN_PREFIX)
+#  define INCBIN_PREFIX g
+#endif
+
+/**
+ * @brief Specify the style used for symbol names.
+ *
+ * Possible options are
+ * - INCBIN_STYLE_CAMEL "CamelCase"
+ * - INCBIN_STYLE_SNAKE "snake_case"
+ *
+ * Default option is *INCBIN_STYLE_CAMEL* producing symbols of the form:
+ * @code
+ * #include "incbin.h"
+ * INCBIN(Foo, "foo.txt");
+ *
+ * // Now you have the following symbols:
+ * // const unsigned char <prefix>FooData[];
+ * // const unsigned char *const <prefix>FooEnd;
+ * // const unsigned int <prefix>FooSize;
+ * @endcode
+ *
+ * If however you specify a style before including: e.g:
+ * @code
+ * #define INCBIN_STYLE INCBIN_STYLE_SNAKE
+ * #include "incbin.h"
+ * INCBIN(foo, "foo.txt");
+ *
+ * // Now you have the following symbols:
+ * // const unsigned char <prefix>foo_data[];
+ * // const unsigned char *const <prefix>foo_end;
+ * // const unsigned int <prefix>foo_size;
+ * @endcode
+ */
+#if !defined(INCBIN_STYLE)
+#  define INCBIN_STYLE INCBIN_STYLE_CAMEL
+#endif
+
+/* Style lookup tables */
+#define INCBIN_STYLE_0_DATA Data
+#define INCBIN_STYLE_0_END End
+#define INCBIN_STYLE_0_SIZE Size
+#define INCBIN_STYLE_1_DATA _data
+#define INCBIN_STYLE_1_END _end
+#define INCBIN_STYLE_1_SIZE _size
+
+/* Style lookup: returning identifier */
+#define INCBIN_STYLE_IDENT(TYPE) \
+    INCBIN_CONCATENATE( \
+        INCBIN_STYLE_, \
+        INCBIN_CONCATENATE( \
+            INCBIN_EVAL(INCBIN_STYLE), \
+            INCBIN_CONCATENATE(_, TYPE)))
+
+/* Style lookup: returning string literal */
+#define INCBIN_STYLE_STRING(TYPE) \
+    INCBIN_STRINGIZE( \
+        INCBIN_STYLE_IDENT(TYPE)) \
+
+/* Generate the global labels by indirectly invoking the macro with our style
+ * type and concatenating the name against them. */
+#define INCBIN_GLOBAL_LABELS(NAME, TYPE) \
+    INCBIN_INVOKE( \
+        INCBIN_GLOBAL, \
+        INCBIN_CONCATENATE( \
+            NAME, \
+            INCBIN_INVOKE( \
+                INCBIN_STYLE_IDENT, \
+                TYPE))) \
+    INCBIN_INVOKE( \
+        INCBIN_TYPE, \
+        INCBIN_CONCATENATE( \
+            NAME, \
+            INCBIN_INVOKE( \
+                INCBIN_STYLE_IDENT, \
+                TYPE)))
+
+/**
+ * @brief Externally reference binary data included in another translation unit.
+ *
+ * Produces three external symbols that reference the binary data included in
+ * another translation unit.
+ *
+ * The symbol names are a concatenation of `INCBIN_PREFIX' before *NAME*; with
+ * "Data", as well as "End" and "Size" after. An example is provided below.
+ *
+ * @param NAME The name given for the binary data
+ *
+ * @code
+ * INCBIN_EXTERN(Foo);
+ *
+ * // Now you have the following symbols:
+ * // extern const unsigned char <prefix>FooData[];
+ * // extern const unsigned char *const <prefix>FooEnd;
+ * // extern const unsigned int <prefix>FooSize;
+ * @endcode
+ */
+#define INCBIN_EXTERN(NAME) \
+    INCBIN_EXTERNAL const INCBIN_ALIGN unsigned char \
+        INCBIN_CONCATENATE( \
+            INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
+            INCBIN_STYLE_IDENT(DATA))[]; \
+    INCBIN_EXTERNAL const INCBIN_ALIGN unsigned char *const \
+    INCBIN_CONCATENATE( \
+        INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
+        INCBIN_STYLE_IDENT(END)); \
+    INCBIN_EXTERNAL const unsigned int \
+        INCBIN_CONCATENATE( \
+            INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
+            INCBIN_STYLE_IDENT(SIZE))
+
+/**
+ * @brief Include a binary file into the current translation unit.
+ *
+ * Includes a binary file into the current translation unit, producing three symbols
+ * for objects that encode the data and size respectively.
+ *
+ * The symbol names are a concatenation of `INCBIN_PREFIX' before *NAME*; with
+ * "Data", as well as "End" and "Size" after. An example is provided below.
+ *
+ * @param NAME The name to associate with this binary data (as an identifier.)
+ * @param FILENAME The file to include (as a string literal.)
+ *
+ * @code
+ * INCBIN(Icon, "icon.png");
+ *
+ * // Now you have the following symbols:
+ * // const unsigned char <prefix>IconData[];
+ * // const unsigned char *const <prefix>IconEnd;
+ * // const unsigned int <prefix>IconSize;
+ * @endcode
+ *
+ * @warning This must be used in global scope
+ * @warning The identifiers may be different if INCBIN_STYLE is not default
+ *
+ * To externally reference the data included by this in another translation unit
+ * please @see INCBIN_EXTERN.
+ */
+#ifdef _MSC_VER
+#define INCBIN(NAME, FILENAME) \
+    INCBIN_EXTERN(NAME)
+#else
+#define INCBIN(NAME, FILENAME) \
+    __asm__(INCBIN_SECTION \
+            INCBIN_GLOBAL_LABELS(NAME, DATA) \
+            INCBIN_ALIGN_HOST \
+            INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(DATA) ":\n" \
+            INCBIN_MACRO " \"" FILENAME "\"\n" \
+            INCBIN_GLOBAL_LABELS(NAME, END) \
+            INCBIN_ALIGN_BYTE \
+            INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(END) ":\n" \
+                INCBIN_BYTE "1\n" \
+            INCBIN_GLOBAL_LABELS(NAME, SIZE) \
+            INCBIN_ALIGN_HOST \
+            INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(SIZE) ":\n" \
+                INCBIN_INT INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(END) " - " \
+                           INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(DATA) "\n" \
+            INCBIN_ALIGN_HOST \
+            ".text\n" \
+    ); \
+    INCBIN_EXTERN(NAME)
+
+#endif
+#endif
@@ -0,0 +1,667 @@
+#ifndef LEARNER_AUTOGRAD_H
+#define LEARNER_AUTOGRAD_H
+
+#include <cmath>
+#include <utility>
+#include <type_traits>
+#include <memory>
+#include <tuple>
+#include <optional>
+#include <algorithm>
+#include <cstdint>
+
+namespace Learner
+{
+    template <typename T> // A function value paired with its gradient; both are stored as T.
+    struct ValueWithGrad
+    {
+        T value;
+        T grad;
+        // Adds rhs member-wise (value to value, grad to grad).
+        ValueWithGrad& operator+=(const ValueWithGrad<T>& rhs)
+        {
+            value += rhs.value;
+            grad += rhs.grad;
+            return *this;
+        }
+        // Subtracts rhs member-wise (value from value, grad from grad).
+        ValueWithGrad& operator-=(const ValueWithGrad<T>& rhs)
+        {
+            value -= rhs.value;
+            grad -= rhs.grad;
+            return *this;
+        }
+        // Multiplies both members by the scalar rhs.
+        ValueWithGrad& operator*=(T rhs)
+        {
+            value *= rhs;
+            grad *= rhs;
+            return *this;
+        }
+        // Divides both members by the scalar rhs.
+        ValueWithGrad& operator/=(T rhs)
+        {
+            value /= rhs;
+            grad /= rhs;
+            return *this;
+        }
+        // Returns a copy in which std::abs has been applied to each member separately.
+        [[nodiscard]] ValueWithGrad abs() const
+        {
+            return { std::abs(value), std::abs(grad) };
+        }
+        // Returns a copy with grad limited to [-|max|, |max|]; value is left unchanged.
+        [[nodiscard]] ValueWithGrad clamp_grad(T max) const
+        {
+            return { value, std::clamp(grad, -std::abs(max), std::abs(max)) }; // std::clamp requires lo <= hi
+        }
+    };
+}
+
+namespace Learner::Autograd::UnivariateStatic
+{
+
+    template <typename T> // Type-identity wrapper: Identity<T>::type is T.
+    struct Identity
+    {
+        using type = T;
+    };
+    // Id<T> names T through a nested member type, so T is never deduced from a function parameter of type Id<T>.
+    template <typename T>
+    using Id = typename Identity<T>::type;
+
+    template <typename T> // Chooses the member type used to hold an operand of type T.
+    using StoreValueOrRef = std::conditional_t<
+            std::is_rvalue_reference_v<T>,
+            std::remove_reference_t<T>,       // T is X&&: hold an X by value
+            const std::remove_reference_t<T>& // otherwise: hold a const reference
+        >;
+
+    namespace Detail
+    {
+        using CallIdType = std::uint32_t;
+
+        struct CallId // Strongly-typed wrapper around a CallIdType counter value.
+        {
+            CallIdType call_id{};
+            // Default-constructed id is 0.
+            constexpr CallId() :
+                call_id(0)
+            {
+            }
+            // Implicit conversion from a raw counter value.
+            constexpr CallId(CallIdType id) :
+                call_id(id)
+            {
+            }
+            // Two ids are equal when their raw counter values are equal.
+            [[nodiscard]] bool operator==(CallId rhs) const noexcept
+            {
+                return call_id == rhs.call_id;
+            }
+
+            [[nodiscard]] bool operator!=(CallId rhs) const noexcept
+            {
+                return call_id != rhs.call_id;
+            }
+        };
+
+        [[nodiscard]] inline CallId next_call_id() // Returns the calling thread's current counter value, then increments it.
+        {
+            static thread_local CallIdType s_call_id = 0; // one counter per thread
+            return CallId{ s_call_id++ }; // unsigned, so it wraps around to 0 after the maximum value
+        }
+
+        template <typename T, typename Tuple> // Primary template; only the std::tuple specialization is defined.
+        struct TupleContains;
+        // True when T is exactly the same type as at least one element type of the tuple.
+        template <typename T, typename... Us>
+        struct TupleContains<T, std::tuple<Us...>> : std::disjunction<std::is_same<T, Us>...> {};
+        // Variable-template shorthand for TupleContains<T, Tuple>::value.
+        template <typename T, typename Tuple>
+        constexpr bool TupleContainsV = TupleContains<T, Tuple>::value;
+        // True when every Ts (after removing references) has is_constant == true; also true for an empty pack.
+        template <typename... Ts>
+        constexpr bool AreAllConstantV = (std::remove_reference_t<Ts>::is_constant && ...);
+    }
+
+    template <typename T, typename ChildT> // CRTP base: ChildT supplies calculate_value / calculate_grad and is_constant.
+    struct Evaluable
+    {
+        constexpr Evaluable() = default;
+
+        // We append a unique call id so that we can invalidate the cache when
+        // the next computation starts. A single evaluation should see
+        // the same call_id at every node.
+        template <typename... ArgsTs>
+        [[nodiscard]] auto eval(const std::tuple<ArgsTs...>& args) const
+        {
+            const auto call_id = Detail::next_call_id();
+            const auto new_args = std::tuple_cat(args, std::tuple(call_id));
+            return ValueWithGrad<T>{ value(new_args), grad(new_args) };
+        }
+        // Value for an argument tuple that already carries a CallId; recomputed only when the id differs from the cached one.
+        template <typename... ArgsTs,
+            typename SFINAE = std::enable_if_t<Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
+        [[nodiscard]] auto value(const std::tuple<ArgsTs...>& args) const
+        {
+            const ChildT* this_ = static_cast<const ChildT*>(this);
+            // The CallId element is looked up by type, not by position.
+            const auto call_id = std::get<Detail::CallId>(args);
+            if (!value_cache.has_value() || value_cache_call_id != call_id)
+            {
+                value_cache_call_id = call_id;
+                value_cache = this_->calculate_value(args);
+            }
+
+            return *value_cache;
+        }
+        // Value for a tuple without a CallId: appends a fresh id, then defers to the overload above.
+        template <typename... ArgsTs,
+            typename SFINAE = std::enable_if_t<!Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
+        [[nodiscard]] auto value(const std::tuple<ArgsTs...>& args, ...) const // the ellipsis keeps this signature distinct
+        {
+            const auto call_id = Detail::next_call_id();
+            const auto new_args = std::tuple_cat(args, std::tuple(call_id));
+            return value(new_args);
+        }
+        // Gradient for a tuple that already carries a CallId; cached per call id like value().
+        template <typename... ArgsTs,
+            typename SFINAE = std::enable_if_t<Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
+        [[nodiscard]] auto grad(const std::tuple<ArgsTs...>& args) const
+        {
+            if constexpr (ChildT::is_constant)
+            {
+                return T(0.0); // constant nodes never call calculate_grad and never touch the cache
+            }
+            else
+            {
+                const ChildT* this_ = static_cast<const ChildT*>(this);
+
+                const auto call_id = std::get<Detail::CallId>(args);
+                if (!grad_cache.has_value() || grad_cache_call_id != call_id)
+                {
+                    grad_cache_call_id = call_id;
+                    grad_cache = this_->calculate_grad(args);
+                }
+
+                return *grad_cache;
+            }
+        }
+        // Gradient for a tuple without a CallId: appends a fresh id, then defers to the overload above.
+        template <typename... ArgsTs,
+            typename SFINAE = std::enable_if_t<!Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
+        [[nodiscard]] auto grad(const std::tuple<ArgsTs...>& args, ...) const // the ellipsis keeps this signature distinct
+        {
+            const auto call_id = Detail::next_call_id();
+            const auto new_args = std::tuple_cat(args, std::tuple(call_id));
+            return grad(new_args);
+        }
+        // Caches are mutable because they are written from the const value() / grad() members.
+    private:
+        mutable std::optional<T> value_cache;
+        mutable std::optional<T> grad_cache;
+        mutable Detail::CallId value_cache_call_id{};
+        mutable Detail::CallId grad_cache_call_id{};
+    };
+
+    template <typename T, int I> // Leaf node reading element I of the argument tuple; its gradient is always 1.
+    struct VariableParameter : Evaluable<T, VariableParameter<T, I>>
+    {
+        using ValueType = T;
+
+        static constexpr bool is_constant = false;
+
+        constexpr VariableParameter()
+        {
+        }
+        // Returns element I of args, converted to T.
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
+        {
+            return std::get<I>(args);
+        }
+        // Always T(1.0), independent of the arguments.
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
+        {
+            return T(1.0);
+        }
+    };
+
+    template <typename T, int I> // Leaf node reading element I of the argument tuple; flagged constant, so its gradient is 0.
+    struct ConstantParameter : Evaluable<T, ConstantParameter<T, I>>
+    {
+        using ValueType = T;
+
+        static constexpr bool is_constant = true;
+
+        constexpr ConstantParameter()
+        {
+        }
+        // Returns element I of args, converted to T.
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
+        {
+            return std::get<I>(args);
+        }
+        // Always T(0.0); Evaluable::grad already returns zero for is_constant nodes without calling this.
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
+        {
+            return T(0.0);
+        }
+    };
+
+    template <typename T> // Leaf node that owns a copy of a fixed value; its gradient is 0.
+    struct Constant : Evaluable<T, Constant<T>>
+    {
+        using ValueType = T;
+
+        static constexpr bool is_constant = true;
+        // Not explicit: a plain T converts implicitly to Constant<T>.
+        constexpr Constant(T x) :
+            m_x(std::move(x))
+        {
+        }
+        // Returns the stored value; the argument tuple is ignored.
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>&) const
+        {
+            return m_x;
+        }
+        // Always T(0.0).
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
+        {
+            return T(0.0);
+        }
+
+    private:
+        T m_x;
+    };
+
+    // The "constant" may change between executions, but is assumed to be
+    // constant during a single evaluation.
+    template <typename T> // Leaf node that reads its value through a stored reference; its gradient is 0.
+    struct ConstantRef : Evaluable<T, ConstantRef<T>>
+    {
+        using ValueType = T;
+
+        static constexpr bool is_constant = true;
+        // Only the reference is kept, so the referenced object must outlive this node.
+        constexpr ConstantRef(const T& x) :
+            m_x(x)
+        {
+        }
+        // Returns a copy of the referenced object's current value; the argument tuple is ignored.
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>&) const
+        {
+            return m_x;
+        }
+        // Always T(0.0).
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
+        {
+            return T(0.0);
+        }
+
+    private:
+        const T& m_x;
+    };
+
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    struct Sum : Evaluable<T, Sum<LhsT, RhsT, T>>
+    {
+        using ValueType = T;
+        // Constant only when both operands are constant.
+        static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
+
+        constexpr Sum(LhsT&& lhs, RhsT&& rhs) :
+            m_lhs(std::forward<LhsT>(lhs)),
+            m_rhs(std::forward<RhsT>(rhs))
+        {
+        }
+        // lhs + rhs
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_lhs.value(args) + m_rhs.value(args);
+        }
+        // Sum rule: lhs' + rhs'
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_lhs.grad(args) + m_rhs.grad(args);
+        }
+        // Each operand is held by value or by const reference, as selected by StoreValueOrRef.
+    private:
+        StoreValueOrRef<LhsT> m_lhs;
+        StoreValueOrRef<RhsT> m_rhs;
+    };
+
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator+(LhsT&& lhs, RhsT&& rhs)
+    { // Expression + expression; only participates when LhsT has a nested ValueType.
+        return Sum<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
+    }
+    // Expression + plain value: the value is wrapped in a temporary Constant<T>.
+    template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator+(LhsT&& lhs, Id<T> rhs)
+    {
+        return Sum<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
+    }
+    // Plain value + expression: the value is wrapped in a temporary Constant<T>.
+    template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator+(Id<T> lhs, RhsT&& rhs)
+    {
+        return Sum<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
+    }
+
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    struct Difference : Evaluable<T, Difference<LhsT, RhsT, T>>
+    {
+        using ValueType = T;
+        // Constant only when both operands are constant.
+        static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
+
+        constexpr Difference(LhsT&& lhs, RhsT&& rhs) :
+            m_lhs(std::forward<LhsT>(lhs)),
+            m_rhs(std::forward<RhsT>(rhs))
+        {
+        }
+        // lhs - rhs
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_lhs.value(args) - m_rhs.value(args);
+        }
+        // lhs' - rhs'
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_lhs.grad(args) - m_rhs.grad(args);
+        }
+        // Each operand is held by value or by const reference, as selected by StoreValueOrRef.
+    private:
+        StoreValueOrRef<LhsT> m_lhs;
+        StoreValueOrRef<RhsT> m_rhs;
+    };
+
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator-(LhsT&& lhs, RhsT&& rhs)
+    { // Expression - expression; only participates when LhsT has a nested ValueType.
+        return Difference<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
+    }
+    // Expression - plain value: the value is wrapped in a temporary Constant<T>.
+    template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator-(LhsT&& lhs, Id<T> rhs)
+    {
+        return Difference<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
+    }
+    // Plain value - expression: the value is wrapped in a temporary Constant<T>.
+    template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator-(Id<T> lhs, RhsT&& rhs)
+    {
+        return Difference<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
+    }
+
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    struct Product : Evaluable<T, Product<LhsT, RhsT, T>>
+    {
+        using ValueType = T;
+        // Constant only when both operands are constant.
+        static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
+
+        constexpr Product(LhsT&& lhs, RhsT&& rhs) :
+            m_lhs(std::forward<LhsT>(lhs)),
+            m_rhs(std::forward<RhsT>(rhs))
+        {
+        }
+        // lhs * rhs
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_lhs.value(args) * m_rhs.value(args);
+        }
+        // Product rule: lhs' * rhs + lhs * rhs'
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_lhs.grad(args) * m_rhs.value(args) + m_lhs.value(args) * m_rhs.grad(args);
+        }
+        // Each operand is held by value or by const reference, as selected by StoreValueOrRef.
+    private:
+        StoreValueOrRef<LhsT> m_lhs;
+        StoreValueOrRef<RhsT> m_rhs;
+    };
+
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator*(LhsT&& lhs, RhsT&& rhs)
+    { // Expression * expression; only participates when LhsT has a nested ValueType.
+        return Product<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
+    }
+    // Expression * plain value: the value is wrapped in a temporary Constant<T>.
+    template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator*(LhsT&& lhs, Id<T> rhs)
+    {
+        return Product<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
+    }
+    // Plain value * expression: the value is wrapped in a temporary Constant<T>.
+    template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator*(Id<T> lhs, RhsT&& rhs)
+    {
+        return Product<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
+    }
+
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    struct Quotient : Evaluable<T, Quotient<LhsT, RhsT, T>>
+    {
+        using ValueType = T;
+        // Constant only when both operands are constant.
+        static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
+
+        constexpr Quotient(LhsT&& lhs, RhsT&& rhs) :
+            m_lhs(std::forward<LhsT>(lhs)),
+            m_rhs(std::forward<RhsT>(rhs))
+        {
+        }
+        // lhs / rhs; no check is made for a zero denominator.
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_lhs.value(args) / m_rhs.value(args);
+        }
+        // Quotient rule: (lhs' * rhs - lhs * rhs') / rhs^2
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        {
+            auto g = m_rhs.value(args); // denominator value, reused for the g * g term
+            return (m_lhs.grad(args) * g - m_lhs.value(args) * m_rhs.grad(args)) / (g * g);
+        }
+        // Each operand is held by value or by const reference, as selected by StoreValueOrRef.
+    private:
+        StoreValueOrRef<LhsT> m_lhs;
+        StoreValueOrRef<RhsT> m_rhs;
+    };
+
+    // Division of two expressions: both operands are perfectly forwarded into a Quotient node.
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator/(LhsT&& lhs, RhsT&& rhs)
+    {
+        using NodeT = Quotient<LhsT&&, RhsT&&>;
+        return NodeT(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
+    }
+
+    // Division of an expression by a plain value: the value is wrapped in a Constant node.
+    template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator/(LhsT&& lhs, Id<T> rhs)
+    {
+        using NodeT = Quotient<LhsT&&, Constant<T>&&>;
+        return NodeT(std::forward<LhsT>(lhs), Constant(rhs));
+    }
+
+    // Division of a plain value by an expression: the value is wrapped in a Constant node.
+    template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator/(Id<T> lhs, RhsT&& rhs)
+    {
+        using NodeT = Quotient<Constant<T>&&, RhsT&&>;
+        return NodeT(Constant(lhs), std::forward<RhsT>(rhs));
+    }
+
+    // Expression node for the arithmetic negation -x of a sub-expression.
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    struct Negation : Evaluable<T, Negation<ArgT, T>>
+    {
+        using ValueType = T;
+
+        static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
+
+        constexpr explicit Negation(ArgT&& operand) :
+            m_x(std::forward<ArgT>(operand))
+        {
+        }
+
+        // Negated value of the operand.
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
+        {
+            auto inner = m_x.value(args);
+            return -inner;
+        }
+
+        // d(-x) = -dx.
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        {
+            auto inner_grad = m_x.grad(args);
+            return -inner_grad;
+        }
+
+    private:
+        StoreValueOrRef<ArgT> m_x;
+    };
+
+    // Unary minus: forwards the operand into a Negation node.
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    [[nodiscard]] constexpr auto operator-(ArgT&& x)
+    {
+        using NodeT = Negation<ArgT&&>;
+        return NodeT(std::forward<ArgT>(x));
+    }
+
+    // Expression node for the logistic sigmoid 1 / (1 + exp(-x)) of a sub-expression.
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    struct Sigmoid : Evaluable<T, Sigmoid<ArgT, T>>
+    {
+        using ValueType = T;
+
+        static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
+
+        constexpr explicit Sigmoid(ArgT&& x) :
+            m_x(std::forward<ArgT>(x))
+        {
+        }
+
+        // sigmoid(x) evaluated at the value of the operand.
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
+        {
+            return value_(m_x.value(args));
+        }
+
+        // Chain rule: x' * sigmoid'(x).
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_x.grad(args) * grad_(m_x.value(args));
+        }
+
+    private:
+        StoreValueOrRef<ArgT> m_x;
+
+        [[nodiscard]] T value_(T x) const
+        {
+            return 1.0 / (1.0 + std::exp(-x));
+        }
+
+        // sigmoid'(x) = s * (1 - s) with s = sigmoid(x).
+        // s is computed once so that std::exp is not evaluated twice per gradient.
+        [[nodiscard]] T grad_(T x) const
+        {
+            const T s = value_(x);
+            return s * (1.0 - s);
+        }
+    };
+
+    // Factory: forwards the operand into a Sigmoid node.
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    [[nodiscard]] constexpr auto sigmoid(ArgT&& x)
+    {
+        using NodeT = Sigmoid<ArgT&&>;
+        return NodeT(std::forward<ArgT>(x));
+    }
+
+    // Expression node for x raised to a fixed exponent that is stored by value.
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    struct Pow : Evaluable<T, Pow<ArgT, T>>
+    {
+        using ValueType = T;
+
+        static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
+
+        constexpr explicit Pow(ArgT&& base, Id<T> exponent) :
+            m_x(std::forward<ArgT>(base)),
+            m_exponent(std::move(exponent))
+        {
+        }
+
+        // x^e evaluated at the value of the operand.
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
+        {
+            return std::pow(m_x.value(args), m_exponent);
+        }
+
+        // Power rule combined with the chain rule: e * x^(e - 1) * x'.
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        {
+            const T reduced_exponent = m_exponent - T(1.0);
+            return m_exponent * std::pow(m_x.value(args), reduced_exponent) * m_x.grad(args);
+        }
+
+    private:
+        StoreValueOrRef<ArgT> m_x;
+        T m_exponent;
+    };
+
+    // Factory: forwards the operand and the exponent into a Pow node.
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    [[nodiscard]] constexpr auto pow(ArgT&& x, Id<T> exp)
+    {
+        using NodeT = Pow<ArgT&&>;
+        return NodeT(std::forward<ArgT>(x), std::move(exp));
+    }
+
+    // Expression node for the natural logarithm (std::log) of a sub-expression.
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    struct Log : Evaluable<T, Log<ArgT, T>>
+    {
+        using ValueType = T;
+
+        static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
+
+        constexpr explicit Log(ArgT&& operand) :
+            m_x(std::forward<ArgT>(operand))
+        {
+        }
+
+        // log(x) evaluated at the value of the operand.
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
+        {
+            return value_(m_x.value(args));
+        }
+
+        // Chain rule: x' * (1 / x).
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        {
+            const T outer = grad_(m_x.value(args));
+            return m_x.grad(args) * outer;
+        }
+
+    private:
+        StoreValueOrRef<ArgT> m_x;
+
+        T value_(T v) const
+        {
+            return std::log(v);
+        }
+
+        T grad_(T v) const
+        {
+            return 1.0 / v;
+        }
+    };
+
+    // Factory: forwards the operand into a Log node.
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    [[nodiscard]] constexpr auto log(ArgT&& x)
+    {
+        using NodeT = Log<ArgT&&>;
+        return NodeT(std::forward<ArgT>(x));
+    }
+
+}
+
+#endif
@@ -0,0 +1,815 @@
+#include "convert.h"
+
+#include "uci.h"
+#include "misc.h"
+#include "thread.h"
+#include "position.h"
+#include "tt.h"
+
+#include "extra/nnue_data_binpack_format.h"
+
+#include "nnue/evaluate_nnue.h"
+
+#include "syzygy/tbprobe.h"
+
+#include <sstream>
+#include <fstream>
+#include <unordered_set>
+#include <iomanip>
+#include <list>
+#include <cmath>    // std::exp(),std::pow(),std::log()
+#include <cstring>  // memcpy()
+#include <memory>
+#include <limits>
+#include <optional>
+#include <chrono>
+#include <random>
+#include <regex>
+#include <filesystem>
+#include <algorithm> // std::find_if(), std::count(), std::min(), std::max()
+#include <cctype>    // std::isspace()
+#include <stdexcept> // std::logic_error
+
+using namespace std;
+
+namespace Learner
+{
+    // Returns true when the "piece placement" field (the first whitespace-delimited
+    // field) of input_fen equals the one of pos.fen(). No other FEN field is compared.
+    bool fen_is_ok(Position& pos, std::string input_fen) {
+        // example : "2r4r/4kpp1/nb1np3/p2p3p/B2P1BP1/PP6/4NPKP/2R1R3 w - h6 0 24"
+        //       --> "2r4r/4kpp1/nb1np3/p2p3p/B2P1BP1/PP6/4NPKP/2R1R3"
+        const auto piece_placement = [](const std::string& fen) {
+            std::istringstream fields(fen);
+            std::string placement;
+            fields >> placement;
+            return placement;
+        };
+
+        const std::string from_position = piece_placement(pos.fen());
+        const std::string from_input = piece_placement(input_fen);
+        return from_input == from_position;
+    }
+
+    // Converts plain-text training records into binary PackedSfenValue records.
+    //
+    // Every input file is read line by line. The first token of a line selects the
+    // field ("fen", "move", "score", "ply", "result"); a line whose first token is
+    // "e" ends the record, which is then appended to the output file unless one of
+    // the filters rejected it.
+    //
+    // filenames             : plain-text input files
+    // output_file_name      : binary output file (opened in append mode)
+    // ply_minimum/maximum   : records with a ply outside [ply_minimum, ply_maximum] are dropped
+    // interpolate_eval      : when non-zero, "ply" replaces the score with
+    //                         min(3000, interpolate_eval * ply) and "result" multiplies it by the result
+    // src_score_*_value     : range the input score is normalized from
+    // dest_score_*_value    : range the normalized score is scaled to
+    // check_invalid_fen     : drop records whose piece placement differs from the position set up from the FEN
+    // check_illegal_move    : drop records whose move is parsed as MOVE_NONE
+    //
+    // The score range bounds are doubles: the caller parses them as doubles, and
+    // integer parameters would silently truncate fractional bounds.
+    void convert_bin(
+        const vector<string>& filenames,
+        const string& output_file_name,
+        const int ply_minimum,
+        const int ply_maximum,
+        const int interpolate_eval,
+        const double src_score_min_value,
+        const double src_score_max_value,
+        const double dest_score_min_value,
+        const double dest_score_max_value,
+        const bool check_invalid_fen,
+        const bool check_illegal_move)
+    {
+        std::cout << "check_invalid_fen=" << check_invalid_fen << std::endl;
+        std::cout << "check_illegal_move=" << check_illegal_move << std::endl;
+
+        auto th = Threads.main();
+        auto& tpos = th->rootPos;
+
+        // Convert plain text records to packed sfen values; output is appended.
+        std::fstream fs;
+        fs.open(output_file_name, ios::app | ios::binary);
+        if (!fs) {
+            std::cerr << "Failed to open output file " << output_file_name << std::endl;
+            return;
+        }
+
+        // Scores are clamped while still in floating point, because converting an
+        // out-of-range double to an integer type is undefined behaviour.
+        const double score_limit = static_cast<double>(VALUE_MATE);
+
+        StateListPtr states;
+        for (const auto& filename : filenames) {
+            std::cout << "convert " << filename << " ... ";
+
+            ifstream ifs;
+            ifs.open(filename);
+
+            // Per-file statistics.
+            uint64_t data_size = 0;
+            uint64_t filtered_size = 0;
+            uint64_t filtered_size_fen = 0;
+            uint64_t filtered_size_move = 0;
+            uint64_t filtered_size_ply = 0;
+
+            // Zero the record so that fields (and padding) never set by the input
+            // are not written to disk uninitialized.
+            PackedSfenValue p;
+            std::memset(&p, 0, sizeof(PackedSfenValue));
+            p.gamePly = 1; // Not included in apery format. Should be initialized
+
+            bool ignore_flag_fen = false;
+            bool ignore_flag_move = false;
+            bool ignore_flag_ply = false;
+
+            std::string line;
+            while (std::getline(ifs, line)) {
+                std::stringstream ss(line);
+                std::string token;
+                std::string value;
+                ss >> token;
+                if (token == "fen") {
+                    // Everything after the "fen" token is the FEN. Reading it from the
+                    // stream (instead of line.substr(4)) cannot throw on a bare "fen" line.
+                    std::string input_fen;
+                    std::getline(ss >> std::ws, input_fen);
+
+                    if (input_fen.empty()) {
+                        // No FEN at all: the record cannot be converted.
+                        ignore_flag_fen = true;
+                        filtered_size_fen++;
+                    }
+                    else {
+                        states = StateListPtr(new std::deque<StateInfo>(1)); // Drop old and create a new one
+                        tpos.set(input_fen, false, &states->back(), th);
+                        if (check_invalid_fen && !fen_is_ok(tpos, input_fen)) {
+                            ignore_flag_fen = true;
+                            filtered_size_fen++;
+                        }
+                        else {
+                            tpos.sfen_pack(p.sfen);
+                        }
+                    }
+                }
+                else if (token == "move") {
+                    ss >> value;
+                    Move move = UCI::to_move(tpos, value);
+                    if (check_illegal_move && move == MOVE_NONE) {
+                        ignore_flag_move = true;
+                        filtered_size_move++;
+                    }
+                    else {
+                        p.move = move;
+                    }
+                }
+                else if (token == "score") {
+                    double score;
+                    ss >> score;
+                    // Training Formula ?Issue #71 ?nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
+                    // Normalize to [0.0, 1.0].
+                    score = (score - src_score_min_value) / (src_score_max_value - src_score_min_value);
+                    // Scale to [dest_score_min_value, dest_score_max_value].
+                    score = score * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
+                    const double bounded = std::max(-score_limit, std::min(score_limit, std::round(score)));
+                    p.score = static_cast<int32_t>(bounded);
+                }
+                else if (token == "ply") {
+                    int temp;
+                    ss >> temp;
+                    if (temp < ply_minimum || temp > ply_maximum) {
+                        ignore_flag_ply = true;
+                        filtered_size_ply++;
+                    }
+                    p.gamePly = uint16_t(temp);
+                    if (interpolate_eval != 0) {
+                        p.score = min(3000, interpolate_eval * temp);
+                    }
+                }
+                else if (token == "result") {
+                    int temp;
+                    ss >> temp;
+                    p.game_result = int8_t(temp);
+                    if (interpolate_eval) {
+                        p.score = p.score * p.game_result;
+                    }
+                }
+                else if (token == "e") {
+                    // End of record: write it unless a filter rejected it.
+                    if (!(ignore_flag_fen || ignore_flag_move || ignore_flag_ply)) {
+                        fs.write(reinterpret_cast<const char*>(&p), sizeof(PackedSfenValue));
+                        data_size += 1;
+                    }
+                    else {
+                        filtered_size++;
+                    }
+                    ignore_flag_fen = false;
+                    ignore_flag_move = false;
+                    ignore_flag_ply = false;
+                }
+            }
+            std::cout << "done " << data_size << " parsed " << filtered_size << " is filtered"
+                << " (invalid fen:" << filtered_size_fen << ", illegal move:" << filtered_size_move << ", invalid ply:" << filtered_size_ply << ")" << std::endl;
+            ifs.close();
+        }
+        std::cout << "all done" << std::endl;
+        fs.close();
+    }
+
+    // Removes leading whitespace (as classified by std::isspace) from s in place.
+    static inline void ltrim(std::string& s) {
+        // The character is taken as unsigned char: passing a negative value
+        // (a non-ASCII char where char is signed) to std::isspace is undefined behaviour.
+        const auto first_kept = std::find_if(s.begin(), s.end(), [](unsigned char ch) {
+            return !std::isspace(ch);
+        });
+        s.erase(s.begin(), first_kept);
+    }
+
+    // Removes trailing whitespace (as classified by std::isspace) from s in place.
+    static inline void rtrim(std::string& s) {
+        // The character is taken as unsigned char: passing a negative value
+        // (a non-ASCII char where char is signed) to std::isspace is undefined behaviour.
+        const auto last_kept = std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) {
+            return !std::isspace(ch);
+        });
+        // base() points one past the last kept character.
+        s.erase(last_kept.base(), s.end());
+    }
+
+    // Strips whitespace from both ends of s in place.
+    static inline void trim(std::string& s) {
+        rtrim(s);
+        ltrim(s);
+    }
+
+    // Maps the quoted value of a PGN Result tag to a game result:
+    //   "1-0" (white win) -> 1, "0-1" (black win) -> -1, anything else -> 0 (draw).
+    // The surrounding double quotes are part of the expected input.
+    int parse_game_result_from_pgn_extract(std::string result) {
+        if (result == "\"1-0\"")
+            return 1;
+
+        if (result == "\"0-1\"")
+            return -1;
+
+        return 0;
+    }
+
+    // Parses an evaluation string taken from a PGN comment.
+    //
+    // 0.25 -->  0.25 * PawnValueEg
+    // #-4  --> -mate_in(4)
+    // #3   -->  mate_in(3)
+    // -M4  --> -mate_in(4)
+    // +M3  -->  mate_in(3)
+    //
+    // success is set to true when eval was parsed and to false otherwise; on
+    // failure VALUE_ZERO is returned. Malformed mate scores (e.g. "#" or "+Mx")
+    // and empty strings are reported as failures instead of throwing or being
+    // read as 0.0.
+    Value parse_score_from_pgn_extract(std::string eval, bool& success) {
+        success = true;
+
+        // Detect a mate score: its sign and where the move count starts.
+        int mate_sign = 0;
+        std::size_t count_pos = 0;
+        if (!eval.empty() && eval[0] == '#') {
+            const bool negative = eval.size() > 1 && eval[1] == '-';
+            mate_sign = negative ? -1 : 1;
+            count_pos = negative ? 2 : 1;
+        }
+        else if (eval.compare(0, 2, "-M") == 0) {
+            mate_sign = -1;
+            count_pos = 2;
+        }
+        else if (eval.compare(0, 2, "+M") == 0) {
+            mate_sign = 1;
+            count_pos = 2;
+        }
+
+        if (mate_sign != 0) {
+            try {
+                const Value mate = mate_in(stoi(eval.substr(count_pos)));
+                return mate_sign > 0 ? mate : -mate;
+            }
+            catch (const std::logic_error&) {
+                // std::stoi throws std::invalid_argument / std::out_of_range
+                // (both derive from std::logic_error) on a bad move count.
+                success = false;
+                return VALUE_ZERO;
+            }
+        }
+
+        // Plain score in pawns.
+        char* endptr = nullptr;
+        const double value = strtod(eval.c_str(), &endptr);
+
+        // Reject input with no number at all (endptr did not advance) as well
+        // as input with trailing garbage.
+        if (endptr == eval.c_str() || *endptr != '\0') {
+            success = false;
+            return VALUE_ZERO;
+        }
+
+        return Value(value * static_cast<double>(PawnValueEg));
+    }
+
+    // for Debug
+    //#define DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT
+
+    // Heuristic FEN detector: a string "looks like" a FEN when it contains
+    // exactly 5 spaces and exactly 7 slashes. No further validation is done.
+    bool is_like_fen(std::string fen) {
+        int spaces = 0;
+        int slashes = 0;
+
+        for (const char c : fen) {
+            if (c == ' ')
+                ++spaces;
+            else if (c == '/')
+                ++slashes;
+        }
+
+        return spaces == 5 && slashes == 7;
+    }
+
+    // Converts PGN text whose move text carries "{ fen } move { eval ... }" comment
+    // groups into binary PackedSfenValue records written to output_file_name.
+    //
+    // Lines starting with '[' are tag lines; a "[Result ...]" tag sets the result
+    // used for the following move-text line. Every other non-empty line is scanned
+    // for repeated groups of: a braced FEN, the move that follows it, and a braced
+    // comment that may hold an evaluation.
+    //
+    // filenames                          : input PGN files
+    // output_file_name                   : binary output file (truncated on open)
+    // pgn_eval_side_to_move              : when false, the score is negated for positions with black to move
+    // convert_no_eval_fens_as_score_zero : when true, positions without a parsable eval are written with score 0
+    void convert_bin_from_pgn_extract(
+        const vector<string>& filenames,
+        const string& output_file_name,
+        const bool pgn_eval_side_to_move,
+        const bool convert_no_eval_fens_as_score_zero)
+    {
+        std::cout << "pgn_eval_side_to_move=" << pgn_eval_side_to_move << std::endl;
+        std::cout << "convert_no_eval_fens_as_score_zero=" << convert_no_eval_fens_as_score_zero << std::endl;
+
+        auto th = Threads.main();
+        auto& pos = th->rootPos;
+
+        std::fstream ofs;
+        ofs.open(output_file_name, ios::out | ios::binary);
+
+        // Constructing a std::regex is expensive, so every pattern is compiled once
+        // here instead of once per line / per move.
+        const std::regex pattern_result(R"(\[Result (.+?)\])");
+        const std::regex pattern_bracket(R"(\{(.+?)\})");
+        const std::regex pattern_move(R"(\}(.+?)\{)");
+        const std::regex pattern_eval1(R"(\[\%eval (.+?)\])");
+        const std::regex pattern_eval2(R"((.+?)\/)");
+
+        // pos.set() is handed a pointer to this state and pos is still used afterwards
+        // (UCI::to_move, side_to_move), so the state must stay alive for the whole
+        // conversion rather than only for the block that calls pos.set().
+        StateInfo si;
+
+        int game_count = 0;
+        int fen_count = 0;
+
+        for (const auto& filename : filenames) {
+            std::cout << now_string() << " convert " << filename << std::endl;
+            ifstream ifs;
+            ifs.open(filename);
+
+            int game_result = 0;
+
+            std::string line;
+            while (std::getline(ifs, line)) {
+
+                if (line.empty()) {
+                    continue;
+                }
+
+                else if (line.substr(0, 1) == "[") {
+                    std::smatch match;
+
+                    // example: [Result "1-0"]
+                    if (std::regex_search(line, match, pattern_result)) {
+                        game_result = parse_game_result_from_pgn_extract(match.str(1));
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+                        std::cout << "game_result=" << game_result << std::endl;
+#endif
+                        game_count++;
+                        if (game_count % 10000 == 0) {
+                            std::cout << now_string() << " game_count=" << game_count << ", fen_count=" << fen_count << std::endl;
+                        }
+                    }
+
+                    continue;
+                }
+
+                else {
+                    int gamePly = 1;
+                    auto itr = line.cbegin();
+
+                    while (true) {
+                        gamePly++;
+
+                        PackedSfenValue psv;
+                        memset((char*)&psv, 0, sizeof(PackedSfenValue));
+
+                        // fen: skip braced comments until one looks like a FEN
+                        {
+                            bool fen_found = false;
+
+                            while (!fen_found) {
+                                std::smatch match;
+                                if (!std::regex_search(itr, line.cend(), match, pattern_bracket)) {
+                                    break;
+                                }
+
+                                // Stop on the closing brace so that the move pattern can start from it.
+                                itr += match.position(0) + match.length(0) - 1;
+                                std::string str_fen = match.str(1);
+                                trim(str_fen);
+
+                                if (is_like_fen(str_fen)) {
+                                    fen_found = true;
+
+                                    pos.set(str_fen, false, &si, th);
+                                    pos.sfen_pack(psv.sfen);
+                                }
+
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+                                std::cout << "str_fen=" << str_fen << std::endl;
+                                std::cout << "fen_found=" << fen_found << std::endl;
+#endif
+                            }
+
+                            if (!fen_found) {
+                                break;
+                            }
+                        }
+
+                        // move: the text between the FEN's closing brace and the next opening brace
+                        {
+                            std::smatch match;
+                            if (!std::regex_search(itr, line.cend(), match, pattern_move)) {
+                                break;
+                            }
+
+                            itr += match.position(0) + match.length(0) - 1;
+                            std::string str_move = match.str(1);
+                            trim(str_move);
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+                            std::cout << "str_move=" << str_move << std::endl;
+#endif
+                            psv.move = UCI::to_move(pos, str_move);
+                        }
+
+                        // eval
+                        bool eval_found = false;
+                        {
+                            std::smatch match;
+                            if (!std::regex_search(itr, line.cend(), match, pattern_bracket)) {
+                                break;
+                            }
+
+                            std::string str_eval_clk = match.str(1);
+                            trim(str_eval_clk);
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+                            std::cout << "str_eval_clk=" << str_eval_clk << std::endl;
+#endif
+
+                            // example: { [%eval 0.25] [%clk 0:10:00] }
+                            // example: { [%eval #-4] [%clk 0:10:00] }
+                            // example: { [%eval #3] [%clk 0:10:00] }
+                            // example: { +0.71/22 1.2s }
+                            // example: { -M4/7 0.003s }
+                            // example: { M3/245 0.017s }
+                            // example: { +M1/245 0.010s, White mates }
+                            // example: { 0.60 }
+                            // example: { book }
+                            // example: { rnbqkb1r/pp3ppp/2p1pn2/3p4/2PP4/2N2N2/PP2PPPP/R1BQKB1R w KQkq - 0 5 }
+
+                            // Considering the absence of eval: if the comment is already the next
+                            // FEN it is not consumed, so the next iteration picks it up.
+                            if (!is_like_fen(str_eval_clk)) {
+                                itr += match.position(0) + match.length(0) - 1;
+
+                                if (str_eval_clk != "book") {
+                                    std::string str_eval;
+                                    if (std::regex_search(str_eval_clk, match, pattern_eval1) ||
+                                        std::regex_search(str_eval_clk, match, pattern_eval2)) {
+                                        str_eval = match.str(1);
+                                        trim(str_eval);
+                                    }
+                                    else {
+                                        str_eval = str_eval_clk;
+                                    }
+
+                                    bool success = false;
+                                    Value value = parse_score_from_pgn_extract(str_eval, success);
+                                    if (success) {
+                                        eval_found = true;
+                                        psv.score = Math::clamp(value, -VALUE_MATE, VALUE_MATE);
+                                    }
+
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+                                    std::cout << "str_eval=" << str_eval << std::endl;
+                                    std::cout << "success=" << success << ", psv.score=" << psv.score << std::endl;
+#endif
+                                }
+                            }
+                        }
+
+                        // write
+                        if (eval_found || convert_no_eval_fens_as_score_zero) {
+                            if (!eval_found && convert_no_eval_fens_as_score_zero) {
+                                psv.score = 0;
+                            }
+
+                            psv.gamePly = gamePly;
+                            psv.game_result = game_result;
+
+                            // For black to move the result is always negated; the score is
+                            // negated only when pgn_eval_side_to_move is false.
+                            if (pos.side_to_move() == BLACK) {
+                                if (!pgn_eval_side_to_move) {
+                                    psv.score *= -1;
+                                }
+                                psv.game_result *= -1;
+                            }
+
+                            ofs.write((char*)&psv, sizeof(PackedSfenValue));
+
+                            fen_count++;
+                        }
+                    }
+
+                    game_result = 0;
+                }
+            }
+        }
+
+        std::cout << now_string() << " game_count=" << game_count << ", fen_count=" << fen_count << std::endl;
+        std::cout << now_string() << " all done" << std::endl;
+        ofs.close();
+    }
+
+    // Dumps binary PackedSfenValue files as plain text.
+    // For every record read from the input files, the lines "fen", "move", "score",
+    // "ply", "result" and a terminating "e" are appended to output_file_name.
+    void convert_plain(
+        const vector<string>& filenames,
+        const string& output_file_name)
+    {
+        Position tpos;
+        std::ofstream ofs(output_file_name, ios::app);
+        auto th = Threads.main();
+
+        for (auto filename : filenames) {
+            std::cout << "convert " << filename << " ... ";
+
+            // Just convert packedsfenvalue to text
+            std::fstream fs(filename, ios::in | ios::binary);
+            PackedSfenValue record;
+
+            // Stop at the first read that does not deliver a complete record.
+            while (fs.read((char*)&record, sizeof(PackedSfenValue))) {
+                StateInfo si;
+                tpos.set_from_packed_sfen(record.sfen, &si, th);
+
+                // write as plain text
+                ofs << "fen " << tpos.fen() << std::endl;
+                ofs << "move " << UCI::move(Move(record.move), false) << std::endl;
+                ofs << "score " << record.score << std::endl;
+                ofs << "ply " << int(record.gamePly) << std::endl;
+                ofs << "result " << int(record.game_result) << std::endl;
+                ofs << "e" << std::endl;
+            }
+
+            fs.close();
+            std::cout << "done" << std::endl;
+        }
+
+        ofs.close();
+        std::cout << "all done" << std::endl;
+    }
+
+    // File-name suffixes used to recognize the data format of a conversion's input and output paths.
+    static inline const std::string plain_extension = ".plain";
+    static inline const std::string bin_extension = ".bin";
+    static inline const std::string binpack_extension = ".binpack";
+
+    // Returns true when the file can be opened for reading.
+    static bool file_exists(const std::string& name)
+    {
+        return std::ifstream(name).good();
+    }
+
+    // Returns true when lhs ends with the suffix end (an empty suffix always matches).
+    static bool ends_with(const std::string& lhs, const std::string& end)
+    {
+        const auto suffix_len = end.size();
+        const auto total_len = lhs.size();
+
+        return total_len >= suffix_len
+            && lhs.compare(total_len - suffix_len, suffix_len, end) == 0;
+    }
+
+    // Returns true when input_path ends with expected_input_extension and
+    // output_path ends with expected_output_extension.
+    static bool is_convert_of_type(
+        const std::string& input_path,
+        const std::string& output_path,
+        const std::string& expected_input_extension,
+        const std::string& expected_output_extension)
+    {
+        if (!ends_with(input_path, expected_input_extension))
+            return false;
+
+        return ends_with(output_path, expected_output_extension);
+    }
+
+    // Signature shared by the binpack:: conversion routines selected below.
+    using ConvertFunctionType = void(std::string inputPath, std::string outputPath, std::ios_base::openmode om, bool validate);
+
+    // Selects the conversion routine from the extensions of the two paths.
+    // Returns nullptr when the pair of extensions is not a supported conversion.
+    static ConvertFunctionType* get_convert_function(const std::string& input_path, const std::string& output_path)
+    {
+        struct Route
+        {
+            const std::string& from_extension;
+            const std::string& to_extension;
+            ConvertFunctionType* converter;
+        };
+
+        // Checked in order; the first matching pair wins.
+        const Route routes[] = {
+            { plain_extension,   bin_extension,     binpack::convertPlainToBin },
+            { plain_extension,   binpack_extension, binpack::convertPlainToBinpack },
+            { bin_extension,     plain_extension,   binpack::convertBinToPlain },
+            { bin_extension,     binpack_extension, binpack::convertBinToBinpack },
+            { binpack_extension, plain_extension,   binpack::convertBinpackToPlain },
+            { binpack_extension, bin_extension,     binpack::convertBinpackToBin },
+        };
+
+        for (const Route& route : routes)
+        {
+            if (is_convert_of_type(input_path, output_path, route.from_extension, route.to_extension))
+                return route.converter;
+        }
+
+        return nullptr;
+    }
+
+    // Converts input_path into output_path with the routine matching their extensions.
+    // Prints an error to std::cerr and does nothing when the input file cannot be
+    // opened or when the pair of extensions is not supported.
+    static void convert(const std::string& input_path, const std::string& output_path, std::ios_base::openmode om, bool validate)
+    {
+        if (!file_exists(input_path))
+        {
+            std::cerr << "Input file does not exist.\n";
+            return;
+        }
+
+        ConvertFunctionType* const converter = get_convert_function(input_path, output_path);
+        if (converter == nullptr)
+        {
+            std::cerr << "Conversion between files of these types is not supported.\n";
+            return;
+        }
+
+        converter(input_path, output_path, om, validate);
+    }
+
+    // Runs a conversion described by already-tokenized arguments:
+    //   args[0] = from_path, args[1] = to_path, followed by the optional flags
+    //   "append" (append to the output instead of truncating it) and "validate".
+    static void convert(const std::vector<std::string>& args)
+    {
+        const auto arg_count = args.size();
+        if (!(arg_count >= 2 && arg_count <= 4))
+        {
+            std::cerr << "Invalid arguments.\n";
+            std::cerr << "Usage: convert from_path to_path [append] [validate]\n";
+            return;
+        }
+
+        // Flags may appear in any order after the two paths.
+        const auto has_flag = [&args](const char* flag) {
+            return std::find(args.begin() + 2, args.end(), flag) != args.end();
+        };
+
+        const bool append = has_flag("append");
+        const bool validate = has_flag("validate");
+
+        std::ios_base::openmode openmode = std::ios_base::trunc;
+        if (append)
+            openmode = std::ios_base::app;
+
+        convert(args[0], args[1], openmode, validate);
+    }
+
+    // Splits the rest of the command stream into whitespace-separated tokens
+    // and runs the conversion they describe.
+    void convert(istringstream& is)
+    {
+        std::vector<std::string> args;
+
+        // Extraction fails (ending the loop) once no further token is available.
+        for (std::string token; is >> token; )
+            args.push_back(token);
+
+        convert(args);
+    }
+
+    // Appends every regular file found directly inside base_dir/target_dir to
+    // filenames. The appended entries are target_dir combined with the file name,
+    // i.e. they do not include base_dir.
+    static void append_files_from_dir(
+        std::vector<std::string>& filenames,
+        const std::string& base_dir,
+        const std::string& target_dir)
+    {
+        namespace sys = std::filesystem;
+
+        const string kif_base_dir = Path::combine(base_dir, target_dir);
+        const sys::path root(kif_base_dir); // Origin of enumeration
+
+        for (const sys::directory_entry& entry : sys::directory_iterator(root))
+        {
+            const sys::path& path = entry.path();
+            if (!sys::is_regular_file(path))
+                continue;
+
+            filenames.push_back(Path::combine(target_dir, path.filename().generic_string()));
+        }
+    }
+
+    // Replaces every entry of filenames by Path::combine(base_dir, entry).
+    static void rebase_files(
+        std::vector<std::string>& filenames,
+        const std::string& base_dir)
+    {
+        for (std::size_t i = 0; i < filenames.size(); ++i)
+            filenames[i] = Path::combine(base_dir, filenames[i]);
+    }
+
+    // Command entry point: reads "option value" pairs from the stream, collects the
+    // input files and runs the PGN to binary conversion.
+    //
+    // Options: targetdir, targetfile (repeatable), basedir, pgn_eval_side_to_move,
+    // convert_no_eval_fens_as_score_zero, output_file_name. Unknown options are
+    // reported and ignored.
+    void convert_bin_from_pgn_extract(std::istringstream& is)
+    {
+        std::vector<std::string> filenames;
+        string base_dir;
+        string target_dir;
+        string output_file_name = "shuffled_sfen.bin";
+        bool pgn_eval_side_to_move = false;
+        bool convert_no_eval_fens_as_score_zero = false;
+
+        // Parsing ends as soon as no further option name can be read.
+        for (string option; is >> option; )
+        {
+            if (option == "targetdir")
+            {
+                is >> target_dir;
+            }
+            else if (option == "targetfile")
+            {
+                filenames.emplace_back();
+                is >> filenames.back();
+            }
+            else if (option == "basedir")
+            {
+                is >> base_dir;
+            }
+            else if (option == "pgn_eval_side_to_move")
+            {
+                is >> pgn_eval_side_to_move;
+            }
+            else if (option == "convert_no_eval_fens_as_score_zero")
+            {
+                is >> convert_no_eval_fens_as_score_zero;
+            }
+            else if (option == "output_file_name")
+            {
+                is >> output_file_name;
+            }
+            else
+            {
+                cout << "Unknown option: " << option << ". Ignoring.\n";
+            }
+        }
+
+        // Files found in targetdir are added first, then every entry is prefixed with basedir.
+        if (!target_dir.empty())
+            append_files_from_dir(filenames, base_dir, target_dir);
+        rebase_files(filenames, base_dir);
+
+        Eval::NNUE::init();
+
+        cout << "convert_bin_from_pgn-extract.." << endl;
+        convert_bin_from_pgn_extract(
+            filenames,
+            output_file_name,
+            pgn_eval_side_to_move,
+            convert_no_eval_fens_as_score_zero);
+    }
+
+    // Command entry point: reads "option value" pairs from the stream, collects the
+    // input files and runs the plain-text to binary conversion.
+    //
+    // Options: targetdir, targetfile (repeatable), basedir, ply_minimum, ply_maximum,
+    // interpolate_eval, check_invalid_fen, check_illegal_move, src_score_min_value,
+    // src_score_max_value, dest_score_min_value, dest_score_max_value,
+    // output_file_name. Unknown options are reported and ignored.
+    void convert_bin(std::istringstream& is)
+    {
+        std::vector<std::string> filenames;
+
+        string base_dir;
+        string target_dir;
+
+        int ply_minimum = 0;
+        int ply_maximum = 114514;
+        // Integer multiplier applied to the ply by the conversion. It must be read as
+        // an int: extracting a value other than 0/1 into a bool fails and puts the
+        // stream into a failed state, which would end option parsing prematurely.
+        int interpolate_eval = 0;
+        bool check_invalid_fen = false;
+        bool check_illegal_move = false;
+
+        // These two options are accepted (their value is consumed so that parsing
+        // stays aligned) but are not passed on to the conversion.
+        bool pgn_eval_side_to_move = false;
+        bool convert_no_eval_fens_as_score_zero = false;
+
+        double src_score_min_value = 0.0;
+        double src_score_max_value = 1.0;
+        double dest_score_min_value = 0.0;
+        double dest_score_max_value = 1.0;
+
+        string output_file_name = "shuffled_sfen.bin";
+
+        while (true)
+        {
+            string option;
+            is >> option;
+
+            if (option == "")
+                break;
+
+            if (option == "targetdir") is >> target_dir;
+            else if (option == "targetfile")
+            {
+                std::string filename;
+                is >> filename;
+                filenames.push_back(filename);
+            }
+
+            else if (option == "basedir")   is >> base_dir;
+
+            else if (option == "ply_minimum") is >> ply_minimum;
+            else if (option == "ply_maximum") is >> ply_maximum;
+            else if (option == "interpolate_eval") is >> interpolate_eval;
+            else if (option == "check_invalid_fen") is >> check_invalid_fen;
+            else if (option == "check_illegal_move") is >> check_illegal_move;
+            else if (option == "pgn_eval_side_to_move") is >> pgn_eval_side_to_move;
+            else if (option == "convert_no_eval_fens_as_score_zero") is >> convert_no_eval_fens_as_score_zero;
+            else if (option == "src_score_min_value") is >> src_score_min_value;
+            else if (option == "src_score_max_value") is >> src_score_max_value;
+            else if (option == "dest_score_min_value") is >> dest_score_min_value;
+            else if (option == "dest_score_max_value") is >> dest_score_max_value;
+            else if (option == "output_file_name") is >> output_file_name;
+            else
+            {
+                cout << "Unknown option: " << option << ". Ignoring.\n";
+            }
+        }
+
+        // Files found in targetdir are added first, then every entry is prefixed with basedir.
+        if (!target_dir.empty())
+        {
+            append_files_from_dir(filenames, base_dir, target_dir);
+        }
+        rebase_files(filenames, base_dir);
+
+        Eval::NNUE::init();
+
+        cout << "convert_bin.." << endl;
+        convert_bin(
+            filenames,
+            output_file_name,
+            ply_minimum,
+            ply_maximum,
+            interpolate_eval,
+            src_score_min_value,
+            src_score_max_value,
+            dest_score_min_value,
+            dest_score_max_value,
+            check_invalid_fen,
+            check_illegal_move
+        );
+    }
+
+    // Command entry point: reads "option value" pairs from the stream, collects the
+    // input files and runs the binary to plain-text conversion.
+    //
+    // Options: targetdir, targetfile (repeatable), basedir, output_file_name.
+    // Unknown options are reported and ignored.
+    void convert_plain(std::istringstream& is)
+    {
+        std::vector<std::string> filenames;
+        string base_dir;
+        string target_dir;
+        string output_file_name = "shuffled_sfen.bin";
+
+        // Parsing ends as soon as no further option name can be read.
+        for (string option; is >> option; )
+        {
+            if (option == "targetdir")
+            {
+                is >> target_dir;
+            }
+            else if (option == "targetfile")
+            {
+                filenames.emplace_back();
+                is >> filenames.back();
+            }
+            else if (option == "basedir")
+            {
+                is >> base_dir;
+            }
+            else if (option == "output_file_name")
+            {
+                is >> output_file_name;
+            }
+            else
+            {
+                cout << "Unknown option: " << option << ". Ignoring.\n";
+            }
+        }
+
+        // Files found in targetdir are added first, then every entry is prefixed with basedir.
+        if (!target_dir.empty())
+            append_files_from_dir(filenames, base_dir, target_dir);
+        rebase_files(filenames, base_dir);
+
+        Eval::NNUE::init();
+
+        cout << "convert_plain.." << endl;
+        convert_plain(filenames, output_file_name);
+    }
+}
@@ -0,0 +1,18 @@
+// Declarations of the training-data conversion commands.
+//
+// The include guard deliberately avoids a leading underscore followed by an
+// uppercase letter: such identifiers are reserved for the implementation.
+#ifndef LEARNER_CONVERT_H_INCLUDED
+#define LEARNER_CONVERT_H_INCLUDED
+
+#include <vector>
+#include <string>
+#include <sstream>
+
+namespace Learner {
+    // Each function receives the remaining text of the command as a stream.
+    // convert_bin and convert_plain parse it as whitespace-separated
+    // `name value` options; the other two presumably do the same (their
+    // definitions are the place to confirm the accepted option names).
+
+    void convert(std::istringstream& is);
+
+    void convert_bin_from_pgn_extract(std::istringstream& is);
+
+    void convert_bin(std::istringstream& is);
+
+    void convert_plain(std::istringstream& is);
+}
+
+#endif // LEARNER_CONVERT_H_INCLUDED
@@ -0,0 +1,962 @@
+#include "gensfen.h"
+
+#include "sfen_writer.h"
+#include "packed_sfen.h"
+#include "opening_book.h"
+
+#include "misc.h"
+#include "position.h"
+#include "thread.h"
+#include "tt.h"
+#include "uci.h"
+
+#include "extra/nnue_data_binpack_format.h"
+
+#include "nnue/evaluate_nnue.h"
+#include "nnue/evaluate_nnue_learner.h"
+
+#include "syzygy/tbprobe.h"
+
+#include <atomic>
+#include <chrono>
+#include <climits>
+#include <cmath>
+#include <cstring>
+#include <filesystem>
+#include <fstream>
+#include <iomanip>
+#include <limits>
+#include <list>
+#include <memory>
+#include <optional>
+#include <random>
+#include <shared_mutex>
+#include <sstream>
+#include <unordered_set>
+
+using namespace std;
+
+namespace Learner
+{
+    // Generates training positions (sfens) by self-play, spreading the
+    // games over the worker threads of `Threads`.
+    struct Gensfen
+    {
+        // Settings of one generation run. gensfen() fills them from the
+        // command text; anything not given keeps the default below.
+        struct Params
+        {
+            // Min and max depths for search during gensfen.
+            // A max below the min is raised to the min by enforce_constraints().
+            int search_depth_min = 3;
+            int search_depth_max = -1;
+
+            // Number of the nodes to be searched.
+            // 0 represents no limits.
+            uint64_t nodes = 0;
+
+            // Upper limit of evaluation value of generated situation
+            int eval_limit = 3000;
+
+            // random_move_minply / random_move_maxply: first and last ply
+            // (1-based) at which a random move may be played. A minply of -1
+            // instead plays random moves on every ply until
+            // random_move_count of them have been made.
+            // random_move_count: number of random moves in one game.
+            int random_move_minply = 1;
+            int random_move_maxply = 24;
+            int random_move_count = 5;
+
+            // Move kings with a probability of 1/N when randomly moving like Apery software.
+            // When the king was moved this way, there is a 1/2 chance that
+            // the ply right after it is flagged for a random move as well.
+            // Apery has N=2. Specifying 0 here disables this function.
+            int random_move_like_apery = 0;
+
+            // For when using multi pv instead of random move.
+            // random_multi_pv is the number of candidates for MultiPV
+            // (0 selects plain random moves instead).
+            // A candidate is eligible only while its score is within
+            // random_multi_pv_diff of the score of the best move.
+            // random_multi_pv_depth is the search depth for MultiPV.
+            int random_multi_pv = 0;
+            int random_multi_pv_diff = 32000;
+            int random_multi_pv_depth = -1;
+
+            // The minimum and maximum ply (number of steps from
+            // the initial phase) of the sfens to write out.
+            int write_minply = 16;
+            int write_maxply = 400;
+
+            // Handed to SfenWriter; enforce_constraints() raises it to at
+            // least REPORT_STATS_EVERY.
+            uint64_t save_every = std::numeric_limits<uint64_t>::max();
+
+            std::string output_file_name = "generated_kifu";
+
+            SfenOutputType sfen_format = SfenOutputType::Binpack;
+
+            // Seed text for the PRNG.
+            std::string seed;
+
+            bool write_out_draw_game_in_training_data_generation = true;
+            bool detect_draw_by_consecutive_low_score = true;
+            bool detect_draw_by_insufficient_mating_material = true;
+
+            // When set, a position with a non-empty quiescence PV is replaced
+            // by the position at the end of that PV before being recorded.
+            bool ensure_quiet = false;
+
+            // Number of worker threads. enforce_constraints() overwrites it
+            // with the "Threads" option; the initializer only guarantees that
+            // the Gensfen constructor never reads an indeterminate value.
+            uint64_t num_threads = 1;
+
+            // Opening book to draw start positions from; when empty every
+            // game starts from StartFEN.
+            std::string book;
+
+            // Brings the settings into a consistent state. Meant to be
+            // called after all options were parsed.
+            void enforce_constraints()
+            {
+                search_depth_max = std::max(search_depth_min, search_depth_max);
+                random_multi_pv_depth = std::max(search_depth_min, random_multi_pv_depth);
+
+                // Limit the maximum to a one-stop score. (Otherwise you might not end the loop)
+                eval_limit = std::min(eval_limit, (int)mate_in(2));
+
+                save_every = std::max(save_every, REPORT_STATS_EVERY);
+
+                num_threads = Options["Threads"];
+            }
+        };
+
+        // Hash to limit the export of identical sfens
+        static constexpr uint64_t GENSFEN_HASH_SIZE = 64 * 1024 * 1024;
+        // It must be 2**N because it will be used as the mask to calculate hash_index.
+        static_assert((GENSFEN_HASH_SIZE & (GENSFEN_HASH_SIZE - 1)) == 0);
+
+        // Progress cadence in sfens: a dot, and a full statistics line.
+        static constexpr uint64_t REPORT_DOT_EVERY = 5000;
+        static constexpr uint64_t REPORT_STATS_EVERY = 200000;
+        static_assert(REPORT_STATS_EVERY % REPORT_DOT_EVERY == 0);
+
+        // Sets up the PRNG, the writer and the duplicate table, and tries
+        // to open the opening book when one is configured.
+        Gensfen(
+            const Params& prm
+        ) :
+            params(prm),
+            prng(prm.seed),
+            sfen_writer(prm.output_file_name, prm.num_threads, prm.save_every, prm.sfen_format)
+        {
+            hash.resize(GENSFEN_HASH_SIZE);
+
+            if (!prm.book.empty())
+            {
+                opening_book = open_opening_book(prm.book, prng);
+                if (opening_book == nullptr)
+                {
+                    std::cout << "WARNING: Failed to open opening book " << prm.book << ". Falling back to startpos.\n";
+                }
+            }
+
+            // Output the seed so the user can verify it is not identical by chance.
+            std::cout << prng << std::endl;
+        }
+
+        // Generates `limit` sfens and returns when all workers are done.
+        void generate(uint64_t limit);
+
+    private:
+        Params params;
+
+        PRNG prng;
+
+        // Guards the progress reporting state (`last_stats_report_time`, `out`).
+        std::mutex stats_mutex;
+        // 0 means that no progress has been reported yet in this run.
+        TimePoint last_stats_report_time = 0;
+
+        // sfen exporter
+        SfenWriter sfen_writer;
+
+        // Output region the progress dots and statistics are written to.
+        SynchronizedRegionLogger::Region out;
+
+        vector<Key> hash; // 64M entries * sizeof(Key) = 512MB
+
+        // Null when no book is configured or it could not be opened.
+        std::unique_ptr<OpeningBook> opening_book;
+
+        static void set_gensfen_search_limits();
+
+        void generate_worker(
+            Thread& th,
+            std::atomic<uint64_t>& counter,
+            uint64_t limit);
+
+        bool was_seen_before(const Position& pos);
+
+        optional<int8_t> get_current_game_result(
+            Position& pos,
+            const vector<int>& move_hist_scores) const;
+
+        vector<uint8_t> generate_random_move_flags();
+
+        optional<Move> choose_random_move(
+            Position& pos,
+            std::vector<uint8_t>& random_move_flag,
+            int ply,
+            int& random_move_c);
+
+        bool commit_psv(
+            Thread& th,
+            PSVector& sfens,
+            int8_t result,
+            std::atomic<uint64_t>& counter,
+            uint64_t limit,
+            Color result_color);
+
+        void report(uint64_t done, uint64_t new_done);
+
+        void maybe_report(uint64_t done);
+    };
+
+    // Configures Search::Limits for data generation.
+    // Search::Limits is a single global object that every search thread
+    // reads, so generate() calls this once before the workers are started.
+    void Gensfen::set_gensfen_search_limits()
+    {
+        // Fields not assigned below keep whatever value they already had.
+
+        // No global depth or node cap: the depth is passed with every
+        // Search::search() call, and a global node limit would be compared
+        // against the nodes accumulated by each thread.
+        Search::Limits.depth = 0;
+        Search::Limits.nodes = 0;
+
+        // Same as "go infinite", which keeps time management out of the way.
+        Search::Limits.infinite = true;
+
+        // The PV output would only be noise here, so suppress it.
+        Search::Limits.silent = true;
+    }
+
+    // Runs a complete generation job of `limit` sfens: starts the workers,
+    // waits for them, flushes the writer and prints the closing statistics.
+    void Gensfen::generate(uint64_t limit)
+    {
+        // Zero marks "nothing reported yet"; maybe_report() starts the clock.
+        last_stats_report_time = 0;
+        set_gensfen_search_limits();
+
+        // Number of sfens claimed so far, shared by all workers.
+        std::atomic<uint64_t> claimed{0};
+        Threads.execute_with_workers([this, limit, &claimed](Thread& worker) {
+            generate_worker(worker, claimed, limit);
+        });
+        Threads.wait_for_workers_finished();
+        sfen_writer.flush();
+
+        // Report the tail that did not fill a complete statistics interval.
+        if (const uint64_t remainder = limit % REPORT_STATS_EVERY; remainder != 0)
+            report(limit, remainder);
+
+        std::cout << std::endl;
+    }
+
+    void Gensfen::generate_worker(
+        Thread& th,
+        std::atomic<uint64_t>& counter,
+        uint64_t limit)
+    {
+        // Worker body: plays self-play games on thread `th` and collects their
+        // positions until commit_psv() reports that `limit` sfens were reached.
+        // `states` holds one StateInfo per played ply plus MAX_PLY of headroom.
+        std::vector<StateInfo, AlignedAllocator<StateInfo>> states(
+            params.write_maxply + MAX_PLY /* headroom, e.g. for the quiet-search moves below */);
+
+        StateInfo si; // root state handed to pos.set() at the start of every game
+
+        // Set through flush_psv() from the return value of commit_psv().
+        bool quit = false;
+
+        // One iteration of this loop plays one game.
+        while (!quit)
+        {
+            // The game is played on this thread's own root position.
+            // It starts from the next opening book entry when a book is
+            // loaded, and from StartFEN otherwise.
+            auto& pos = th.rootPos;
+            if (opening_book != nullptr)
+            {
+                auto& fen = opening_book->next_fen();
+                pos.set(fen, false, &si, &th);
+            }
+            else
+            {
+                pos.set(StartFEN, false, &si, &th);
+            }
+
+            int resign_counter = 0; // consecutive plies with |score| >= eval_limit
+            bool should_resign = prng.rand(10) > 1; // decided once per game; presumably true in 8 of 10 games — confirm PRNG::rand range
+            // Vector for holding the sfens in the current simulated game.
+            PSVector packed_sfens;
+            packed_sfens.reserve(params.write_maxply + MAX_PLY);
+
+            // Precomputed flags. Used internally by choose_random_move.
+            vector<uint8_t> random_move_flag = generate_random_move_flags();
+
+            // A counter that keeps track of the number of random moves
+            // When random_move_minply == -1, random moves are
+            // performed continuously, so use it at this time.
+            // Used internally by choose_random_move.
+            int actual_random_move_count = 0;
+
+            // Save history of move scores for adjudication
+            vector<int> move_hist_scores;
+
+            auto flush_psv = [&](int8_t result) { // labels and writes the collected game; result is relative to the current side to move
+                quit = commit_psv(th, packed_sfens, result, counter, limit, pos.side_to_move());
+            };
+
+            for (int ply = 0; ; ++ply) // one iteration per played ply; left only via break
+            {
+                // Current search depth
+                const int depth = params.search_depth_min + (int)prng.rand(params.search_depth_max - params.search_depth_min + 1);
+
+                // Starting search calls init_for_search
+                auto [search_value, search_pv] = Search::search(pos, depth, 1, params.nodes);
+
+                // This has to be performed after search because it needs to know
+                // rootMoves which are filled in init_for_search.
+                const auto result = get_current_game_result(pos, move_hist_scores);
+                if (result.has_value())
+                {
+                    flush_psv(result.value());
+                    break;
+                }
+
+                // Always adjudicate by eval limit.
+                // Also because of this we don't have to check for TB/MATE scores
+                if (abs(search_value) >= params.eval_limit)
+                {
+                    resign_counter++;
+                    if ((should_resign && resign_counter >= 4) || abs(search_value) >= VALUE_KNOWN_WIN) { // at once for a known win, else after 4 plies in a row
+                        flush_psv((search_value >= params.eval_limit) ? 1 : -1); // presumably search_value is from the side to move's view — confirm
+                        break;
+                    }
+                }
+                else
+                {
+                    resign_counter = 0;
+                }
+
+                // In case there is no PV and the game was not ended here
+                // there is nothing we can do, we can't continue the game,
+                // we don't know the result, so discard this game.
+                if (search_pv.empty())
+                {
+                    break;
+                }
+
+                // Save the move score for adjudication.
+                move_hist_scores.push_back(search_value);
+
+                // Discard stuff before write_minply is reached
+                // because it can harm training due to overfitting.
+                // Initial positions would be too common.
+                if (ply >= params.write_minply)
+                {
+                    packed_sfens.emplace_back(PackedSfenValue()); // added up front; removed again below when the position is skipped
+
+                    auto& psv = packed_sfens.back();
+
+                    if (params.ensure_quiet)
+                    {
+                        auto [qsearch_value, qsearch_pv] = Search::qsearch(pos);
+                        if (qsearch_pv.empty()) // NOTE(review): unlike the two other paths this one does not consult was_seen_before() — confirm intended
+                        {
+                            // Here we only write the position data.
+                            // Result is added after the whole game is done.
+                            pos.sfen_pack(psv.sfen);
+
+                            // Already a quiet position
+                            psv.score = search_value;
+                            psv.move = search_pv[0];
+                            psv.gamePly = ply;
+                        }
+                        else
+                        {
+                            // Play the quiescence PV to reach a quiet position; undone again below.
+                            int old_ply = ply;
+                            for (auto m : qsearch_pv)
+                            {
+                                pos.do_move(m, states[ply++]);
+                            }
+
+                            if (was_seen_before(pos))
+                            {
+                                // Just skip the move.
+                                packed_sfens.pop_back();
+                            }
+                            else
+                            {
+                                // Reevaluate
+                                auto [quiet_search_value, quiet_search_pv] = Search::search(pos, depth, 1, params.nodes);
+                                if (quiet_search_pv.empty())
+                                {
+                                    // Just skip the move.
+                                    packed_sfens.pop_back();
+                                }
+                                else
+                                {
+                                    // Here we only write the position data.
+                                    // Result is added after the whole game is done.
+                                    pos.sfen_pack(psv.sfen);
+
+                                    psv.score = quiet_search_value;
+                                    psv.move = quiet_search_pv[0];
+                                    psv.gamePly = ply; // ply of the quiet position, i.e. including the quiescence moves
+                                }
+                            }
+
+                            // Get back to the game
+                            for (auto it = qsearch_pv.rbegin(); it != qsearch_pv.rend(); ++it)
+                            {
+                                pos.undo_move(*it);
+                            }
+                            ply = old_ply;
+                        }
+                    }
+                    else
+                    {
+                        if (was_seen_before(pos))
+                        {
+                            packed_sfens.pop_back();
+                        }
+                        else
+                        {
+                            // Here we only write the position data.
+                            // Result is added after the whole game is done.
+                            pos.sfen_pack(psv.sfen);
+
+                            psv.score = search_value;
+                            psv.move = search_pv[0];
+                            psv.gamePly = ply;
+                        }
+                    }
+                }
+
+                // Update the next move according to best search result or random move.
+                auto random_move = choose_random_move(pos, random_move_flag, ply, actual_random_move_count);
+                const Move next_move = random_move.has_value() ? *random_move : search_pv[0];
+
+                // We don't have the whole game yet, but it ended,
+                // so the writing process ends and the next game starts.
+                // This shouldn't really happen.
+                if (!is_ok(next_move))
+                {
+                    break;
+                }
+
+                // Do move.
+                pos.do_move(next_move, states[ply]);
+            }
+        }
+    }
+
+    // Duplicate filter backed by the fixed-size `hash` table.
+    //
+    // The low bits of the position key select a slot. A slot that already
+    // holds this exact key means the position was recorded earlier, and true
+    // is returned. Otherwise the slot is overwritten with the key (evicting
+    // whatever was stored there) and false is returned.
+    //
+    // This is a heuristic: a position whose slot was overwritten in the
+    // meantime is reported as unseen again.
+    bool Gensfen::was_seen_before(const Position& pos)
+    {
+        const auto position_key = pos.key();
+        auto& slot = hash[(size_t)(position_key & (GENSFEN_HASH_SIZE - 1))];
+
+        if (slot == position_key)
+            return true;
+
+        slot = position_key;
+        return false;
+    }
+
+    // Decides whether the game being generated is already over.
+    //
+    // pos               current position; the rootMoves of its thread are
+    //                   consulted to detect the absence of legal moves
+    // move_hist_scores  search score of every ply played so far; its size
+    //                   is used as the current ply number
+    //
+    // Returns 0 for a draw, -1 when the side to move has no legal move and
+    // is in check, and nullopt when the game goes on.
+    optional<int8_t> Gensfen::get_current_game_result(
+        Position& pos,
+        const vector<int>& move_hist_scores) const
+    {
+        // Draw adjudication settings.
+        // Todo: Make this as an option.
+        constexpr int adj_draw_ply = 80;   // adjudicate only from this ply on
+        constexpr int adj_draw_cnt = 8;    // 4 scores of each side, consecutive
+        constexpr int adj_draw_score = 0;  // largest |score| in CP that counts as drawn
+
+        const int ply = move_hist_scores.size();
+
+        // Games that reach write_maxply are cut off as draws, just like
+        // the draws the position reports itself.
+        if (ply >= params.write_maxply || pos.is_draw(ply))
+        {
+            return 0;
+        }
+
+        // No legal move: mate when in check, stalemate otherwise.
+        if (pos.this_thread()->rootMoves.empty())
+        {
+            if (pos.checkers())
+            {
+                return -1;
+            }
+            return 0;
+        }
+
+        // Consecutive low scores: walk the history backwards and stop at
+        // the first score outside of the draw window.
+        if (params.detect_draw_by_consecutive_low_score && ply >= adj_draw_ply)
+        {
+            int streak = 0;
+            auto it = move_hist_scores.rbegin();
+            while (it != move_hist_scores.rend() && abs(*it) <= adj_draw_score)
+            {
+                if (++streak >= adj_draw_cnt)
+                {
+                    return 0;
+                }
+                ++it;
+            }
+        }
+
+        // Draw by insufficient mating material
+        if (params.detect_draw_by_insufficient_mating_material)
+        {
+            switch (pos.count<ALL_PIECES>())
+            {
+            case 2:
+                // (1) KvK
+                return 0;
+
+            case 3:
+            {
+                // (2) KvK + 1 minor piece
+                const int minors =
+                    pos.count<BISHOP>(WHITE) + pos.count<KNIGHT>(WHITE) +
+                    pos.count<BISHOP>(BLACK) + pos.count<KNIGHT>(BLACK);
+                if (minors == 1)
+                {
+                    return 0;
+                }
+                break;
+            }
+
+            case 4:
+                // (3) KBvKB, both bishops on squares of the same color.
+                // Each side has exactly one bishop here, so "both on dark
+                // or both on light" is the same as "equal dark-square tests".
+                if (pos.count<BISHOP>(WHITE) == 1 && pos.count<BISHOP>(BLACK) == 1)
+                {
+                    const bool white_on_dark = (pos.pieces(WHITE, BISHOP) & DarkSquares) != 0;
+                    const bool black_on_dark = (pos.pieces(BLACK, BISHOP) & DarkSquares) != 0;
+                    if (white_on_dark == black_on_dark)
+                    {
+                        return 0;
+                    }
+                }
+                break;
+
+            default:
+                break;
+            }
+        }
+
+        return nullopt;
+    }
+
+    // Builds the per-ply flags that tell choose_random_move() at which
+    // plies of the coming game a random move is to be played.
+    //
+    // Up to random_move_count distinct plies are drawn from the 0-based range
+    // [max(random_move_minply - 1, 0), random_move_maxply) and flagged.
+    // The returned vector has random_move_maxply + random_move_count
+    // entries; the extra entries leave room for the flags that the Apery
+    // mode of choose_random_move() may insert.
+    //
+    // random_move_maxply and random_move_count are taken from the command
+    // text without validation, so negative values are treated as 0 here
+    // instead of being converted to huge unsigned sizes.
+    vector<uint8_t> Gensfen::generate_random_move_flags()
+    {
+        const int max_ply = std::max(params.random_move_maxply, 0);
+        const int move_count = std::max(params.random_move_count, 0);
+
+        // random_move_minply and random_move_maxply are specified 1-based,
+        // the flags are indexed 0-based.
+        const int first_ply = std::max(params.random_move_minply - 1, 0);
+
+        // Candidate plies a[0] = first_ply, a[1] = first_ply + 1, ...
+        vector<int> a;
+        if (max_ply > first_ply)
+        {
+            a.reserve((size_t)(max_ply - first_ply));
+        }
+        for (int i = first_ply; i < max_ply; ++i)
+        {
+            a.push_back(i);
+        }
+
+        // All flags start out cleared.
+        vector<uint8_t> random_move_flag((size_t)max_ply + (size_t)move_count);
+
+        // Partial Fisher-Yates shuffle: only the first N entries are needed,
+        // so only those are shuffled. N cannot exceed the number of candidates.
+        const int picks = std::min(move_count, (int)a.size());
+        for (int i = 0; i < picks; ++i)
+        {
+            swap(a[i], a[prng.rand((uint64_t)a.size() - i) + i]);
+            random_move_flag[a[i]] = true;
+        }
+
+        return random_move_flag;
+    }
+
+    optional<Move> Gensfen::choose_random_move( // returns the random move to play at this ply, or nullopt when no random move is due
+        Position& pos,
+        std::vector<uint8_t>& random_move_flag, // per-ply flags from generate_random_move_flags(); may grow by one entry in Apery mode
+        int ply, // current ply of the game, 0-based
+        int& random_move_c) // in/out: number of random moves chosen so far in this game
+    {
+        optional<Move> random_move;
+
+        // A random move is due in either of the two modes below.
+        if (
+            // 1. Random move of random_move_count times from random_move_minply to random_move_maxply
+            (params.random_move_minply != -1 && ply < (int)random_move_flag.size() && random_move_flag[ply]) ||
+            // 2. A mode to perform random move of random_move_count times after leaving the startpos
+            (params.random_move_minply == -1 && random_move_c < params.random_move_count))
+        {
+            ++random_move_c;
+
+            // It's not a mate, so there should be one legal move...
+            if (params.random_multi_pv == 0)
+            {
+                // Normal random move
+                MoveList<LEGAL> list(pos);
+
+                // I don't really know the goodness and badness of making this the Apery method.
+                if (params.random_move_like_apery == 0
+                    || prng.rand(params.random_move_like_apery) != 0)
+                {
+                    // Normally one move from legal move
+                    random_move = list.at((size_t)prng.rand((uint64_t)list.size()));
+                }
+                else
+                {
+                    // if you can move the king, move the king
+                    Move moves[8]; // Near 8; NOTE(review): assumes at most 8 legal king moves — confirm for castling
+                    Move* p = &moves[0];
+                    for (auto& m : list)
+                    {
+                        if (type_of(pos.moved_piece(m)) == KING)
+                        {
+                            *(p++) = m;
+                        }
+                    }
+
+                    size_t n = p - &moves[0]; // number of king moves collected
+                    if (n != 0)
+                    {
+                        // move to move the king
+                        random_move = moves[prng.rand(n)];
+
+                        // In Apery method, at this time there is a 1/2 chance
+                        // that the opponent will also move randomly
+                        if (prng.rand(2) == 0)
+                        {
+                            // Insert a set flag right after the current ply so that the next ply is random too.
+                            random_move_flag.insert(random_move_flag.begin() + ply + 1, 1, true);
+                        }
+                    }
+                    else
+                    {
+                        // Normally one move from legal move
+                        random_move = list.at((size_t)prng.rand((uint64_t)list.size()));
+                    }
+                }
+            }
+            else
+            {
+                Search::search(pos, params.random_multi_pv_depth, params.random_multi_pv); // MultiPV search; only the rootMoves it leaves behind are used
+
+                // Select one from the top N hands of root Moves
+                auto& rm = pos.this_thread()->rootMoves;
+
+                uint64_t s = min((uint64_t)rm.size(), (uint64_t)params.random_multi_pv); // number of eligible candidates
+                for (uint64_t i = 1; i < s; ++i)
+                {
+                    // The difference from the evaluation value of rm[0] must
+                    // be within the range of random_multi_pv_diff.
+                    // It can be assumed that rm[x].score is arranged in descending order.
+                    if (rm[0].score > rm[i].score + params.random_multi_pv_diff)
+                    {
+                        s = i;
+                        break;
+                    }
+                }
+
+                random_move = rm[prng.rand(s)].pv[0]; // uniform pick among the eligible candidates
+            }
+        }
+
+        return random_move;
+    }
+
+    // Labels the positions of one finished game with the game result and
+    // hands them to the writer.
+    //
+    // th            worker thread; its index selects the writer's buffer
+    // sfens         positions of the game, in move order
+    // result        outcome from the point of view of `result_color`:
+    //               1 for a win, -1 for a loss, 0 for a draw
+    // counter       number of sfens claimed so far, shared by all workers
+    // limit         total number of sfens to write
+    // result_color  the side `result` refers to
+    //
+    // Returns true when `counter` has already reached `limit`, which tells
+    // the caller to stop generating; false otherwise.
+    bool Gensfen::commit_psv(
+        Thread& th,
+        PSVector& sfens,
+        int8_t result,
+        std::atomic<uint64_t>& counter,
+        uint64_t limit,
+        Color result_color)
+    {
+        // Drawn games may be excluded altogether. Nothing is written in
+        // that case, so it is no reason to stop either.
+        const bool is_draw = (result == 0);
+        if (is_draw && !params.write_out_draw_game_in_training_data_generation)
+            return false;
+
+        // Stamp every position with the result as seen by its own side to
+        // move, which is packed as the lowest bit of the first byte.
+        for (auto& entry : sfens)
+        {
+            const Color mover = (Color)(entry.sfen.data[0] & 1);
+            if (mover == result_color)
+                entry.game_result = result;
+            else
+                entry.game_result = -result;
+        }
+
+        // Write in move order to make potential compression easier.
+        for (auto& entry : sfens)
+        {
+            // Claim a slot first; once the quota is used up the remaining
+            // positions of this game are dropped.
+            const uint64_t claimed_before = counter.fetch_add(1);
+            if (claimed_before >= limit)
+                return true;
+
+            // This sfen is number claimed_before + 1.
+            maybe_report(claimed_before + 1);
+
+            sfen_writer.write(th.thread_idx(), entry);
+        }
+
+        return false;
+    }
+
+    // Prints one statistics line to the current output region and then
+    // opens a fresh region for the progress dots that follow.
+    //
+    // done      total number of sfens produced so far
+    // new_done  number of sfens produced since the previous report; used
+    //           for the throughput figure
+    void Gensfen::report(uint64_t done, uint64_t new_done)
+    {
+        const auto timestamp = now();
+
+        // The +1 keeps the divisor away from zero when no time has passed.
+        const TimePoint elapsed_ms = timestamp - last_stats_report_time + 1;
+        const auto sfens_per_second = new_done * 1000 / elapsed_ms;
+
+        out << endl << done << " sfens, " << sfens_per_second << " sfens/second, " << "at " << now_string() << sync_endl;
+
+        last_stats_report_time = timestamp;
+        out = sync_region_cout.new_region();
+    }
+
+    // Progress output, called with the running sfen count.
+    // On every multiple of REPORT_DOT_EVERY a dot is printed; on every
+    // multiple of REPORT_STATS_EVERY the dot is followed by a statistics
+    // line. The reporting state is protected by stats_mutex.
+    void Gensfen::maybe_report(uint64_t done)
+    {
+        if (done % REPORT_DOT_EVERY != 0)
+            return;
+
+        std::lock_guard lock(stats_mutex);
+
+        // First report of a run: start the clock and open an output region.
+        if (last_stats_report_time == 0)
+        {
+            last_stats_report_time = now();
+            out = sync_region_cout.new_region();
+        }
+
+        // Zero passes the modulo test above but is not progress.
+        if (done == 0)
+            return;
+
+        out << '.';
+
+        if (done % REPORT_STATS_EVERY == 0)
+            report(done, REPORT_STATS_EVERY);
+    }
+
+    // Command to generate a game record
+    void gensfen(istringstream& is)
+    {
+        // Number of generated game records default = 8 billion phases (Ponanza specification)
+        uint64_t loop_max = 8000000000UL;
+
+        Gensfen::Params params;
+
+        // Add a random number to the end of the file name.
+        bool random_file_name = false;
+        std::string sfen_format = "binpack";
+
+        string token;
+        while (true)
+        {
+            token = "";
+            is >> token;
+            if (token == "")
+                break;
+
+            if (token == "depth")
+                is >> params.search_depth_min;
+            else if (token == "depth2")
+                is >> params.search_depth_max;
+            else if (token == "nodes")
+                is >> params.nodes;
+            else if (token == "loop")
+                is >> loop_max;
+            else if (token == "output_file_name")
+                is >> params.output_file_name;
+            else if (token == "eval_limit")
+                is >> params.eval_limit;
+            else if (token == "random_move_minply")
+                is >> params.random_move_minply;
+            else if (token == "random_move_maxply")
+                is >> params.random_move_maxply;
+            else if (token == "random_move_count")
+                is >> params.random_move_count;
+            else if (token == "random_move_like_apery")
+                is >> params.random_move_like_apery;
+            else if (token == "random_multi_pv")
+                is >> params.random_multi_pv;
+            else if (token == "random_multi_pv_diff")
+                is >> params.random_multi_pv_diff;
+            else if (token == "random_multi_pv_depth")
+                is >> params.random_multi_pv_depth;
+            else if (token == "write_minply")
+                is >> params.write_minply;
+            else if (token == "write_maxply")
+                is >> params.write_maxply;
+            else if (token == "save_every")
+                is >> params.save_every;
+            else if (token == "book")
+                is >> params.book;
+            else if (token == "random_file_name")
+                is >> random_file_name;
+            // Accept also the old option name.
+            else if (token == "use_draw_in_training_data_generation" || token == "write_out_draw_game_in_training_data_generation")
+                is >> params.write_out_draw_game_in_training_data_generation;
+            // Accept also the old option name.
+            else if (token == "use_game_draw_adjudication" || token == "detect_draw_by_consecutive_low_score")
+                is >> params.detect_draw_by_consecutive_low_score;
+            else if (token == "detect_draw_by_insufficient_mating_material")
+                is >> params.detect_draw_by_insufficient_mating_material;
+            else if (token == "sfen_format")
+                is >> sfen_format;
+            else if (token == "seed")
+                is >> params.seed;
+            else if (token == "set_recommended_uci_options")
+            {
+                UCI::setoption("Contempt", "0");
+                UCI::setoption("Skill Level", "20");
+                UCI::setoption("UCI_Chess960", "false");
+                UCI::setoption("UCI_AnalyseMode", "false");
+                UCI::setoption("UCI_LimitStrength", "false");
+                UCI::setoption("PruneAtShallowDepth", "false");
+                UCI::setoption("EnableTranspositionTable", "true");
+            }
+            else if (token == "ensure_quiet")
+            {
+                params.ensure_quiet = true;
+            }
+            else
+                cout << "ERROR: Ignoring unknown option " << token << endl;
+        }
+
+        if (!sfen_format.empty())
+        {
+            if (sfen_format == "bin")
+                params.sfen_format = SfenOutputType::Bin;
+            else if (sfen_format == "binpack")
+                params.sfen_format = SfenOutputType::Binpack;
+            else
+                cout << "WARNING: Unknown sfen format `" << sfen_format << "`. Using bin\n";
+        }
+
+        if (params.ensure_quiet)
+        {
+            // Otherwise we can't ensure quiet positions...
+            UCI::setoption("EnableTranspositionTable", "false");
+        }
+
+        if (random_file_name)
+        {
+            // Give a random number to output_file_name at this point.
+            // Do not use std::random_device() because it always returns the same integers on MinGW.
+            PRNG r(params.seed);
+
+            // Just in case, draw and discard a few random numbers first.
+            for (int i = 0; i < 10; ++i)
+                r.rand(1);
+
+            auto to_hex = [](uint64_t u) {
+                std::stringstream ss;
+                ss << std::hex << u;
+                return ss.str();
+            };
+
+            // To reduce the chance of an accidental file name collision, append two 64-bit random numbers instead of one, just in case.
+            params.output_file_name += "_" + to_hex(r.rand<uint64_t>()) + to_hex(r.rand<uint64_t>());
+        }
+
+        params.enforce_constraints();
+
+        std::cout << "INFO: Executing gensfen command\n";
+
+        std::cout << "INFO: Parameters:\n";
+        std::cout
+            << "  - search_depth_min       = " << params.search_depth_min << endl
+            << "  - search_depth_max       = " << params.search_depth_max << endl
+            << "  - nodes                  = " << params.nodes << endl
+            << "  - num sfens to generate  = " << loop_max << endl
+            << "  - eval_limit             = " << params.eval_limit << endl
+            << "  - num threads (UCI)      = " << params.num_threads << endl
+            << "  - random_move_minply     = " << params.random_move_minply << endl
+            << "  - random_move_maxply     = " << params.random_move_maxply << endl
+            << "  - random_move_count      = " << params.random_move_count << endl
+            << "  - random_move_like_apery = " << params.random_move_like_apery << endl
+            << "  - random_multi_pv        = " << params.random_multi_pv << endl
+            << "  - random_multi_pv_diff   = " << params.random_multi_pv_diff << endl
+            << "  - random_multi_pv_depth  = " << params.random_multi_pv_depth << endl
+            << "  - write_minply           = " << params.write_minply << endl
+            << "  - write_maxply           = " << params.write_maxply << endl
+            << "  - book                   = " << params.book << endl
+            << "  - output_file_name       = " << params.output_file_name << endl
+            << "  - save_every             = " << params.save_every << endl
+            << "  - random_file_name       = " << random_file_name << endl
+            << "  - write_drawn_games      = " << params.write_out_draw_game_in_training_data_generation << endl
+            << "  - draw by low score      = " << params.detect_draw_by_consecutive_low_score << endl
+            << "  - draw by insuff. mat.   = " << params.detect_draw_by_insufficient_mating_material << endl;
+
+        // Show if the training data generator uses NNUE.
+        Eval::NNUE::verify_eval_file_loaded();
+
+        Threads.main()->ponder = false;
+
+        Gensfen gensfen(params);
+        gensfen.generate(loop_max);
+
+        std::cout << "INFO: Gensfen finished." << endl;
+    }
+}
@@ -0,0 +1,14 @@
+#ifndef _GENSFEN_H_
+#define _GENSFEN_H_
+
+#include "position.h"
+
+#include <sstream>
+
+namespace Learner {
+
+    // Automatic generation of teacher positions (the `gensfen` command); its options are read from `is`.
+    void gensfen(std::istringstream& is);
+}
+
+#endif
@@ -1 +0,0 @@
-// just a place holder
@@ -7,126 +7,126 @@
 // Floating point operation by 16bit type
 // Assume that the float type code generated by the compiler is in IEEE 754 format and use it.

-#include "../types.h"
+#include "types.h"

 namespace HalfFloat
 {
-	// IEEE 754 float 32 format is :
-	//   sign(1bit) + exponent(8bits) + fraction(23bits) = 32bits
-	//
-	// Our float16 format is :
-	//   sign(1bit) + exponent(5bits) + fraction(10bits) = 16bits
-	union float32_converter
-	{
-		int32_t n;
-		float f;
-	};
+    // IEEE 754 float 32 format is :
+    //   sign(1bit) + exponent(8bits) + fraction(23bits) = 32bits
+    //
+    // Our float16 format is :
+    //   sign(1bit) + exponent(5bits) + fraction(10bits) = 16bits
+    union float32_converter
+    {
+        int32_t n;
+        float f;
+    };


-	// 16-bit float
-	struct float16
-	{
-		// --- constructors
+    // 16-bit float
+    struct float16
+    {
+        // --- constructors

-		float16() {}
-		float16(int16_t n) { from_float((float)n);  }
-		float16(int32_t n) { from_float((float)n); }
-		float16(float n) { from_float(n); }
-		float16(double n) { from_float((float)n); }
+        float16() {}
+        float16(int16_t n) { from_float((float)n);  }
+        float16(int32_t n) { from_float((float)n); }
+        float16(float n) { from_float(n); }
+        float16(double n) { from_float((float)n); }

-		// build from a float
-		void from_float(float f) { *this = to_float16(f); }
+        // build from a float
+        void from_float(float f) { *this = to_float16(f); }

-		// --- implicit converters
+        // --- implicit converters

-		operator int32_t() const { return (int32_t)to_float(*this); }
-		operator float() const { return to_float(*this); }
-		operator double() const { return double(to_float(*this)); }
+        operator int32_t() const { return (int32_t)to_float(*this); }
+        operator float() const { return to_float(*this); }
+        operator double() const { return double(to_float(*this)); }

-		// --- operators
+        // --- operators

-		float16 operator += (float16 rhs) { from_float(to_float(*this) + to_float(rhs)); return *this; }
-		float16 operator -= (float16 rhs) { from_float(to_float(*this) - to_float(rhs)); return *this; }
-		float16 operator *= (float16 rhs) { from_float(to_float(*this) * to_float(rhs)); return *this; }
-		float16 operator /= (float16 rhs) { from_float(to_float(*this) / to_float(rhs)); return *this; }
-		float16 operator + (float16 rhs) const { return float16(*this) += rhs; }
-		float16 operator - (float16 rhs) const { return float16(*this) -= rhs; }
-		float16 operator * (float16 rhs) const { return float16(*this) *= rhs; }
-		float16 operator / (float16 rhs) const { return float16(*this) /= rhs; }
-		float16 operator - () const { return float16(-to_float(*this)); }
-		bool operator == (float16 rhs) const { return this->v_ == rhs.v_; }
-		bool operator != (float16 rhs) const { return !(*this == rhs); }
+        float16 operator += (float16 rhs) { from_float(to_float(*this) + to_float(rhs)); return *this; }
+        float16 operator -= (float16 rhs) { from_float(to_float(*this) - to_float(rhs)); return *this; }
+        float16 operator *= (float16 rhs) { from_float(to_float(*this) * to_float(rhs)); return *this; }
+        float16 operator /= (float16 rhs) { from_float(to_float(*this) / to_float(rhs)); return *this; }
+        float16 operator + (float16 rhs) const { return float16(*this) += rhs; }
+        float16 operator - (float16 rhs) const { return float16(*this) -= rhs; }
+        float16 operator * (float16 rhs) const { return float16(*this) *= rhs; }
+        float16 operator / (float16 rhs) const { return float16(*this) /= rhs; }
+        float16 operator - () const { return float16(-to_float(*this)); }
+        bool operator == (float16 rhs) const { return this->v_ == rhs.v_; }
+        bool operator != (float16 rhs) const { return !(*this == rhs); }

-		static void UnitTest() { unit_test(); }
+        static void UnitTest() { unit_test(); }

-	private:
+    private:

-		// --- entity
+        // --- entity

-		uint16_t v_;
+        uint16_t v_;

-		// --- conversion between float and float16
+        // --- conversion between float and float16

-		static float16 to_float16(float f)
-		{
-			float32_converter c;
-			c.f = f;
-			u32 n = c.n;
+        static float16 to_float16(float f)
+        {
+            float32_converter c;
+            c.f = f;
+            u32 n = c.n;

-			// The sign bit is MSB in common.
-			uint16_t sign_bit = (n >> 16) & 0x8000;
+            // The sign bit is MSB in common.
+            uint16_t sign_bit = (n >> 16) & 0x8000;

-			// The exponent of IEEE 754's float 32 is biased +127 , so we change this bias into +15 and limited to 5-bit.
-			uint16_t exponent = (((n >> 23) - 127 + 15) & 0x1f) << 10;
+            // The exponent of IEEE 754's float 32 is biased +127 , so we change this bias into +15 and limited to 5-bit.
+            uint16_t exponent = (((n >> 23) - 127 + 15) & 0x1f) << 10;

-			// The fraction is limited to 10-bit.
-			uint16_t fraction = (n >> (23-10)) & 0x3ff;
+            // The fraction is limited to 10-bit.
+            uint16_t fraction = (n >> (23-10)) & 0x3ff;

-			float16 f_;
-			f_.v_ = sign_bit | exponent | fraction;
+            float16 f_;
+            f_.v_ = sign_bit | exponent | fraction;

-			return f_;
-		}
+            return f_;
+        }

-		static float to_float(float16 v)
-		{
-			u32 sign_bit = (v.v_ & 0x8000) << 16;
-			u32 exponent = ((((v.v_ >> 10) & 0x1f) - 15 + 127) & 0xff) << 23;
-			u32 fraction = (v.v_ & 0x3ff) << (23 - 10);
+        static float to_float(float16 v)
+        {
+            u32 sign_bit = (v.v_ & 0x8000) << 16;
+            u32 exponent = ((((v.v_ >> 10) & 0x1f) - 15 + 127) & 0xff) << 23;
+            u32 fraction = (v.v_ & 0x3ff) << (23 - 10);

-			float32_converter c;
-			c.n = sign_bit | exponent | fraction;
-			return c.f;
-		}
+            float32_converter c;
+            c.n = sign_bit | exponent | fraction;
+            return c.f;
+        }

-		// It is not a unit test, but I confirmed that it can be calculated. I'll fix the code later (maybe).
-		static void unit_test()
-		{
-			float16 a, b, c, d;
-			a = 1;
-			std::cout << (float)a << std::endl;
-			b = -118.625;
-			std::cout << (float)b << std::endl;
-			c = 2.5;
-			std::cout << (float)c << std::endl;
-			d = a + c;
-			std::cout << (float)d << std::endl;
+        // Not a real unit test: it only prints the results of a few operations so they can be checked by eye. To be fixed later (maybe).
+        static void unit_test()
+        {
+            float16 a, b, c, d;
+            a = 1;
+            std::cout << (float)a << std::endl;
+            b = -118.625;
+            std::cout << (float)b << std::endl;
+            c = 2.5;
+            std::cout << (float)c << std::endl;
+            d = a + c;
+            std::cout << (float)d << std::endl;

-			c *= 1.5;
-			std::cout << (float)c << std::endl;
+            c *= 1.5;
+            std::cout << (float)c << std::endl;

-			b /= 3;
-			std::cout << (float)b << std::endl;
+            b /= 3;
+            std::cout << (float)b << std::endl;

-			float f1 = 1.5;
-			a += f1;
-			std::cout << (float)a << std::endl;
+            float f1 = 1.5;
+            a += f1;
+            std::cout << (float)a << std::endl;

-			a += f1 * (float)a;
-			std::cout << (float)a << std::endl;
-		}
+            a += f1 * (float)a;
+            std::cout << (float)a << std::endl;
+        }

-	};
+    };

 }

@@ -1,101 +1,6 @@
 #ifndef _LEARN_H_
 #define _LEARN_H_

-#if defined(EVAL_LEARN)
-
-#include <vector>
-
-// =====================
-// Settings for learning
-// =====================
-
-// If you select one of the following, the details after that will be automatically selected.
-// If you don't select any of them, you need to set the subsequent details one by one.
-
-// Learning setting by elmo method. This is the default setting.
-// To make a standard squeeze diaphragm, specify "lambda 1" with the learn command.
-#define LEARN_ELMO_METHOD
-
-
-// ----------------------
-// update formula
-// ----------------------
-
-// Ada Grad. Recommended because it is stable.
-// #define ADA_GRAD_UPDATE
-
-// SGD looking only at the sign of the gradient. It requires less memory, but the accuracy is...
-// #define SGD_UPDATE
-
-// ----------------------
-// Settings for learning
-// ----------------------
-
-// mini-batch size.
-// Calculate the gradient by combining this number of phases.
-// If you make it smaller, the number of update_weights() will increase and the convergence will be faster. The gradient is incorrect.
-// If you increase it, the number of update_weights() decreases, so the convergence will be slow. The slope will come out accurately.
-// I don't think you need to change this value in most cases.
-
-#define LEARN_MINI_BATCH_SIZE (1000 * 1000 * 1)
-
-// The number of phases to read from the file at one time. After reading this much, shuffle.
-// It is better to have a certain size, but this number x 40 bytes x 3 times as much memory is consumed. 400MB*3 is consumed in the 10M phase.
-// Must be a multiple of THREAD_BUFFER_SIZE(=10000).
-
-#define LEARN_SFEN_READ_SIZE (1000 * 1000 * 10)
-
-// Saving interval of evaluation function at learning. Save each time you learn this number of phases.
-// Needless to say, the longer the saving interval, the shorter the learning time.
-// Folder name is incremented for each save like 0/, 1/, 2/...
-// By default, once every 1 billion phases.
-#define LEARN_EVAL_SAVE_INTERVAL (1000000000ULL)
-
-
-// ----------------------
-// Select the objective function
-// ----------------------
-
-// The objective function is the sum of squares of the difference in winning percentage
-// See learner.cpp for more information.
-
-//#define LOSS_FUNCTION_IS_WINNING_PERCENTAGE
-
-// Objective function is cross entropy
-// See learner.cpp for more information.
-// So-called ordinary "rag cloth squeezer"
-//#define LOSS_FUNCTION_IS_CROSS_ENTOROPY
-
-// A version in which the objective function is cross entropy, but the win rate function is not passed
-// #define LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE
-
-// elmo (WCSC27) method
-// #define LOSS_FUNCTION_IS_ELMO_METHOD
-
-// ※ Other things may be added.
-
-
-// ----------------------
-// debug settings for learning
-// ----------------------
-
-// Reduce the output of rmse during learning to 1 for this number of times.
-// rmse calculation is done in one thread, so it takes some time, so reducing the output is effective.
-#define LEARN_RMSE_OUTPUT_INTERVAL 1
-
-
-// ----------------------
-// learning from zero vector
-// ----------------------
-
-// Start learning the evaluation function parameters from the zero vector.
-// Initialize to zero, generate a game, learn from zero vector,
-// Game generation → If you repeat learning, you will get parameters that do not depend on the professional game. (maybe)
-// (very time consuming)
-
-//#define RESET_TO_ZERO_VECTOR
-
-
 // ----------------------
 // Floating point for learning
 // ----------------------
@@ -105,7 +10,7 @@
 // Even if it is a double type, there is almost no difference in the way of convergence, so fix it to float.

 // when using float
-typedef float LearnFloatType;
+using LearnFloatType = float;

 // when using double
 //typedef double LearnFloatType;
@@ -114,59 +19,6 @@ typedef float LearnFloatType;
 //#include "half_float.h"
 //typedef HalfFloat::float16 LearnFloatType;

-// ----------------------
-// save memory
-// ----------------------
-
-// Use a triangular array for the Weight array (of which is KPP) to save memory.
-// If this is used, the weight array for learning will be about 3 times as large as the evaluation function file.
-
-#define USE_TRIANGLE_WEIGHT_ARRAY
-
-// ----------------------
-// dimension down
-// ----------------------
-
-// Dimension reduction for mirrors (left/right symmetry) and inverse (forward/backward symmetry).
-// All on by default.
-
-// Dimension reduction using mirror and inverse for KK. (Unclear effect)
-// USE_KK_MIRROR_WRITE must be on when USE_KK_INVERSE_WRITE is on.
-#define USE_KK_MIRROR_WRITE
-#define USE_KK_INVERSE_WRITE
-
-// Dimension reduction using Mirror and Inverse for KKP. (Inverse is not so effective)
-// When USE_KKP_INVERSE_WRITE is turned on, USE_KKP_MIRROR_WRITE must also be turned on.
-#define USE_KKP_MIRROR_WRITE
-#define USE_KKP_INVERSE_WRITE
-
-// Perform dimension reduction using a mirror for KPP. (Turning this off requires double the teacher position)
-// KPP has no inverse. (Because there is only K on the front side)
-#define USE_KPP_MIRROR_WRITE
-
-// Perform a dimension reduction using a mirror for KPPP. (Turning this off requires double the teacher position)
-// KPPP has no inverse. (Because there is only K on the front side)
-#define USE_KPPP_MIRROR_WRITE
-
-// Reduce the dimension by KPP for learning the KKPP component.
-// Learning is very slow.
-// Do not use as it is not debugged.
-//#define USE_KKPP_LOWER_DIM
-
-
-// ======================
-// Settings for creating teacher phases
-// ======================
-
-// ----------------------
-// write out the draw
-// ----------------------
-
-// When you reach a draw, write it out as a teacher position
-// It's subtle whether it's better to do this.
-// #define LEARN_GENSFEN_USE_DRAW_RESULT
-
-
 // ======================
 // configure
 // ======================
@@ -175,63 +27,122 @@ typedef float LearnFloatType;
 // Learning with the method of elmo (WCSC27)
 // ----------------------

-#if defined( LEARN_ELMO_METHOD )
-#define LOSS_FUNCTION_IS_ELMO_METHOD
-#define ADA_GRAD_UPDATE
-#endif
-
+#define LOSS_FUNCTION "ELMO_METHOD(WCSC27)"

 // ----------------------
 // Definition of struct used in Learner
 // ----------------------
-#include "../position.h"
+
+#include "autograd.h"
+#include "packed_sfen.h"
+
+#include "position.h"
+
+#include <sstream>
+#include <vector>
+#include <mutex>
+#include <string>

 namespace Learner
 {
-	//Structure in which PackedSfen and evaluation value are integrated
-	// If you write different contents for each option, it will be a problem when reusing the teacher game
-	// For the time being, write all the following members regardless of the options.
-	struct PackedSfenValue
-	{
-		// phase
-		PackedSfen sfen;
+    // ----------------------
+    // Settings for learning
+    // ----------------------

-		// Evaluation value returned from Learner::search()
-		int16_t score;
+    // mini-batch size.
+    // Calculate the gradient by combining this number of phases.
+    // If you make it smaller, update_weights() is called more often and convergence is faster, but the gradient is less accurate.
+    // If you increase it, update_weights() is called less often, so convergence is slower, but the gradient is more accurate.
+    // I don't think you need to change this value in most cases.

-		// PV first move
-		// Used when finding the match rate with the teacher
-		uint16_t move;
+    constexpr std::size_t LEARN_MINI_BATCH_SIZE = 1000 * 1000 * 1;

-		// Trouble of the phase from the initial phase.
-		uint16_t gamePly;
+    // Saving interval of evaluation function at learning. Save each time you learn this number of phases.
+    // Needless to say, the longer the saving interval, the shorter the learning time.
+    // Folder name is incremented for each save like 0/, 1/, 2/...
+    // By default, once every 100 million phases.
+    constexpr std::size_t LEARN_EVAL_SAVE_INTERVAL = 100'000'000ULL;

-		// 1 if the player on this side ultimately wins the game. -1 if you are losing.
-		// 0 if a draw is reached.
-		// The draw is in the teacher position generation command gensfen,
-		// Only write if LEARN_GENSFEN_DRAW_RESULT is enabled.
-		int8_t game_result;
+    // Print rmse during learning only once every LEARN_RMSE_OUTPUT_INTERVAL times.
+    // The rmse calculation is done in a single thread and takes some time, so reducing the output frequency is effective.
+    constexpr std::size_t LEARN_RMSE_OUTPUT_INTERVAL = 1;

-		// When exchanging the file that wrote the teacher aspect with other people
-		//Because this structure size is not fixed, pad it so that it is 40 bytes in any environment.
-		uint8_t padding;
+    // Learning from the generated game record
+    void learn(std::istringstream& is);

-		// 32 + 2 + 2 + 2 + 1 + 1 = 40bytes
-	};
+    using CalcLossFunc = ValueWithGrad<double>(Value, Value, int, int);

-	// Type that returns the reading line and the evaluation value at that time
-	// Used in Learner::search(), Learner::qsearch().
-	typedef std::pair<Value, std::vector<Move> > ValueAndPV;
+    struct Loss
+    {
+        double value() const
+        {
+            return m_loss.value;
+        }

-	// So far, only Yaneura King 2018 Otafuku has this stub
-	// This stub is required if EVAL_LEARN is defined.
-	extern Learner::ValueAndPV  search(Position& pos, int depth , size_t multiPV = 1 , uint64_t NodesLimit = 0);
-	extern Learner::ValueAndPV qsearch(Position& pos);
+        double grad() const
+        {
+            return m_loss.grad;
+        }

-	double calc_grad(Value shallow, const PackedSfenValue& psv);
+        uint64_t count() const
+        {
+            return m_count;
+        }

+        Loss() = default;
+
+        Loss(const Loss& other) :
+            m_loss(other.m_loss),
+            m_count(other.m_count)
+        {
+        }
+
+        Loss& operator += (const ValueWithGrad<double>& rhs)
+        {
+            std::unique_lock lock(m_mutex);
+
+            m_loss += rhs.abs();
+            m_count += 1;
+
+            return *this;
+        }
+
+        Loss& operator += (const Loss& rhs)
+        {
+            std::unique_lock lock(m_mutex);
+
+            m_loss += rhs.m_loss.abs();
+            m_count += rhs.m_count;
+
+            return *this;
+        }
+
+        void reset()
+        {
+            std::unique_lock lock(m_mutex);
+
+            m_loss = ValueWithGrad<double>{ 0.0, 0.0 };
+            m_count = 0;
+        }
+
+        template <typename StreamT>
+        void print_with_grad(const std::string& prefix, StreamT& s) const
+        {
+            s << "  - " << prefix << "_loss       = " << m_loss.value / (double)m_count << std::endl;
+            s << "  - " << prefix << "_grad_norm  = " << m_loss.grad / (double)m_count << std::endl;
+        }
+
+        template <typename StreamT>
+        void print_only_loss(const std::string& prefix, StreamT& s) const
+        {
+            s << "  - " << prefix << "_loss       = " << m_loss.value / (double)m_count << std::endl;
+        }
+
+    private:
+        ValueWithGrad<double> m_loss{ 0.0, 0.0 };
+        uint64_t m_count{0};
+        std::mutex m_mutex;
+    };
 }

-#endif
-
 #endif // ifndef _LEARN_H_
@@ -1,25 +0,0 @@
-#include "learning_tools.h"
-
-#if defined (EVAL_LEARN)
-
-#if defined(_OPENMP)
-#include <omp.h>
-#endif
-#include "../misc.h"
-
-using namespace Eval;
-
-namespace EvalLearningTools
-{
-
-	// --- static variables
-
-	double Weight::eta;
-	double Weight::eta1;
-	double Weight::eta2;
-	double Weight::eta3;
-	uint64_t Weight::eta1_epoch;
-	uint64_t Weight::eta2_epoch;
-}
-
-#endif
@@ -1,200 +0,0 @@
-#ifndef __LEARN_WEIGHT_H__
-#define __LEARN_WEIGHT_H__
-
-// A set of machine learning tools related to the weight array used for machine learning of evaluation functions
-
-#include "learn.h"
-#if defined (EVAL_LEARN)
-#include <array>
-
-#if defined(SGD_UPDATE) || defined(USE_KPPP_MIRROR_WRITE)
-#include "../misc.h"  // PRNG , my_insertion_sort
-#endif
-
-#include <cmath>	// std::sqrt()
-
-namespace EvalLearningTools
-{
-	// -------------------------------------------------
-	//   Array for learning that stores gradients etc.
-	// -------------------------------------------------
-
-#if defined(_MSC_VER)
-#pragma pack(push,2)
-#elif defined(__GNUC__)
-#pragma pack(2)
-#endif
-	struct Weight
-	{
-		// cumulative value of one mini-batch gradient
-		LearnFloatType g = LearnFloatType(0);
-
-		// When ADA_GRAD_UPDATE. LearnFloatType == float,
-		// total 4*2 + 4*2 + 1*2 = 18 bytes
-		// It suffices to secure a Weight array that is 4.5 times the size of the evaluation function parameter of 1GB.
-		// However, sizeof(Weight)==20 code is generated if the structure alignment is in 4-byte units, so
-		// Specify pragma pack(2).
-
-		// For SGD_UPDATE, this structure is reduced by 10 bytes to 8 bytes.
-
-		// Learning rate η(eta) such as AdaGrad.
-		// It is assumed that eta1,2,3,eta1_epoch,eta2_epoch have been set by the time updateFV() is called.
-		// The epoch of update_weights() gradually changes from eta1 to eta2 until eta1_epoch.
-		// After eta2_epoch, gradually change from eta2 to eta3.
-		static double eta;
-		static double eta1;
-		static double eta2;
-		static double eta3;
-		static uint64_t eta1_epoch;
-		static uint64_t eta2_epoch;
-
-		// Batch initialization of eta. If 0 is passed, the default value will be set.
-		static void init_eta(double eta1, double eta2, double eta3, uint64_t eta1_epoch, uint64_t eta2_epoch)
-		{
-			Weight::eta1 = (eta1 != 0) ? eta1 : 30.0;
-			Weight::eta2 = (eta2 != 0) ? eta2 : 30.0;
-			Weight::eta3 = (eta3 != 0) ? eta3 : 30.0;
-			Weight::eta1_epoch = (eta1_epoch != 0) ? eta1_epoch : 0;
-			Weight::eta2_epoch = (eta2_epoch != 0) ? eta2_epoch : 0;
-		}
-
-		// Set eta according to epoch.
-		static void calc_eta(uint64_t epoch)
-		{
-			if (Weight::eta1_epoch == 0) // Exclude eta2
-				Weight::eta = Weight::eta1;
-			else if (epoch < Weight::eta1_epoch)
-				// apportion
-				Weight::eta = Weight::eta1 + (Weight::eta2 - Weight::eta1) * epoch / Weight::eta1_epoch;
-			else if (Weight::eta2_epoch == 0) // Exclude eta3
-				Weight::eta = Weight::eta2;
-			else if (epoch < Weight::eta2_epoch)
-				Weight::eta = Weight::eta2 + (Weight::eta3 - Weight::eta2) * (epoch - Weight::eta1_epoch) / (Weight::eta2_epoch - Weight::eta1_epoch);
-			else
-				Weight::eta = Weight::eta3;
-		}
-
-		template <typename T> void updateFV(T& v) { updateFV(v, 1.0); }
-
-#if defined (ADA_GRAD_UPDATE)
-
-		// Since the maximum value that can be accurately calculated with float is INT16_MAX*256-1
-		// Keep the small value as a marker.
-		const LearnFloatType V0_NOT_INIT = (INT16_MAX * 128);
-
-		// What holds v internally. The previous implementation kept a fixed decimal with only a fractional part to save memory,
-		// Since it is doubtful in accuracy and the visibility is bad, it was abolished.
-		LearnFloatType v0 = LearnFloatType(V0_NOT_INIT);
-
-		// AdaGrad g2
-		LearnFloatType g2 = LearnFloatType(0);
-
-		// update with AdaGrad
-		// When executing this function, the value of g and the member do not change
-		// Guaranteed by the caller. It does not have to be an atomic operation.
-		// k is a coefficient for eta. 1.0 is usually sufficient. If you want to lower eta for your turn item, set this to 1/8.0 etc.
-		template <typename T>
-		void updateFV(T& v,double k)
-		{
-			// AdaGrad update formula
-			// Gradient vector is g, vector to be updated is v, η(eta) is a constant,
-			//     g2 = g2 + g^2
-			//     v = v - ηg/sqrt(g2)
-
-			constexpr double epsilon = 0.000001;
-
-			if (g == LearnFloatType(0))
-				return;
-
-			g2 += g * g;
-
-			// If v0 is V0_NOT_INIT, it means that the value is not initialized with the value of KK/KKP/KPP array,
-			// In this case, read the value of v from the one passed in the argument.
-			double V = (v0 == V0_NOT_INIT) ? v : v0;
-
-			V -= k * eta * (double)g / sqrt((double)g2 + epsilon);
-
-			// Limit the value of V to be within the range of types.
-			// By the way, windows.h defines the min and max macros, so to avoid it,
-			// Here, it is enclosed in parentheses so that it is not treated as a function-like macro.
-			V = (std::min)((double)(std::numeric_limits<T>::max)() , V);
-			V = (std::max)((double)(std::numeric_limits<T>::min)() , V);
-
-			v0 = (LearnFloatType)V;
-			v = (T)round(V);
-
-			// Clear g because one update of mini-batch for this element is over
-			// g[i] = 0;
-			// → There is a problem of dimension reduction, so this will be done by the caller.
-		}
-
-#elif defined(SGD_UPDATE)
-
-		// See only the sign of the gradient Update with SGD
-		// When executing this function, the value of g and the member do not change
-		// Guaranteed by the caller. It does not have to be an atomic operation.
-		template <typename T>
-		void updateFV(T & v , double k)
-		{
-			if (g == 0)
-				return;
-
-			// See only the sign of g and update.
-			// If g <0, add v a little.
-			// If g> 0, subtract v slightly.
-
-			// Since we only add integers, no decimal part is required.
-
-			// It's a good idea to move around 0-5.
-			// It is better to have a Gaussian distribution, so generate a 5-bit random number (each bit has a 1/2 probability of 1),
-			// Pop_count() it. At this time, it has a binomial distribution.
-			//int16_t diff = (int16_t)POPCNT32((u32)prng.rand(31));
-			// → If I do this with 80 threads, this AsyncPRNG::rand() locks, so I slowed down. This implementation is not good.
-			int16_t diff = 1;
-
-			double V = v;
-			if (g > 0.0)
-				V-= diff;
-			else
-				V+= diff;
-
-			V = (std::min)((double)(std::numeric_limits<T>::max)(), V);
-			V = (std::max)((double)(std::numeric_limits<T>::min)(), V);
-
-			v = (T)V;
-		}
-
-#endif
-
-		// grad setting
-		template <typename T> void set_grad(const T& g_) { g = g_; }
-
-		// Add grad
-		template <typename T> void add_grad(const T& g_) { g += g_; }
-
-		LearnFloatType get_grad() const { return g; }
-	};
-#if defined(_MSC_VER)
-#pragma pack(pop)
-#elif defined(__GNUC__)
-#pragma pack(0)
-#endif
-
-	// Turned weight array
-	// In order to be able to handle it transparently, let's have the same member as Weight.
-	struct Weight2
-	{
-		Weight w[2];
-
-		//Evaluate your turn, eta 1/8.
-		template <typename T> void updateFV(std::array<T, 2>& v) { w[0].updateFV(v[0] , 1.0); w[1].updateFV(v[1],1.0/8.0); }
-
-		template <typename T> void set_grad(const std::array<T, 2>& g) { for (int i = 0; i<2; ++i) w[i].set_grad(g[i]); }
-		template <typename T> void add_grad(const std::array<T, 2>& g) { for (int i = 0; i<2; ++i) w[i].add_grad(g[i]); }
-
-		std::array<LearnFloatType, 2> get_grad() const { return std::array<LearnFloatType, 2>{w[0].get_grad(), w[1].get_grad()}; }
-	};
-}
-
-#endif // defined (EVAL_LEARN)
-#endif
@@ -1,123 +0,0 @@
-#include "../types.h"
-
-#if defined(EVAL_LEARN)
-
-#include "multi_think.h"
-#include "../tt.h"
-#include "../uci.h"
-
-#include <thread>
-
-void MultiThink::go_think()
-{
-	// Keep a copy to restore the Options settings later.
-	auto oldOptions = Options;
-
-	// When using the constant track, it takes a lot of time to perform on the fly & the part to access the file is
-	// Since it is not thread safe, it is guaranteed here that it is being completely read in memory.
-	Options["BookOnTheFly"] = std::string("false");
-
-	// Read evaluation function, etc.
-	// In the case of the learn command, the value of the evaluation function may be corrected after reading the evaluation function, so
-	// Skip memory corruption check.
-	Eval::init_NNUE();
-
-	// Call the derived class's init().
-	init();
-
-	// The loop upper limit is set with set_loop_max().
-	loop_count = 0;
-	done_count = 0;
-
-	// Create threads as many as Options["Threads"] and start thinking.
-	std::vector<std::thread> threads;
-	auto thread_num = (size_t)Options["Threads"];
-
-	// Secure end flag of worker thread
-	thread_finished.resize(thread_num);
-	
-	// start worker thread
-	for (size_t i = 0; i < thread_num; ++i)
-	{
-		thread_finished[i] = 0;
-		threads.push_back(std::thread([i, this]
-		{ 
-			// exhaust all processor threads.
-			WinProcGroup::bindThisThread(i);
-
-			// execute the overridden process
-			this->thread_worker(i);
-
-			// Set the end flag because the thread has ended
-			this->thread_finished[i] = 1;
-		}));
-	}
-
-	// wait for all threads to finish
-	// for (auto& th :threads)
-	// th.join();
-	// If you write like, the thread will rush here while it is still working,
-	// During that time, callback_func() cannot be called and you cannot save.
-	// Therefore, you need to check the end flag yourself.
-
-	// function to determine if all threads have finished
-	auto threads_done = [&]()
-	{
-		// returns false if no one is finished
-		for (auto& f : thread_finished)
-			if (!f)
-				return false;
-		return true;
-	};
-
-	// Call back if the callback function is set.
-	auto do_a_callback = [&]()
-	{
-		if (callback_func)
-			callback_func();
-	};
-
-
-	for (uint64_t i = 0 ; ; )
-	{
-		// If all threads have finished, exit the loop.
-		if (threads_done())
-			break;
-
-		sleep(1000);
-
-		// callback_func() is called every callback_seconds.
-		if (++i == callback_seconds)
-		{
-			do_a_callback();
-			// Since I am returning from ↑, I reset the counter, so
-			// no matter how long it takes to save() etc. in do_a_callback()
-			// The next call will take a certain amount of time.
-			i = 0;
-		}
-	}
-
-	// Last save.
-	std::cout << std::endl << "finalize..";
-
-	// do_a_callback();
-	// → It should be saved by the caller, so I feel that it is not necessary here.
-
-	// It is possible that the exit code of the thread is running but the exit code of the thread is running, so
-	// We need to wait for the end with join().
-	for (auto& th : threads)
-		th.join();
-
-	// The file writing thread etc. are still running only when all threads are finished
-	// Since the work itself may not have completed, output only that all threads have finished.
-	std::cout << "all threads are joined." << std::endl;
-
-	// Restored because Options were rewritten.
-	// Restore the handler because the handler will not start unless you assign a value.
-	for (auto& s : oldOptions)
-		Options[s.first] = std::string(s.second);
-
-}
-
-
-#endif // defined(EVAL_LEARN)
@@ -1,152 +0,0 @@
-#ifndef _MULTI_THINK_
-#define _MULTI_THINK_
-
-#if defined(EVAL_LEARN)
-
-#include <functional>
-#include <mutex>
-
-#include "../misc.h"
-#include "../learn/learn.h"
-#include "../thread_win32_osx.h"
-
-#include <atomic>
-
-// Learning from a game record, when making yourself think and generating a fixed track, etc.
-// Helper class used when multiple threads want to call Search::think() individually.
-// Derive and use this class.
-struct MultiThink
-{
-	MultiThink() : prng(std::chrono::system_clock::now().time_since_epoch().count())
-	{
-		loop_count = 0;
-	}
-
-	// Call this function from the master thread, each thread will think,
-	// Return control when the thought ending condition is satisfied.
-	// Do something else.
-	// ・It is safe for each thread to call Learner::search(),qsearch()
-	// Separates the substitution table for each thread. (It will be restored after the end.)
-	// ・Book is not thread safe when in on the fly mode, so temporarily change this mode.
-	// Turn it off.
-	// [Requirements]
-	// 1) Override thread_worker()
-	// 2) Set the loop count with set_loop_max()
-	// 3) set a function to be called back periodically (if necessary)
-	// callback_func and callback_interval
-	void go_think();
-
-	// If there is something you want to initialize on the derived class side, override this,
-	// Called when initialization is completed with go_think().
-	// It is better to read the fixed trace at that timing.
-	virtual void init() {}
-
-	// A thread worker that is called by creating a thread when you go_think()
-	// Override and use this.
-	virtual void thread_worker(size_t thread_id) = 0;
-
-	// Called back every callback_seconds [seconds] when go_think().
-	std::function<void()> callback_func;
-	uint64_t callback_seconds = 600;
-
-	// Set the number of times worker processes (calls Search::think()).
-	void set_loop_max(uint64_t loop_max_) { loop_max = loop_max_; }
-
-	// Get the value set by set_loop_max().
-	uint64_t get_loop_max() const { return loop_max; }
-
-	// [ASYNC] Take the value of the loop counter and add the loop counter after taking it out.
-	// If the loop counter has reached loop_max, return UINT64_MAX.
-	// If you want to generate a phase, you must call this function at the time of generating the phase,
-	// Please note that the number of generated phases and the value of the counter will not match.
-	uint64_t get_next_loop_count() {
-		std::unique_lock<std::mutex> lk(loop_mutex);
-		if (loop_count >= loop_max)
-			return UINT64_MAX;
-		return loop_count++;
-	}
-
-	// [ASYNC] For returning the processed number. Each time it is called, it returns a counter that is incremented.
-	uint64_t get_done_count() {
-		std::unique_lock<std::mutex> lk(loop_mutex);
-		return ++done_count;
-	}
-
-	// Mutex when worker thread accesses I/O
-	std::mutex io_mutex;
-
-protected:
-	// Random number generator body
-	AsyncPRNG prng;
-
-private:
-	// number of times worker processes (calls Search::think())
-	std::atomic<uint64_t> loop_max;
-	// number of times the worker has processed (calls Search::think())
-	std::atomic<uint64_t> loop_count;
-	// To return the number of times it has been processed.
-	std::atomic<uint64_t> done_count;
-
-	// Mutex when changing the variables in ↑
-	std::mutex loop_mutex;
-
-	// Thread end flag.
-	// vector<bool> may not be reflected properly when trying to rewrite from multiple threads...
-	typedef uint8_t Flag;
-	std::vector<Flag> thread_finished;
-
-};
-
-// Mechanism to process task during idle time.
-// master passes the task with push_task_async() whenever you like.
-// When slave executes on_idle() in its spare time, it retrieves one task and continues execution until there is no queue.
-// Convenient to use when you want to write MultiThink thread worker in master-slave method.
-struct TaskDispatcher
-{
-	typedef std::function<void(size_t /* thread_id */)> Task;
-
-	// slave calls this function during idle.
-	void on_idle(size_t thread_id)
-	{
-		Task task;
-		while ((task = get_task_async()) != nullptr)
-			task(thread_id);
-
-		sleep(1);
-	}
-
-	// Stack [ASYNC] task.
-	void push_task_async(Task task)
-	{
-		std::unique_lock<std::mutex> lk(task_mutex);
-		tasks.push_back(task);
-	}
-
-	// Allocate size array elements for task in advance.
-	void task_reserve(size_t size)
-	{
-		tasks.reserve(size);
-	}
-
-protected:
-	// set of tasks
-	std::vector<Task> tasks;
-
-	// Take out one [ASYNC] task. Called from on_idle().
-	Task get_task_async()
-	{
-		std::unique_lock<std::mutex> lk(task_mutex);
-		if (tasks.size() == 0)
-			return nullptr;
-		Task task = *tasks.rbegin();
-		tasks.pop_back();
-		return task;
-	}
-
-	// a mutex for accessing tasks
-	std::mutex task_mutex;
-};
-
-#endif // defined(EVAL_LEARN) && defined(YANEURAOU_2018_OTAFUKU_ENGINE)
-
-#endif
@@ -0,0 +1,43 @@
+#include "opening_book.h"
+
+#include <fstream>
+
+namespace Learner {
+
+    // Builds the book from `file`: every non-empty line becomes one entry,
+    // and the entries are then shuffled with `prng`. When the file cannot be
+    // opened the book stays empty and `prng` is not used.
+    EpdOpeningBook::EpdOpeningBook(const std::string& file, PRNG& prng) :
+        OpeningBook(file)
+    {
+        std::ifstream input(file);
+        if (input)
+        {
+            for (std::string entry; std::getline(input, entry); )
+                if (!entry.empty())
+                    fens.emplace_back(entry);
+
+            Algo::shuffle(fens, prng);
+        }
+    }
+
+    // True when `lhs` finishes with the characters of `end`
+    // (an empty `end` always matches).
+    static bool ends_with(const std::string& lhs, const std::string& end)
+    {
+        const std::size_t tail = end.size();
+
+        return tail <= lhs.size()
+            && lhs.compare(lhs.size() - tail, tail, end) == 0;
+    }
+
+    // Picks the book implementation from the file name: ".epd" files are
+    // loaded as an EpdOpeningBook, anything else gives a null pointer.
+    std::unique_ptr<OpeningBook> open_opening_book(const std::string& filename, PRNG& prng)
+    {
+        std::unique_ptr<OpeningBook> book;
+
+        const bool is_epd = ends_with(filename, ".epd");
+        if (is_epd)
+            book = std::make_unique<EpdOpeningBook>(filename, prng);
+
+        return book;
+    }
+
+}
@@ -0,0 +1,56 @@
+#ifndef LEARN_OPENING_BOOK_H
+#define LEARN_OPENING_BOOK_H
+
+#include "misc.h"
+#include "position.h"
+#include "thread.h"
+
+#include <vector>
+#include <random>
+#include <optional>
+#include <string>
+#include <cstdint>
+#include <memory>
+
+namespace Learner {
+
+    // Base class for opening books: an ordered list of FEN strings that is
+    // handed out one entry at a time, wrapping around at the end.
+    struct OpeningBook {
+
+        // Books are owned through std::unique_ptr<OpeningBook> (see
+        // open_opening_book()), so derived objects are destroyed through a
+        // base pointer. That is only well defined with a virtual destructor.
+        virtual ~OpeningBook() = default;
+
+        // Returns the current FEN and advances the cursor; after the last
+        // entry the cursor wraps back to the first one.
+        // Precondition: the book is not empty.
+        const std::string& next_fen()
+        {
+            assert(fens.size() > 0);
+
+            auto& fen = fens[current_index++];
+            if (current_index >= fens.size())
+                current_index = 0;
+
+            return fen;
+        }
+
+        // Number of entries in the book.
+        std::size_t size() const { return fens.size(); }
+
+        // The file name this book was constructed with.
+        const std::string& get_filename() const { return filename; }
+
+    protected:
+        // Only derived classes create books; they are responsible for
+        // filling `fens`.
+        OpeningBook(const std::string& file) :
+            filename(file),
+            current_index(0)
+        {
+        }
+
+
+        std::string filename;
+        std::vector<std::string> fens;
+        std::size_t current_index;
+    };
+
+    // Opening book read from a text file with one position per line.
+    struct EpdOpeningBook : OpeningBook {
+
+        // `file` is the path of the book; `prng` is used to shuffle the
+        // entries after loading.
+        EpdOpeningBook(const std::string& file, PRNG& prng);
+    };
+
+    // Opens the opening book stored in `filename`. Names ending in ".epd"
+    // produce an EpdOpeningBook (shuffled with `prng`); any other name
+    // yields nullptr.
+    std::unique_ptr<OpeningBook> open_opening_book(const std::string& filename, PRNG& prng);
+
+}
+
+#endif
@@ -0,0 +1,46 @@
+#ifndef _PACKED_SFEN_H_
+#define _PACKED_SFEN_H_
+
+#include <vector>
+#include <cstdint>
+
+namespace Learner {
+
+    // A position packed into 256 bits (32 bytes).
+    struct PackedSfen { std::uint8_t data[32]; };
+
+    // The packed record is a file format: its size must not depend on the
+    // compiler or platform.
+    static_assert(sizeof(PackedSfen) == 32, "PackedSfen must be exactly 32 bytes");
+
+    // One training record: a packed position together with its evaluation
+    // and game metadata.
+    // Writing different contents depending on build options would be a
+    // problem when reusing training data, so all of the following members
+    // are always written regardless of the options.
+    struct PackedSfenValue
+    {
+        // The position.
+        PackedSfen sfen;
+
+        // Evaluation value returned from Learner::search()
+        std::int16_t score;
+
+        // First move of the PV.
+        // Used when measuring the move match rate against the teacher.
+        std::uint16_t move;
+
+        // Ply of this position (presumably counted from the initial
+        // position of the game -- TODO confirm against the writer).
+        std::uint16_t gamePly;
+
+        // 1 if the side to move in this position ultimately wins the game,
+        // -1 if it loses, 0 if the game is drawn.
+        // Draws are only written by the training-data generation command
+        // gensfen when LEARN_GENSFEN_DRAW_RESULT is enabled.
+        std::int8_t game_result;
+
+        // Explicit padding: these files are exchanged with other people, so
+        // the record is padded to exactly 40 bytes in every environment.
+        std::uint8_t padding;
+
+        // 32 + 2 + 2 + 2 + 1 + 1 = 40bytes
+    };
+
+    // Enforce the layout promised above instead of only documenting it.
+    static_assert(sizeof(PackedSfenValue) == 40, "PackedSfenValue must be exactly 40 bytes");
+
+    // Phase array: PSVector stands for packed sfen vector.
+    using PSVector = std::vector<PackedSfenValue>;
+}
+#endif
@@ -0,0 +1,386 @@
+#include "sfen_packer.h"
+
+#include "packed_sfen.h"
+
+#include "misc.h"
+#include "position.h"
+
+#include <sstream>
+#include <fstream>
+#include <cstring> // std::memset()
+
+using namespace std;
+
+namespace Learner {
+
+    // Minimal bit-level reader/writer over a caller-supplied byte buffer.
+    // Bit k of the stream is bit (k & 7) of byte (k / 8), i.e. bits are
+    // stored least-significant first within each byte. The stream does not
+    // own the buffer and performs no bounds checking.
+    struct BitStream
+    {
+        // Attach the buffer to operate on and rewind the cursor.
+        // Writing only ever sets bits, so a buffer that is going to be
+        // written must already be cleared to 0.
+        void set_data(std::uint8_t* data_) { data = data_; reset(); }
+
+        // Get the pointer passed in set_data() (nullptr before the first call).
+        std::uint8_t* get_data() const { return data; }
+
+        // Get the cursor: the index of the next bit to read/write.
+        int get_cursor() const { return bit_cursor; }
+
+        // Rewind the cursor to the first bit.
+        void reset() { bit_cursor = 0; }
+
+        // Write 1 bit to the stream.
+        // If b is non-zero, write out 1. If 0, write 0.
+        void write_one_bit(int b)
+        {
+            if (b)
+                data[bit_cursor / 8] |= 1 << (bit_cursor & 7);
+
+            ++bit_cursor;
+        }
+
+        // Read 1 bit from the stream; returns 0 or 1.
+        int read_one_bit()
+        {
+            int b = (data[bit_cursor / 8] >> (bit_cursor & 7)) & 1;
+            ++bit_cursor;
+
+            return b;
+        }
+
+        // Write the n low-order bits of d, lowest bit first.
+        void write_n_bit(int d, int n)
+        {
+            for (int i = 0; i < n; ++i)
+                write_one_bit(d & (1 << i));
+        }
+
+        // Read n bits; the inverse of write_n_bit().
+        int read_n_bit(int n)
+        {
+            int result = 0;
+            for (int i = 0; i < n; ++i)
+                result |= read_one_bit() ? (1 << i) : 0;
+
+            return result;
+        }
+
+    private:
+        // Next bit position to read/write. Initialised so that a freshly
+        // constructed stream never exposes an indeterminate value.
+        int bit_cursor = 0;
+
+        // Buffer being read/written (not owned); null until set_data().
+        std::uint8_t* data = nullptr;
+    };
+
+    // Class for compressing/decompressing sfen
+    // sfen can be packed to 256bit (32bytes) by Huffman coding.
+    // This is proven by mini. The above is Huffman coding.
+    //
+    // Internal format, in the order pack() writes the fields:
+    // Side to move (White = 0, Black = 1) (1 bit)
+    // White King Position (6 bits)
+    // Black King Position (6 bits)
+    // Huffman Encoding of the board (king squares are skipped)
+    // Castling availability (1 bit x 4)
+    // En passant square (1 or 1 + 6 bits)
+    // Rule 50, low 6 bits (6 bits)
+    // Fullmove number, low 8 bits (8 bits)
+    // Fullmove number, high 8 bits (8 bits)
+    // Rule 50, bit 6 (1 bit)
+    //
+    // TODO(someone): Rename SFEN to FEN.
+    //
+    // Packs a Position into a 256-bit record through the embedded BitStream
+    // and provides the per-piece Huffman helpers used for both directions.
+    struct SfenPacker
+    {
+        // Encode pos into the 32 bytes that `data` points to.
+        // `data` must be set by the caller beforehand.
+        void pack(const Position& pos);
+
+        // Destination buffer for pack() (256bit = 32bytes); not owned.
+        // Defaults to null so that an unset buffer is a well-defined state
+        // instead of an indeterminate pointer.
+        uint8_t *data = nullptr; // uint8_t[32];
+
+        // Bit cursor over the buffer currently being written or read.
+        BitStream stream;
+
+        // Output one board piece (or an empty square) to stream.
+        void write_board_piece_to_stream(Piece pc);
+
+        // Read one board piece (or an empty square) from stream.
+        Piece read_board_piece_from_stream();
+    };
+
+
+    // Huffman coding
+    // * is simplified from mini encoding to make conversion easier.
+    //
+    // Huffman Encoding
+    //
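+    // Every piece code is 4 bits wide (see huffman_table); an empty square
+    // is a single 0 bit. Codes are written least-significant bit first.
+    // Empty  xxxxxxx0
+    // Pawn   xxxx0001 + 1 bit (Color)
+    // Knight xxxx0011 + 1 bit (Color)
+    // Bishop xxxx0101 + 1 bit (Color)
+    // Rook   xxxx0111 + 1 bit (Color)
+    // Queen  xxxx1001 + 1 bit (Color)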
+    //
+    // Worst case:
+    // - 32 empty squares    32 bits
+    // - 30 pieces           150 bits
+    // - 2 kings             12 bits
+    // - castling rights     4 bits
+    // - ep square           7 bits
+    // - rule50              7 bits
+    // - game ply            16 bits
+    // - TOTAL               228 bits < 256 bits
+
+    // One entry of the piece code table.
+    struct HuffmanedPiece
+    {
+        int code; // bit pattern of the code; emitted lowest bit first by write_n_bit()
+        int bits; // number of bits in `code`
+    };
+
+    // Code table indexed by piece type. There is no entry for the king:
+    // king squares are stored separately and the lookups below only cover
+    // the piece types before KING.
+    constexpr HuffmanedPiece huffman_table[] =
+    {
+        {0b0000,1}, // NO_PIECE
+        {0b0001,4}, // PAWN
+        {0b0011,4}, // KNIGHT
+        {0b0101,4}, // BISHOP
+        {0b0111,4}, // ROOK
+        {0b1001,4}, // QUEEN
+    };
+
+    // Pack the position into the 32-byte buffer that `data` points to.
+    // `data` must already point at the destination (see sfen_pack()).
+    // The order of the writes below IS the record format; do not reorder.
+    void SfenPacker::pack(const Position& pos)
+    {
+        // BitStream only ever sets bits, so start from an all-zero buffer.
+        memset(data, 0, 32 /* 256bit */);
+        stream.set_data(data);
+
+        // Side to move (1 bit).
+        stream.write_one_bit((int)(pos.side_to_move()));
+
+        // King squares, in the order given by Colors.
+        // White king and black king, 6 bits for each.
+        for(auto c: Colors)
+            stream.write_n_bit(pos.king_square(c), 6);
+
+        // Write the pieces on the board other than the kings,
+        // from rank 8 down to rank 1 and from file a to file h.
+        for (Rank r = RANK_8; r >= RANK_1; --r)
+        {
+            for (File f = FILE_A; f <= FILE_H; ++f)
+            {
+                Piece pc = pos.piece_on(make_square(f, r));
+                if (type_of(pc) == KING)
+                    continue;
+                write_board_piece_to_stream(pc);
+            }
+        }
+
+        // Castling rights, one bit each.
+        // TODO(someone): Support chess960.
+        stream.write_one_bit(pos.can_castle(WHITE_OO));
+        stream.write_one_bit(pos.can_castle(WHITE_OOO));
+        stream.write_one_bit(pos.can_castle(BLACK_OO));
+        stream.write_one_bit(pos.can_castle(BLACK_OOO));
+
+        // En passant: a single 0 bit when there is no ep square, otherwise
+        // a 1 bit followed by the 6-bit square.
+        if (pos.ep_square() == SQ_NONE) {
+            stream.write_one_bit(0);
+        }
+        else {
+            stream.write_one_bit(1);
+            stream.write_n_bit(static_cast<int>(pos.ep_square()), 6);
+        }
+
+        // Low 6 bits of the rule50 counter; bit 6 is appended at the very end.
+        stream.write_n_bit(pos.state()->rule50, 6);
+
+        // Fullmove number derived from the game ply; low 8 bits first.
+        const int fm = 1 + (pos.game_ply()-(pos.side_to_move() == BLACK)) / 2;
+        stream.write_n_bit(fm, 8);
+
+        // Write the high 8 bits of the fullmove number. This is a fix for
+        // the limited range of the 8-bit field written just above.
+        // This is backwards compatible.
+        stream.write_n_bit(fm >> 8, 8);
+
+        // Write the highest bit of rule50 at the end. This is a backwards
+        // compatible fix for rule50 having only 6 bits stored.
+        // This bit is just ignored by the old parsers.
+        stream.write_n_bit(pos.state()->rule50 >> 6, 1);
+
+        assert(stream.get_cursor() <= 256);
+    }
+
+    // Emit the code for one square: the table code of the piece type,
+    // followed by one colour bit unless the square is empty.
+    void SfenPacker::write_board_piece_to_stream(Piece pc)
+    {
+        const HuffmanedPiece& entry = huffman_table[type_of(pc)];
+        stream.write_n_bit(entry.code, entry.bits);
+
+        // An empty square carries no colour bit.
+        if (pc != NO_PIECE)
+            stream.write_one_bit(color_of(pc));
+    }
+
+    // Decode one square: accumulate bits (lowest first) until they match a
+    // table entry, then read the colour bit for anything but an empty square.
+    // A bit pattern that matches no entry is only caught by the assert.
+    Piece SfenPacker::read_board_piece_from_stream()
+    {
+        int code = 0;
+        int bits = 0;
+
+        for (;;)
+        {
+            code |= stream.read_one_bit() << bits;
+            ++bits;
+
+            assert(bits <= 6);
+
+            for (PieceType pt = NO_PIECE_TYPE; pt < KING; ++pt)
+            {
+                const HuffmanedPiece& entry = huffman_table[pt];
+                if (entry.code != code || entry.bits != bits)
+                    continue;
+
+                // Empty squares have no colour bit.
+                if (pt == NO_PIECE_TYPE)
+                    return NO_PIECE;
+
+                const Color c = (Color)stream.read_one_bit();
+                return make_piece(c, pt);
+            }
+        }
+    }
+
+    // Rebuild `pos` from the packed record `sfen`; the inverse of
+    // SfenPacker::pack(). `si` is zeroed and becomes the position's state,
+    // `th` is stored as the position's thread.
+    // Returns 0 on success, or 1 if the bit cursor is found past 256 bits
+    // right after a piece has been placed.
+    int set_from_packed_sfen(Position& pos, const PackedSfen& sfen, StateInfo* si, Thread* th)
+    {
+        SfenPacker packer;
+        auto& stream = packer.stream;
+
+        // TODO: separate streams for writing and reading. Here we actually have to
+        // const_cast which is not safe in the long run.
+        stream.set_data(const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(&sfen)));
+
+        pos.clear();
+        std::memset(si, 0, sizeof(StateInfo));
+        std::fill_n(&pos.pieceList[0][0], sizeof(pos.pieceList) / sizeof(Square), SQ_NONE);
+        pos.st = si;
+
+        // Active color
+        pos.sideToMove = (Color)stream.read_one_bit();
+
+        pos.pieceList[W_KING][0] = SQUARE_NB;
+        pos.pieceList[B_KING][0] = SQUARE_NB;
+
+        // First the king squares (6 bits each). The kings are parked
+        // directly on the board array so the placement loop below can
+        // recognise their squares.
+        for (auto c : Colors)
+            pos.board[stream.read_n_bit(6)] = make_piece(c, KING);
+
+        // Piece placement, in the same square order as pack():
+        // rank 8 down to rank 1, file a to file h.
+        for (Rank r = RANK_8; r >= RANK_1; --r)
+        {
+            for (File f = FILE_A; f <= FILE_H; ++f)
+            {
+                auto sq = make_square(f, r);
+
+                // A king may already be sitting on this square; king squares
+                // have no code in the stream.
+                Piece pc;
+                if (type_of(pos.board[sq]) != KING)
+                {
+                    assert(pos.board[sq] == NO_PIECE);
+                    pc = packer.read_board_piece_from_stream();
+                }
+                else
+                {
+                    pc = pos.board[sq];
+                    // Clear the square first, otherwise put_piece() would
+                    // trip its assert on the occupied square.
+                    pos.board[sq] = NO_PIECE;
+                }
+
+                // There may be no pieces, so skip in that case.
+                if (pc == NO_PIECE)
+                    continue;
+
+                pos.put_piece(Piece(pc), sq);
+
+                // Reading ran past the end of the 256-bit record.
+                if (stream.get_cursor()> 256)
+                    return 1;
+            }
+        }
+
+        // Castling availability, four bits in the order written by pack().
+        // For each set bit, scan inwards from the corner square until the
+        // rook of that colour is found.
+        // TODO(someone): Support chess960.
+        pos.st->castlingRights = 0;
+        if (stream.read_one_bit()) {
+            Square rsq;
+            for (rsq = relative_square(WHITE, SQ_H1); pos.piece_on(rsq) != W_ROOK; --rsq) {}
+            pos.set_castling_right(WHITE, rsq);
+        }
+        if (stream.read_one_bit()) {
+            Square rsq;
+            for (rsq = relative_square(WHITE, SQ_A1); pos.piece_on(rsq) != W_ROOK; ++rsq) {}
+            pos.set_castling_right(WHITE, rsq);
+        }
+        if (stream.read_one_bit()) {
+            Square rsq;
+            for (rsq = relative_square(BLACK, SQ_H1); pos.piece_on(rsq) != B_ROOK; --rsq) {}
+            pos.set_castling_right(BLACK, rsq);
+        }
+        if (stream.read_one_bit()) {
+            Square rsq;
+            for (rsq = relative_square(BLACK, SQ_A1); pos.piece_on(rsq) != B_ROOK; ++rsq) {}
+            pos.set_castling_right(BLACK, rsq);
+        }
+
+        // En passant square. Ignore if no pawn capture is possible
+        if (stream.read_one_bit()) {
+            Square ep_square = static_cast<Square>(stream.read_n_bit(6));
+            pos.st->epSquare = ep_square;
+
+            if (!(pos.attackers_to(pos.st->epSquare) & pos.pieces(pos.sideToMove, PAWN))
+                || !(pos.pieces(~pos.sideToMove, PAWN) & (pos.st->epSquare + pawn_push(~pos.sideToMove))))
+                pos.st->epSquare = SQ_NONE;
+        }
+        else {
+            pos.st->epSquare = SQ_NONE;
+        }
+
+        // Halfmove clock (low 6 bits; bit 6 follows further down)
+        pos.st->rule50 = stream.read_n_bit(6);
+
+        // Fullmove number (low 8 bits)
+        pos.gamePly = stream.read_n_bit(8);
+
+        // Read the high 8 bits of the fullmove number. This was added as a
+        // fix for the fullmove number having only 8 bits stored.
+        // In older entries these will just be zero bits.
+        pos.gamePly |= stream.read_n_bit(8) << 8;
+
+        // Read the highest bit of rule50. This was added as a fix for rule50
+        // counter having only 6 bits stored.
+        // In older entries this will just be a zero bit.
+        pos.st->rule50 |= stream.read_n_bit(1) << 6;
+
+        // Convert from fullmove starting from 1 to gamePly starting from 0,
+        // handle also common incorrect FEN with fullmove = 0.
+        pos.gamePly = std::max(2 * (pos.gamePly - 1), 0) + (pos.sideToMove == BLACK);
+
+        assert(stream.get_cursor() <= 256);
+
+        pos.chess960 = false;
+        pos.thisThread = th;
+        pos.set_state(pos.st);
+
+        assert(pos.pos_is_ok());
+
+        return 0;
+    }
+
+    // Encode `pos` into a freshly created 32-byte record and return it.
+    PackedSfen sfen_pack(Position& pos)
+    {
+        PackedSfen packed;
+
+        // Point the packer at the record; pack() clears it before writing.
+        SfenPacker packer;
+        packer.data = reinterpret_cast<uint8_t*>(&packed);
+        packer.pack(pos);
+
+        return packed;
+    }
+}
@@ -0,0 +1,20 @@
+#ifndef _SFEN_PACKER_H_
+#define _SFEN_PACKER_H_
+
+#include "types.h"
+
+#include "learn/packed_sfen.h"
+
+#include <cstdint>
+
+class Position;
+struct StateInfo;
+class Thread;
+
+namespace Learner {
+
+    // Rebuilds `pos` from the packed record `sfen`, using `si` as the
+    // position's state and `th` as its thread.
+    // Returns 0 on success and 1 if decoding ran past the 256-bit record.
+    int set_from_packed_sfen(Position& pos, const PackedSfen& sfen, StateInfo* si, Thread* th);
+
+    // Encodes `pos` into a 32-byte PackedSfen.
+    PackedSfen sfen_pack(Position& pos);
+}
+
+#endif
@@ -0,0 +1,365 @@
+#include "sfen_stream.h"
+
+#include "packed_sfen.h"
+
+#include "misc.h"
+
+#include <string>
+#include <vector>
+#include <deque>
+#include <memory>
+#include <mutex>
+#include <list>
+#include <atomic>
+#include <optional>
+#include <iostream>
+#include <cstdint>
+#include <thread>
+
+namespace Learner{
+
+    // How SfenReader walks its queue of input files.
+    enum struct SfenReaderMode
+    {
+        // Files are not put back on the queue once they are exhausted.
+        Sequential,
+        // A file that yielded at least one entry is appended to the queue
+        // again when it is exhausted (see file_read_worker()).
+        Cyclic
+    };
+
+    // Sfen reader
+    struct SfenReader
+    {
+        // Number of positions buffered by each thread: 10,000 positions (0.4M positions at 40HT).
+        static constexpr size_t DEFAULT_THREAD_BUFFER_SIZE = 10 * 1000;
+
+        // Buffer for reading files (If this is made larger,
+        // the shuffle becomes larger and the phases may vary.
+        // If it is too large, the memory consumption will increase.
+        // SFEN_READ_SIZE is a multiple of THREAD_BUFFER_SIZE.
+        static constexpr const size_t DEFAULT_SFEN_READ_SIZE = 1000 * 1000 * 10;
+
+        // Sets up the reader and immediately starts the background thread
+        // that runs file_read_worker().
+        // The PRNG is seeded from `seed` rather than std::random_device(),
+        // because the latter always yields the same integers on MinGW.
+        SfenReader(
+            const std::vector<std::string>& filenames_,
+            bool do_shuffle,
+            SfenReaderMode mode_,
+            int thread_num,
+            const std::string& seed,
+            size_t read_size = DEFAULT_SFEN_READ_SIZE,
+            size_t buffer_size = DEFAULT_THREAD_BUFFER_SIZE
+        ) :
+            filenames(filenames_.begin(), filenames_.end()),
+            mode(mode_),
+            sfen_read_size(read_size),
+            thread_buffer_size(buffer_size),
+            prng(seed)
+        {
+            // One buffer slot per consumer thread.
+            packed_sfens.resize(thread_num);
+
+            shuffle = do_shuffle;
+            total_read = 0;
+            stop_flag = false;
+            end_of_files = false;
+
+            // Start the worker last, once all state it reads is initialised.
+            file_worker_thread = std::thread([this] { file_read_worker(); });
+        }
+
+        // Asks the file worker to stop and waits for it to finish.
+        ~SfenReader()
+        {
+            stop_flag = true;
+
+            if (!file_worker_thread.joinable())
+                return;
+
+            file_worker_thread.join();
+        }
+
+        // Reads up to `count` positions through thread 0's buffer, for
+        // calculations such as mse. If reading fails part-way, an error is
+        // printed and the positions collected so far are returned.
+        PSVector read_for_mse(uint64_t count)
+        {
+            PSVector positions;
+            positions.reserve(count);
+
+            while (positions.size() < count)
+            {
+                PackedSfenValue entry;
+                if (read_to_thread_buffer(0, entry))
+                {
+                    positions.push_back(entry);
+                    continue;
+                }
+
+                std::cout << "ERROR (sfen_reader): Reading failed." << std::endl;
+                break;
+            }
+
+            return positions;
+        }
+
+        // Reads the whole file `file_name` and returns the entries that
+        // pass the filters:
+        //  - entries whose absolute score exceeds `eval_limit` are dropped;
+        //  - entries with game_result == 0 (draws) are dropped unless
+        //    `use_draw_games` is true.
+        // Returns an empty set (after printing an error) when the file
+        // cannot be opened.
+        PSVector read_validation_set(const std::string& file_name, int eval_limit, bool use_draw_games)
+        {
+            PSVector sfen_for_mse;
+
+            auto input = open_sfen_input_file(file_name);
+
+            // open_sfen_input_file() yields nullptr when the file cannot be
+            // opened (file_read_worker() checks for exactly that), so it
+            // must not be dereferenced blindly.
+            if (input == nullptr)
+            {
+                std::cout << "ERROR (sfen_reader): Could not open file: " << file_name << std::endl;
+                return sfen_for_mse;
+            }
+
+            while(!input->eof())
+            {
+                std::optional<PackedSfenValue> p_opt = input->next();
+
+                // No further entry could be read: stop.
+                if (!p_opt.has_value())
+                    break;
+
+                auto& p = *p_opt;
+
+                if (eval_limit < abs(p.score))
+                    continue;
+
+                if (!use_draw_games && p.game_result == 0)
+                    continue;
+
+                sfen_for_mse.push_back(p);
+            }
+
+            return sfen_for_mse;
+        }
+
+        // [ASYNC] Hands one position to thread `thread_id` through `ps`.
+        // Returns false when the thread's buffer is exhausted and could not
+        // be refilled.
+        bool read_to_thread_buffer(size_t thread_id, PackedSfenValue& ps)
+        {
+            auto& buffer = packed_sfens[thread_id];
+
+            // Refill when the thread has no buffer or has used it up;
+            // give up if refilling fails.
+            const bool needs_refill = (buffer == nullptr) || buffer->empty();
+            if (needs_refill && !read_to_thread_buffer_impl(thread_id))
+                return false;
+
+            // At this point the buffer holds at least one position:
+            // take it from the back.
+            ps = buffer->back();
+            buffer->pop_back();
+
+            // Release the buffer as soon as it has been used up.
+            if (buffer->empty())
+                buffer.reset();
+
+            return true;
+        }
+
+        // [ASYNC] Moves one buffer from the shared pool into the slot of
+        // thread `thread_id`, waiting for the file worker if the pool is
+        // empty. Returns false once the worker has reported the end of the
+        // input and the pool is empty.
+        bool read_to_thread_buffer_impl(size_t thread_id)
+        {
+            while (true)
+            {
+                // Sample the flag BEFORE looking at the pool. The worker
+                // sets end_of_files immediately before it returns, so when
+                // the flag was already set here the pool check below sees
+                // the final state of the pool. Checking the flag only
+                // afterwards (as before) could miss a buffer queued between
+                // the two checks and report "no more data" too early.
+                const bool no_more_input = end_of_files;
+
+                {
+                    std::unique_lock<std::mutex> lk(mutex);
+                    // If you can fill from the file buffer, that's fine.
+                    if (packed_sfens_pool.size() != 0)
+                    {
+                        // It seems that filling is possible, so fill and finish.
+
+                        packed_sfens[thread_id] = std::move(packed_sfens_pool.front());
+                        packed_sfens_pool.pop_front();
+
+                        total_read += thread_buffer_size;
+
+                        return true;
+                    }
+                }
+
+                // The pool is empty and the worker had already finished
+                // before we looked: nothing more will arrive.
+                if (no_more_input)
+                    return false;
+
+                // Waiting for file worker to fill packed_sfens_pool.
+                // The mutex isn't locked, so it should fill up soon.
+                // Poor man's condition variable.
+                sleep(1);
+            }
+
+        }
+
+        // Background worker: reads packed sfens from the queued files in
+        // batches of sfen_read_size entries, optionally shuffles each batch,
+        // splits it into chunks of thread_buffer_size entries and appends those
+        // chunks to packed_sfens_pool.
+        // Returns when stop_flag is set, or when no further file can be opened
+        // (end_of_files is set to true in that case).
+        void file_read_worker()
+        {
+            std::string currentFilename;
+            uint64_t numEntriesReadFromCurrentFile = 0;
+
+            // Advances to the next file in `filenames` that can be opened and is
+            // not already at EOF. Returns false once `filenames` is exhausted.
+            auto open_next_file = [&]() {
+                // Keep trying queued files until one is usable or none are left.
+                for(;;)
+                {
+                    sfen_input_stream.reset();
+
+                    if (filenames.empty())
+                        return false;
+
+                    // Get the next file name.
+                    currentFilename = filenames.front();
+                    filenames.pop_front();
+
+                    numEntriesReadFromCurrentFile = 0;
+
+                    sfen_input_stream = open_sfen_input_file(currentFilename);
+
+                    auto out = sync_region_cout.new_region();
+                    if (sfen_input_stream == nullptr)
+                    {
+                        // NOTE(review): open_sfen_input_file() returns nullptr when the
+                        // extension is not recognised, so this message may be misleading.
+                        out << "INFO (sfen_reader): File does not exist: " << currentFilename << '\n';
+                    }
+                    else
+                    {
+                        out << "INFO (sfen_reader): Opened file for reading: " << currentFilename << '\n';
+
+                        // The stream reports EOF right after opening when there is
+                        // nothing to read (e.g. the file is empty or was deleted).
+                        if (sfen_input_stream->eof())
+                        {
+                            out << "  - File empty, nothing to read.\n";
+                        }
+                        else
+                        {
+                            return true;
+                        }
+                    }
+                }
+            };
+
+            // Open the first file unless a stream is already open.
+            if (sfen_input_stream == nullptr && !open_next_file())
+            {
+                auto out = sync_region_cout.new_region();
+                out << "INFO (sfen_reader): End of files." << std::endl;
+                end_of_files = true;
+                return;
+            }
+
+            while (true)
+            {
+                // Wait until the pool holds fewer chunks than one full batch produces.
+                // NOTE(review): size() is read here without holding `mutex` while the
+                // pool is modified under the lock elsewhere; this looks like an
+                // unsynchronised access to a std::list - confirm whether it is safe.
+                while (!stop_flag && packed_sfens_pool.size() >= sfen_read_size / thread_buffer_size)
+                    sleep(100);
+
+                if (stop_flag)
+                    return;
+
+                PSVector sfens;
+                sfens.reserve(sfen_read_size);
+
+                // Read from the file into the file buffer.
+                while (sfens.size() < sfen_read_size)
+                {
+                    std::optional<PackedSfenValue> p = sfen_input_stream->next();
+                    if (p.has_value())
+                    {
+                        sfens.push_back(*p);
+                        ++numEntriesReadFromCurrentFile;
+                    }
+                    else
+                    {
+                        // Current file is exhausted.
+                        if (mode == SfenReaderMode::Cyclic
+                            && numEntriesReadFromCurrentFile > 0)
+                        {
+                            // The file contained data so we add it again to the end of the queue.
+                            filenames.emplace_back(currentFilename);
+                        }
+
+                        if(!open_next_file())
+                        {
+                            // There was no next file. Abort.
+                            // NOTE(review): entries already collected in `sfens` for this
+                            // incomplete batch are not handed to the pool before returning.
+                            auto out = sync_region_cout.new_region();
+                            out << "INFO (sfen_reader): End of files." << std::endl;
+                            end_of_files = true;
+                            return;
+                        }
+                    }
+                }
+
+                // Shuffle the positions that were just read.
+                if (shuffle)
+                {
+                    Algo::shuffle(sfens, prng);
+                }
+
+                // Split the batch into chunks of thread_buffer_size entries.
+                // sfen_read_size shall be a multiple of thread_buffer_size.
+                assert((sfen_read_size % thread_buffer_size) == 0);
+
+                auto size = size_t(sfen_read_size / thread_buffer_size);
+                std::vector<std::unique_ptr<PSVector>> buffers;
+                buffers.reserve(size);
+
+                for (size_t i = 0; i < size; ++i)
+                {
+                    // Each chunk is owned by a unique_ptr; ownership moves to
+                    // packed_sfens_pool below.
+                    auto buf = std::make_unique<PSVector>();
+                    buf->resize(thread_buffer_size);
+                    memcpy(
+                        buf->data(),
+                        &sfens[i * thread_buffer_size],
+                        sizeof(PackedSfenValue) * thread_buffer_size);
+
+                    buffers.emplace_back(std::move(buf));
+                }
+
+                {
+                    std::unique_lock<std::mutex> lk(mutex);
+
+                    // The mutex lock is required because the
+                    // contents of packed_sfens_pool are changed.
+
+                    for (auto& buf : buffers)
+                        packed_sfens_pool.emplace_back(std::move(buf));
+                }
+            }
+        }
+
+    protected:
+
+        // worker thread reading file in background
+        std::thread file_worker_thread;
+
+        // sfen files
+        std::deque<std::string> filenames;
+
+        std::atomic<bool> stop_flag;
+
+        // number of phases read (file to memory buffer)
+        std::atomic<uint64_t> total_read;
+
+        // Whether to shuffle each batch of positions after it has been read.
+        bool shuffle;
+
+        SfenReaderMode mode;
+
+        size_t sfen_read_size;
+        size_t thread_buffer_size;
+
+        // Random number to shuffle when reading the phase
+        PRNG prng;
+
+        // Set to true once there are no more files left to read.
+        std::atomic<bool> end_of_files;
+
+        // handle of sfen file
+        std::unique_ptr<BasicSfenInputStream> sfen_input_stream;
+
+        // Buffer of sfens for each thread.
+        // (Held through std::unique_ptr, so no manual delete is needed.)
+        std::vector<std::unique_ptr<PSVector>> packed_sfens;
+
+        // Mutex when accessing packed_sfens_pool
+        std::mutex mutex;
+
+        // pool of sfen. The worker thread read from the file is added here.
+        // Each worker thread fills its own packed_sfens[thread_id] from here.
+        // * Lock and access the mutex.
+        std::list<std::unique_ptr<PSVector>> packed_sfens_pool;
+    };
+}
@@ -0,0 +1,222 @@
+#ifndef _SFEN_STREAM_H_
+#define _SFEN_STREAM_H_
+
+#include "packed_sfen.h"
+
+#include "extra/nnue_data_binpack_format.h"
+
+#include <optional>
+#include <fstream>
+#include <string>
+#include <memory>
+
+namespace Learner {
+
+    // Selects which output stream implementation create_new_sfen_output()
+    // constructs when writing training data.
+    enum struct SfenOutputType
+    {
+        Bin,     // -> BinSfenOutputStream (".bin")
+        Binpack  // -> BinpackSfenOutputStream (".binpack")
+    };
+
+    // Returns true if `lhs` ends with the character sequence `end`.
+    // An empty `end` matches every string.
+    static bool ends_with(const std::string& lhs, const std::string& end)
+    {
+        if (end.size() > lhs.size()) return false;
+
+        return lhs.compare(lhs.size() - end.size(), end.size(), end) == 0;
+    }
+
+    // Returns true if `filename` ends with "." followed by `extension`
+    // (`extension` is given without the leading dot).
+    static bool has_extension(const std::string& filename, const std::string& extension)
+    {
+        return ends_with(filename, "." + extension);
+    }
+
+    // Returns `filename` unchanged when it already ends with "." + ext,
+    // otherwise returns filename + "." + ext.
+    // The dot is part of the check on purpose: a name that merely ends with the
+    // letters of the extension (e.g. "databin" for "bin") does not carry the
+    // extension yet and must still get it appended.
+    static std::string filename_with_extension(const std::string& filename, const std::string& ext)
+    {
+        if (has_extension(filename, ext))
+            return filename;
+
+        return filename + "." + ext;
+    }
+
+    // Abstract source of PackedSfenValue entries.
+    struct BasicSfenInputStream
+    {
+        // Returns the next entry, or std::nullopt when none could be read.
+        virtual std::optional<PackedSfenValue> next() = 0;
+
+        // Returns true once the stream is known to have nothing more to read.
+        virtual bool eof() const = 0;
+
+        virtual ~BasicSfenInputStream() = default;
+    };
+
+    // Reads PackedSfenValue records stored back to back in a ".bin" file.
+    struct BinSfenInputStream : BasicSfenInputStream
+    {
+        static constexpr auto openmode = std::ios::in | std::ios::binary;
+        static inline const std::string extension = "bin";
+
+        // Opens `filename` for binary reading.
+        // eof() is true straight away when the file could not be opened or
+        // holds no data at all, so a caller that checks eof() directly after
+        // opening can skip such files (same behaviour as the binpack reader).
+        BinSfenInputStream(std::string filename) :
+            m_stream(filename, openmode),
+            m_eof(!m_stream || m_stream.peek() == std::fstream::traits_type::eof())
+        {
+        }
+
+        // Reads one record. Returns std::nullopt and latches eof() when a
+        // complete record could not be read.
+        std::optional<PackedSfenValue> next() override
+        {
+            PackedSfenValue e;
+            if(m_stream.read(reinterpret_cast<char*>(&e), sizeof(PackedSfenValue)))
+            {
+                return e;
+            }
+            else
+            {
+                m_eof = true;
+                return std::nullopt;
+            }
+        }
+
+        bool eof() const override
+        {
+            return m_eof;
+        }
+
+        ~BinSfenInputStream() override {}
+
+    private:
+        // m_stream must stay declared before m_eof: m_eof is initialised from it.
+        std::fstream m_stream;
+        bool m_eof;
+    };
+
+    // Reads entries from a ".binpack" file and hands them out as
+    // PackedSfenValue.
+    struct BinpackSfenInputStream : BasicSfenInputStream
+    {
+        static constexpr auto openmode = std::ios::in | std::ios::binary;
+        static inline const std::string extension = "binpack";
+
+        // Opens `filename`; eof() is true right away if the reader has no entry.
+        BinpackSfenInputStream(std::string filename) :
+            m_stream(filename, openmode),
+            m_eof(!m_stream.hasNext())
+        {
+        }
+
+        // Returns the next entry converted to PackedSfenValue, or std::nullopt
+        // (latching eof()) when the reader has no further entry.
+        std::optional<PackedSfenValue> next() override
+        {
+            static_assert(sizeof(binpack::nodchip::PackedSfenValue) == sizeof(PackedSfenValue));
+
+            if (m_stream.hasNext())
+            {
+                auto entry = m_stream.next();
+                auto converted = binpack::trainingDataEntryToPackedSfenValue(entry);
+
+                // Both types have the same layout but are distinct types (one
+                // comes from the generic library), hence the byte-wise copy.
+                PackedSfenValue result;
+                std::memcpy(&result, &converted, sizeof(PackedSfenValue));
+
+                return result;
+            }
+
+            m_eof = true;
+            return std::nullopt;
+        }
+
+        bool eof() const override
+        {
+            return m_eof;
+        }
+
+        ~BinpackSfenInputStream() override = default;
+
+    private:
+        binpack::CompressedTrainingDataEntryReader m_stream;
+        bool m_eof;
+    };
+
+    // Abstract sink for PackedSfenValue entries.
+    struct BasicSfenOutputStream
+    {
+        // Writes all entries of `sfens` to the underlying output.
+        virtual void write(const PSVector& sfens) = 0;
+
+        virtual ~BasicSfenOutputStream() = default;
+    };
+
+    // Appends raw PackedSfenValue records to a ".bin" file.
+    struct BinSfenOutputStream : BasicSfenOutputStream
+    {
+        static constexpr auto openmode = std::ios::out | std::ios::binary | std::ios::app;
+        static inline const std::string extension = "bin";
+
+        // Opens the file in append mode; the extension is added to `filename`
+        // by filename_with_extension() when needed.
+        BinSfenOutputStream(std::string filename) :
+            m_stream(filename_with_extension(filename, extension), openmode)
+        {
+        }
+
+        // Writes the entries of `sfens` as one contiguous block of bytes.
+        void write(const PSVector& sfens) override
+        {
+            const char* const bytes = reinterpret_cast<const char*>(sfens.data());
+            const auto byte_count = sizeof(PackedSfenValue) * sfens.size();
+
+            m_stream.write(bytes, byte_count);
+        }
+
+        ~BinSfenOutputStream() override = default;
+
+    private:
+        std::fstream m_stream;
+    };
+
+    // Appends entries to a ".binpack" file through the binpack writer.
+    struct BinpackSfenOutputStream : BasicSfenOutputStream
+    {
+        static constexpr auto openmode = std::ios::out | std::ios::binary | std::ios::app;
+        static inline const std::string extension = "binpack";
+
+        // Opens the file in append mode; the extension is added to `filename`
+        // by filename_with_extension() when needed.
+        BinpackSfenOutputStream(std::string filename) :
+            m_stream(filename_with_extension(filename, extension), openmode)
+        {
+        }
+
+        // Converts every entry of `sfens` to a training data entry and adds it
+        // to the writer.
+        void write(const PSVector& sfens) override
+        {
+            static_assert(sizeof(binpack::nodchip::PackedSfenValue) == sizeof(PackedSfenValue));
+
+            for (const auto& source : sfens)
+            {
+                // The library uses a type that's different but layout-compatible,
+                // so the entry is copied byte-wise into the library's type.
+                binpack::nodchip::PackedSfenValue converted;
+                std::memcpy(&converted, &source, sizeof(binpack::nodchip::PackedSfenValue));
+
+                m_stream.addTrainingDataEntry(binpack::packedSfenValueToTrainingDataEntry(converted));
+            }
+        }
+
+        ~BinpackSfenOutputStream() override = default;
+
+    private:
+        binpack::CompressedTrainingDataEntryWriter m_stream;
+    };
+
+    // Creates the input stream matching the extension of `filename`
+    // (".bin" or ".binpack"). Returns nullptr for any other extension.
+    inline std::unique_ptr<BasicSfenInputStream> open_sfen_input_file(const std::string& filename)
+    {
+        std::unique_ptr<BasicSfenInputStream> stream;
+
+        if (has_extension(filename, BinSfenInputStream::extension))
+        {
+            stream = std::make_unique<BinSfenInputStream>(filename);
+        }
+        else if (has_extension(filename, BinpackSfenInputStream::extension))
+        {
+            stream = std::make_unique<BinpackSfenInputStream>(filename);
+        }
+
+        return stream;
+    }
+
+    // Creates the output stream for the explicitly requested format.
+    // Every SfenOutputType enumerator is handled; reaching the end indicates an
+    // invalid enum value (asserts in debug builds, returns nullptr otherwise).
+    inline std::unique_ptr<BasicSfenOutputStream> create_new_sfen_output(const std::string& filename, SfenOutputType sfen_output_type)
+    {
+        if (sfen_output_type == SfenOutputType::Bin)
+            return std::make_unique<BinSfenOutputStream>(filename);
+
+        if (sfen_output_type == SfenOutputType::Binpack)
+            return std::make_unique<BinpackSfenOutputStream>(filename);
+
+        assert(false);
+        return nullptr;
+    }
+
+    // Creates the output stream matching the extension of `filename`
+    // (".bin" or ".binpack"). Returns nullptr for any other extension.
+    inline std::unique_ptr<BasicSfenOutputStream> create_new_sfen_output(const std::string& filename)
+    {
+        std::unique_ptr<BasicSfenOutputStream> stream;
+
+        if (has_extension(filename, BinSfenOutputStream::extension))
+        {
+            stream = std::make_unique<BinSfenOutputStream>(filename);
+        }
+        else if (has_extension(filename, BinpackSfenOutputStream::extension))
+        {
+            stream = std::make_unique<BinpackSfenOutputStream>(filename);
+        }
+
+        return stream;
+    }
+}
+
+#endif
@@ -0,0 +1,206 @@
+#include "packed_sfen.h"
+#include "sfen_stream.h"
+
+#include "misc.h"
+
+#include "extra/nnue_data_binpack_format.h"
+
+#include "syzygy/tbprobe.h"
+
+#include <cstring>
+#include <filesystem>
+#include <fstream>
+#include <limits>
+#include <list>
+#include <memory>
+#include <optional>
+#include <shared_mutex>
+#include <thread>
+#include <atomic>
+
+using namespace std;
+
+namespace Learner {
+
+    // Helper class for exporting Sfen
+    struct SfenWriter
+    {
+        // Amount of sfens required to flush the buffer.
+        static constexpr size_t SFEN_WRITE_SIZE = 5000;
+
+        // Prepares one buffer slot per thread, opens the output file and starts
+        // the background writer thread.
+        //   filename_        : name of the output file
+        //   thread_num       : number of per-thread buffers; thread_id values
+        //                      passed to write()/flush() index into these
+        //   save_count       : number of sfens after which file_write_worker()
+        //                      switches to a new output file
+        //   sfen_output_type : format handed to create_new_sfen_output()
+        SfenWriter(string filename_, int thread_num, uint64_t save_count, SfenOutputType sfen_output_type)
+        {
+            sfen_buffers_pool.reserve((size_t)thread_num * 10);
+            sfen_buffers.resize(thread_num);
+
+            auto out = sync_region_cout.new_region();
+            out << "INFO (sfen_writer): Creating new data file at " << filename_ << endl;
+
+            sfen_format = sfen_output_type;
+            output_file_stream = create_new_sfen_output(filename_, sfen_format);
+            filename = filename_;
+            save_every = save_count;
+
+            finished = false;
+
+            // Started last: the worker uses the members initialised above.
+            file_worker_thread = std::thread([&] { this->file_write_worker(); });
+        }
+
+        // Hands the remaining per-thread buffers to the writer thread, tells it
+        // to finish, waits for it and then releases the output stream.
+        ~SfenWriter()
+        {
+            // Move whatever is left in the per-thread buffers into the pool.
+            flush();
+
+            // The worker loops until `finished` is set and the pool is empty,
+            // so the buffers flushed above are still written before it exits.
+            finished = true;
+            file_worker_thread.join();
+            output_file_stream.reset();
+
+#if !defined(NDEBUG)
+            {
+                // All buffers should be empty since file_worker_thread
+                // should have written everything before exiting.
+                for (const auto& p : sfen_buffers) { assert(p == nullptr); (void)p ; }
+                assert(sfen_buffers_pool.empty());
+            }
+#endif
+        }
+
+        // Appends `psv` to the buffer belonging to `thread_id`. Once that
+        // buffer holds SFEN_WRITE_SIZE entries it is moved to
+        // sfen_buffers_pool, from where the writer thread picks it up.
+        void write(size_t thread_id, const PackedSfenValue& psv)
+        {
+            auto& thread_buffer = sfen_buffers[thread_id];
+
+            // No buffer exists on the first call and right after the previous
+            // one was handed over to the pool, so allocate a fresh one.
+            if (thread_buffer == nullptr)
+            {
+                thread_buffer = std::make_unique<PSVector>();
+                thread_buffer->reserve(SFEN_WRITE_SIZE);
+            }
+
+            // The buffer is exclusive to this thread, so no lock is needed here.
+            thread_buffer->push_back(psv);
+
+            if (thread_buffer->size() < SFEN_WRITE_SIZE)
+                return;
+
+            // sfen_buffers_pool is shared among threads: guard the hand-over.
+            std::unique_lock<std::mutex> lk(mutex);
+            sfen_buffers_pool.emplace_back(std::move(thread_buffer));
+        }
+
+        // Flushes the buffer of every thread (see flush(size_t)).
+        void flush()
+        {
+            for (size_t thread_id = 0, count = sfen_buffers.size(); thread_id != count; ++thread_id)
+                flush(thread_id);
+        }
+
+        // Moves whatever is left in the buffer of `thread_id` to the pool of
+        // buffers waiting to be written to the file.
+        void flush(size_t thread_id)
+        {
+            std::unique_lock<std::mutex> lk(mutex);
+
+            auto& pending = sfen_buffers[thread_id];
+
+            // The slot may hold no buffer at all, so check before using it.
+            if (!pending || pending->empty())
+                return;
+
+            sfen_buffers_pool.emplace_back(std::move(pending));
+        }
+
+        // Body of the dedicated writer thread.
+        // Repeatedly takes all buffers out of sfen_buffers_pool and writes them
+        // to the current output stream. Runs until `finished` is set and the
+        // pool is empty. After every save_every written sfens it switches to a
+        // new output file named "<filename>_<n>".
+        void file_write_worker()
+        {
+            while (!finished || sfen_buffers_pool.size())
+            {
+                vector<std::unique_ptr<PSVector>> buffers;
+                {
+                    std::unique_lock<std::mutex> lk(mutex);
+
+                    // Under the lock, take the filled buffers and leave an
+                    // empty pool behind for the threads to fill.
+                    buffers = std::move(sfen_buffers_pool);
+                    sfen_buffers_pool = std::vector<std::unique_ptr<PSVector>>();
+                }
+
+                if (!buffers.size())
+                {
+                    // Poor man's condition variable.
+                    sleep(100);
+                }
+                else
+                {
+                    for (auto& buf : buffers)
+                    {
+                        output_file_stream->write(*buf);
+
+                        sfen_write_count += buf->size();
+
+                        // Add the processed number here, and if it exceeds save_every,
+                        // change the file name and reset this counter.
+                        sfen_write_count_current_file += buf->size();
+                        if (sfen_write_count_current_file >= save_every)
+                        {
+                            sfen_write_count_current_file = 0;
+
+                            // Sequential number attached to the file
+                            // NOTE(review): divides by save_every; a value of 0 passed
+                            // to the constructor would divide by zero here - confirm
+                            // that callers never pass 0.
+                            int n = (int)(sfen_write_count / save_every);
+
+                            // Switch to a new output file whose name carries the
+                            // sequential number. Replacing the stream releases the
+                            // previous one. (The output streams open in append mode.)
+                            string new_filename = filename + "_" + std::to_string(n);
+                            output_file_stream = create_new_sfen_output(new_filename, sfen_format);
+
+                            auto out = sync_region_cout.new_region();
+                            out << "INFO (sfen_writer): Creating new data file at " << new_filename << endl;
+                        }
+                    }
+                }
+            }
+        }
+
+    private:
+
+        std::unique_ptr<BasicSfenOutputStream> output_file_stream;
+
+        // A new output file is started after every save_every sfens have been written.
+        uint64_t save_every = std::numeric_limits<uint64_t>::max();
+
+        // File name passed in the constructor
+        std::string filename;
+
+        // Thread to write to the file
+        std::thread file_worker_thread;
+
+        // Flag that all threads have finished
+        atomic<bool> finished;
+
+        SfenOutputType sfen_format;
+
+        // buffer before writing to file
+        // sfen_buffers is the buffer for each thread
+        // sfen_buffers_pool is a buffer for writing.
+        // After loading the phase in the former buffer by SFEN_WRITE_SIZE,
+        // transfer it to the latter.
+        std::vector<std::unique_ptr<PSVector>> sfen_buffers;
+        std::vector<std::unique_ptr<PSVector>> sfen_buffers_pool;
+
+        // Mutex required to access sfen_buffers_pool
+        std::mutex mutex;
+
+        // Number of sfens written in total, and the
+        // number of sfens written in the current file.
+        uint64_t sfen_write_count = 0;
+        uint64_t sfen_write_count_current_file = 0;
+    };
+}
@@ -0,0 +1,242 @@
+#include "transform.h"
+
+#include "sfen_stream.h"
+#include "packed_sfen.h"
+
+#include "thread.h"
+#include "position.h"
+#include "evaluate.h"
+
+#include "nnue/evaluate_nnue.h"
+
+#include <string>
+#include <map>
+#include <iostream>
+#include <cmath>
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+
+namespace Learner
+{
+    using CommandFunc = void(*)(std::istringstream&);
+
+    // How nudge() moves the static evaluation towards the deep evaluation.
+    enum struct NudgedStaticMode
+    {
+        Absolute,    // move by at most absolute_nudge
+        Relative,    // move by at most a relative_nudge fraction of the static eval
+        Interpolate  // blend static and deep eval with weight interpolate_nudge
+    };
+
+    // Parameters of the nudged_static transform.
+    struct NudgedStaticParams
+    {
+        std::string input_filename = "in.binpack";
+        std::string output_filename = "out.binpack";
+        NudgedStaticMode mode = NudgedStaticMode::Absolute;
+        int absolute_nudge = 5;
+        float relative_nudge = 0.1f;
+        float interpolate_nudge = 0.1f;
+
+        // Clamps the nudges that must not be negative.
+        void enforce_constraints()
+        {
+            relative_nudge = std::max(relative_nudge, 0.0f);
+            absolute_nudge = std::max(absolute_nudge, 0);
+        }
+    };
+
+    // Returns the static evaluation nudged towards the deep evaluation
+    // according to params.mode, saturated to the range of std::int16_t.
+    //   Absolute    : static + clamp(deep - static, -absolute_nudge, +absolute_nudge)
+    //   Relative    : static * clamp(deep / static, 1 - relative_nudge, 1 + relative_nudge);
+    //                 a static evaluation of 0 yields 0
+    //   Interpolate : static * (1 - interpolate_nudge) + deep * interpolate_nudge
+    // Negative absolute/relative nudges are treated as 0.
+    [[nodiscard]] std::int16_t nudge(const NudgedStaticParams& params, std::int16_t static_eval_i16, std::int16_t deep_eval_i16)
+    {
+        auto saturate_i32_to_i16 = [](int v) {
+            return static_cast<std::int16_t>(
+                std::clamp(
+                    v,
+                    (int)std::numeric_limits<std::int16_t>::min(),
+                    (int)std::numeric_limits<std::int16_t>::max()
+                )
+            );
+        };
+
+        // Saturate while still in floating point: converting a float that is
+        // out of range for the target integer type (or NaN) is undefined.
+        auto saturate_f32_to_i16 = [](float v) {
+            if (std::isnan(v))
+                return std::int16_t{0};
+
+            return static_cast<std::int16_t>(
+                std::clamp(
+                    v,
+                    (float)std::numeric_limits<std::int16_t>::min(),
+                    (float)std::numeric_limits<std::int16_t>::max()
+                )
+            );
+        };
+
+        const int static_eval = static_eval_i16;
+        const int deep_eval = deep_eval_i16;
+
+        switch(params.mode)
+        {
+            case NudgedStaticMode::Absolute:
+            {
+                // std::clamp requires lo <= hi, so never use a negative nudge.
+                const int max_delta = std::max(params.absolute_nudge, 0);
+
+                return saturate_i32_to_i16(
+                    static_eval + std::clamp(deep_eval - static_eval, -max_delta, max_delta)
+                );
+            }
+
+            case NudgedStaticMode::Relative:
+            {
+                // Any relative change of 0 is 0. Returning early also avoids the
+                // division by zero below (0 / 0 would produce NaN).
+                if (static_eval == 0)
+                    return 0;
+
+                // std::clamp requires lo <= hi, so never use a negative nudge.
+                const float max_ratio_delta = std::max(params.relative_nudge, 0.0f);
+
+                return saturate_f32_to_i16(
+                    (float)static_eval * std::clamp(
+                        (float)deep_eval / (float)static_eval,
+                        (1.0f - max_ratio_delta),
+                        (1.0f + max_ratio_delta)
+                    )
+                );
+            }
+
+            case NudgedStaticMode::Interpolate:
+                return saturate_f32_to_i16(
+                    (float)static_eval * (1.0f - params.interpolate_nudge)
+                    + (float)deep_eval * params.interpolate_nudge
+                );
+
+            default:
+                assert(false);
+                return 0;
+        }
+    }
+
+    // Reads every position from params.input_filename, replaces its score by
+    // the static evaluation nudged towards the stored score (see nudge()) and
+    // writes the result to params.output_filename in batches.
+    // Prints an error and returns without processing anything when the type
+    // of the input or output file is not recognised.
+    void do_nudged_static(NudgedStaticParams& params)
+    {
+        // Validate the input before creating the output stream, so that an
+        // invalid input does not leave a newly created output file behind.
+        auto in = Learner::open_sfen_input_file(params.input_filename);
+        if (in == nullptr)
+        {
+            std::cerr << "Invalid input file type.\n";
+            return;
+        }
+
+        auto out = Learner::create_new_sfen_output(params.output_filename);
+        if (out == nullptr)
+        {
+            std::cerr << "Invalid output file type.\n";
+            return;
+        }
+
+        Thread* th = Threads.main();
+        Position& pos = th->rootPos;
+        StateInfo si;
+
+        PSVector buffer;
+        const uint64_t batch_size = 1'000'000;
+
+        buffer.reserve(batch_size);
+
+        uint64_t num_processed = 0;
+
+        // Writes the buffered positions, empties the buffer and reports progress.
+        auto write_batch = [&]() {
+            num_processed += buffer.size();
+
+            out->write(buffer);
+            buffer.clear();
+
+            std::cout << "Processed " << num_processed << " positions.\n";
+        };
+
+        for (;;)
+        {
+            auto v = in->next();
+            if (!v.has_value())
+                break;
+
+            auto& ps = v.value();
+
+            pos.set_from_packed_sfen(ps.sfen, &si, th);
+            auto static_eval = Eval::evaluate(pos);
+            auto deep_eval = ps.score;
+            ps.score = nudge(params, static_eval, deep_eval);
+
+            buffer.emplace_back(ps);
+            if (buffer.size() >= batch_size)
+                write_batch();
+        }
+
+        // Write the last, partially filled batch.
+        if (!buffer.empty())
+            write_batch();
+
+        std::cout << "Finished.\n";
+    }
+
+    // Parses the options of the nudged_static transform from `is`, prints the
+    // parameters that will be used and runs the transform.
+    // Recognised tokens: "absolute <int>", "relative <float>",
+    // "interpolate <float>", "input_file <name>", "output_file <name>".
+    // Any other token is ignored.
+    void nudged_static(std::istringstream& is)
+    {
+        NudgedStaticParams params{};
+
+        std::string token;
+        while (is >> token)
+        {
+            if (token == "absolute")
+            {
+                params.mode = NudgedStaticMode::Absolute;
+                is >> params.absolute_nudge;
+            }
+            else if (token == "relative")
+            {
+                params.mode = NudgedStaticMode::Relative;
+                is >> params.relative_nudge;
+            }
+            else if (token == "interpolate")
+            {
+                params.mode = NudgedStaticMode::Interpolate;
+                is >> params.interpolate_nudge;
+            }
+            else if (token == "input_file")
+                is >> params.input_filename;
+            else if (token == "output_file")
+                is >> params.output_filename;
+        }
+
+        // Apply the constraints before printing, so that the values shown are
+        // the ones the transform actually uses.
+        params.enforce_constraints();
+
+        std::cout << "Performing transform nudged_static with parameters:\n";
+        std::cout << "input_file          : " << params.input_filename << '\n';
+        std::cout << "output_file         : " << params.output_filename << '\n';
+        std::cout << "\n";
+        if (params.mode == NudgedStaticMode::Absolute)
+        {
+            std::cout << "mode                : absolute\n";
+            std::cout << "absolute_nudge      : " << params.absolute_nudge << '\n';
+        }
+        else if (params.mode == NudgedStaticMode::Relative)
+        {
+            std::cout << "mode                : relative\n";
+            std::cout << "relative_nudge      : " << params.relative_nudge << '\n';
+        }
+        else if (params.mode == NudgedStaticMode::Interpolate)
+        {
+            std::cout << "mode                : interpolate\n";
+            std::cout << "interpolate_nudge   : " << params.interpolate_nudge << '\n';
+        }
+        std::cout << '\n';
+
+        do_nudged_static(params);
+    }
+
+    // Initialises NNUE, reads a subcommand name from `is` and passes the rest
+    // of the stream to the matching handler. Prints a message and returns when
+    // the subcommand is not known.
+    void transform(std::istringstream& is)
+    {
+        Eval::NNUE::init();
+
+        std::string subcommand;
+        is >> subcommand;
+
+        // Table of the available subcommands and their handlers.
+        const std::map<std::string, CommandFunc> subcommands = {
+            { "nudged_static", &nudged_static }
+        };
+
+        const auto entry = subcommands.find(subcommand);
+        if (entry != subcommands.end())
+        {
+            entry->second(is);
+            return;
+        }
+
+        std::cout << "Invalid subcommand " << subcommand << ". Exiting...\n";
+    }
+
+}
@@ -0,0 +1,12 @@
+#ifndef _TRANSFORM_H_
+#define _TRANSFORM_H_
+
+#include <sstream>
+
+namespace Learner {
+
+    // Reads a subcommand name from `is` and forwards the remaining stream to
+    // the handler registered for it.
+    void transform(std::istringstream& is);
+
+}
+
+#endif
@@ -18,6 +18,8 @@

 #include <iostream>

+#include "nnue/evaluate_nnue.h"
+
 #include "bitboard.h"
 #include "endgame.h"
 #include "position.h"
@@ -35,6 +37,7 @@ int main(int argc, char* argv[]) {

  std::cout << engine_info() << std::endl;

+  CommandLine::init(argc, argv);
  UCI::init(Options);
  Tune::init();
  PSQT::init();
@@ -44,7 +47,7 @@ int main(int argc, char* argv[]) {
  Endgames::init();
  Threads.set(size_t(Options["Threads"]));
  Search::clear(); // After threads are up
-  Eval::init_NNUE();
+  Eval::NNUE::init();

  UCI::loop(argc, argv);

@@ -61,6 +61,8 @@ typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);

 using namespace std;

+SynchronizedRegionLogger sync_region_cout(std::cout);
+
 namespace {

 /// Version number. If Version is left empty, then compile date in the format
@@ -132,6 +134,7 @@ public:

 } // namespace

+
 /// engine_info() returns the full name of the current Stockfish version. This
 /// will be either "Stockfish <Tag> DD-MM-YY" (where DD-MM-YY is the date when
 /// the program was compiled) or "Stockfish <Version>", depending on whether
@@ -356,27 +359,11 @@ void std_aligned_free(void* ptr) {
 #endif
 }

-/// aligned_ttmem_alloc() will return suitably aligned memory, if possible using large pages.
-/// The returned pointer is the aligned one, while the mem argument is the one that needs
-/// to be passed to free. With c++17 some of this functionality could be simplified.
+/// aligned_large_pages_alloc() will return suitably aligned memory, if possible using large pages.

-#if defined(__linux__) && !defined(__ANDROID__)
+#if defined(_WIN32)

-void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
-
-  constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page sizes
-  size_t size = ((allocSize + alignment - 1) / alignment) * alignment; // multiple of alignment
-  if (posix_memalign(&mem, alignment, size))
-     mem = nullptr;
-#if defined(MADV_HUGEPAGE)
-  madvise(mem, allocSize, MADV_HUGEPAGE);
-#endif
-  return mem;
-}
-
-#elif defined(_WIN64)
-
-static void* aligned_ttmem_alloc_large_pages(size_t allocSize) {
+static void* aligned_large_pages_alloc_win(size_t allocSize) {

  HANDLE hProcessToken { };
  LUID luid { };
@@ -421,23 +408,10 @@ static void* aligned_ttmem_alloc_large_pages(size_t allocSize) {
  return mem;
 }

-void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
-
-  static bool firstCall = true;
+void* aligned_large_pages_alloc(size_t allocSize) {

  // Try to allocate large pages
-  mem = aligned_ttmem_alloc_large_pages(allocSize);
-
-  // Suppress info strings on the first call. The first call occurs before 'uci'
-  // is received and in that case this output confuses some GUIs.
-  if (!firstCall)
-  {
-      if (mem)
-          sync_cout << "info string Hash table allocation: Windows large pages used." << sync_endl;
-      else
-          sync_cout << "info string Hash table allocation: Windows large pages not used." << sync_endl;
-  }
-  firstCall = false;
+  void* mem = aligned_large_pages_alloc_win(allocSize);

  // Fall back to regular, page aligned, allocation if necessary
  if (!mem)
@@ -448,23 +422,31 @@ void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {

 #else

-void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
+void* aligned_large_pages_alloc(size_t allocSize) {

-  constexpr size_t alignment = 64; // assumed cache line size
-  size_t size = allocSize + alignment - 1; // allocate some extra space
-  mem = malloc(size);
-  void* ret = reinterpret_cast<void*>((uintptr_t(mem) + alignment - 1) & ~uintptr_t(alignment - 1));
-  return ret;
+#if defined(__linux__)
+  constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page size
+#else
+  constexpr size_t alignment = 4096; // assumed small page size
+#endif
+
+  // round up to multiples of alignment
+  size_t size = ((allocSize + alignment - 1) / alignment) * alignment;
+  void *mem = std_aligned_alloc(alignment, size);
+#if defined(MADV_HUGEPAGE)
+  madvise(mem, size, MADV_HUGEPAGE);
+#endif
+  return mem;
 }

 #endif


-/// aligned_ttmem_free() will free the previously allocated ttmem
+/// aligned_large_pages_free() will free the previously allocated ttmem

-#if defined(_WIN64)
+#if defined(_WIN32)

-void aligned_ttmem_free(void* mem) {
+void aligned_large_pages_free(void* mem) {

  if (mem && !VirtualFree(mem, 0, MEM_RELEASE))
  {
@@ -477,8 +459,8 @@ void aligned_ttmem_free(void* mem) {

 #else

-void aligned_ttmem_free(void *mem) {
-  free(mem);
+void aligned_large_pages_free(void *mem) {
+  std_aligned_free(mem);
 }

 #endif
@@ -590,6 +572,62 @@ void bindThisThread(size_t idx) {

 } // namespace WinProcGroup

+#ifdef _WIN32
+#include <direct.h>
+#define GETCWD _getcwd
+#else
+#include <unistd.h>
+#define GETCWD getcwd
+#endif
+
+namespace CommandLine {
+
+std::string argv0;            // path+name of the executable binary, as given by argv[0]
+std::string binaryDirectory;  // path of the executable directory
+std::string workingDirectory; // path of the working directory
+
+/// init() fills argv0, workingDirectory and binaryDirectory from the program
+/// arguments and the current working directory. binaryDirectory always ends
+/// with a path separator. If no program name is available (argc == 0 or a
+/// null argv[0]) argv0 is left empty and binaryDirectory falls back to the
+/// working directory.
+void init(int argc, char* argv[]) {
+    std::string pathSeparator;
+
+    // extract the path+name of the executable binary; argv[0] may be missing,
+    // and building a string from a null pointer would be undefined behaviour
+    if (argc > 0 && argv != nullptr && argv[0] != nullptr)
+        argv0 = argv[0];
+    else
+        argv0.clear();
+
+#ifdef _WIN32
+    pathSeparator = "\\";
+  #ifdef _MSC_VER
+    // Under windows argv[0] may not have the extension. Also _get_pgmptr() had
+    // issues in some windows 10 versions, so check returned values carefully.
+    char* pgmptr = nullptr;
+    if (!_get_pgmptr(&pgmptr) && pgmptr != nullptr && *pgmptr)
+        argv0 = pgmptr;
+  #endif
+#else
+    pathSeparator = "/";
+#endif
+
+    // extract the working directory
+    workingDirectory = "";
+    char buff[40000];
+    char* cwd = GETCWD(buff, 40000);
+    if (cwd)
+        workingDirectory = cwd;
+
+    // extract the binary directory path from argv0
+    binaryDirectory = argv0;
+    size_t pos = binaryDirectory.find_last_of("\\/");
+    if (pos == std::string::npos)
+        binaryDirectory = "." + pathSeparator;
+    else
+        binaryDirectory.resize(pos + 1);
+
+    // pattern replacement: "./" at the start of path is replaced by the working directory
+    if (binaryDirectory.find("." + pathSeparator) == 0)
+        binaryDirectory.replace(0, 1, workingDirectory);
+}
+
+
+} // namespace CommandLine
 // Returns a string that represents the current time. (Used when learning evaluation functions)
 std::string now_string()
 {
@@ -627,18 +665,27 @@ void* aligned_malloc(size_t size, size_t align)
    return p;
 }

+// Returns the size of the file behind `fs` in bytes, measured as the distance
+// between its begin and end offsets; the get position is sought back afterwards.
+std::uint64_t get_file_size(std::fstream& fs)
+{
+    auto pos = fs.tellg(); // remembered so it can be restored below
+    fs.seekg(0, fstream::end);
+    const uint64_t eofPos = (uint64_t)fs.tellg();
+    fs.clear(); // Otherwise, the next seek may fail.
+    fs.seekg(0, fstream::beg);
+    const uint64_t begPos = (uint64_t)fs.tellg();
+    fs.seekg(pos);
+    return eofPos - begPos;
+}
+
 int read_file_to_memory(std::string filename, std::function<void* (uint64_t)> callback_func)
 {
    fstream fs(filename, ios::in | ios::binary);
    if (fs.fail())
        return 1;

-    fs.seekg(0, fstream::end);
-    uint64_t eofPos = (uint64_t)fs.tellg();
-    fs.clear(); // Otherwise the next seek may fail.
-    fs.seekg(0, fstream::beg);
-    uint64_t begPos = (uint64_t)fs.tellg();
-    uint64_t file_size = eofPos - begPos;
+    const uint64_t file_size = get_file_size(fs);
    //std::cout << "filename = " << filename << " , file_size = " << file_size << endl;

    // I know the file size, so call callback_func to get a buffer for this,
@@ -687,66 +734,3 @@ int write_memory_to_file(std::string filename, void* ptr, uint64_t size)
    fs.close();
    return 0;
 }
-
-// ----------------------------
-//     mkdir wrapper
-// ----------------------------
-
-// Specify relative to the current folder. Returns 0 on success, non-zero on failure.
-// Create a folder. Japanese is not used.
-// In case of gcc under msys2 environment, folder creation fails with _wmkdir(). Cause unknown.
-// Use _mkdir() because there is no help for it.
-
-#if defined(_WIN32)
-// for Windows
-
-#if defined(_MSC_VER)
-#include <codecvt> // I need this because I want wstring to mkdir
-#include <locale> // This is required for wstring_convert.
-
-namespace Dependency {
-    int mkdir(std::string dir_name)
-    {
-        std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> cv;
-        return _wmkdir(cv.from_bytes(dir_name).c_str());
-        //	::CreateDirectory(cv.from_bytes(dir_name).c_str(),NULL);
-    }
-}
-
-#elif defined(__GNUC__) 
-
-#include <direct.h>
-namespace Dependency {
-    int mkdir(std::string dir_name)
-    {
-        return _mkdir(dir_name.c_str());
-    }
-}
-
-#endif
-#elif defined(__linux__)
-
-// In the linux environment, this symbol _LINUX is defined in the makefile.
-
-// mkdir implementation for Linux.
-#include "sys/stat.h"
-
-namespace Dependency {
-    int mkdir(std::string dir_name)
-    {
-        return ::mkdir(dir_name.c_str(), 0777);
-    }
-}
-#else
-
-// In order to judge whether it is a Linux environment, we have to divide the makefile..
-// The function to dig a folder on linux is good for the time being... Only used to save the evaluation function file...
-
-namespace Dependency {
-    int mkdir(std::string dir_name)
-    {
-        return 0;
-    }
-}
-
-#endif
@@ -19,6 +19,7 @@
 #ifndef MISC_H_INCLUDED
 #define MISC_H_INCLUDED

+#include <algorithm>
 #include <cassert>
 #include <chrono>
 #include <functional>
@@ -27,6 +28,12 @@
 #include <string>
 #include <vector>

+#include <cstdint>
+#include <cmath>
+#include <cctype>
+#include <sstream>
+#include <deque>
+
 #include "types.h"

 const std::string engine_info(bool to_uci = false);
@@ -35,8 +42,8 @@ void prefetch(void* addr);
 void start_logger(const std::string& fname);
 void* std_aligned_alloc(size_t alignment, size_t size);
 void std_aligned_free(void* ptr);
-void* aligned_ttmem_alloc(size_t size, void*& mem);
-void aligned_ttmem_free(void* mem); // nop if mem == nullptr
+void* aligned_large_pages_alloc(size_t size); // memory aligned by page size, min alignment: 4096 bytes
+void aligned_large_pages_free(void* mem); // nop if mem == nullptr

 void dbg_hit_on(bool b);
 void dbg_hit_on(bool c, bool b);
@@ -44,9 +51,7 @@ void dbg_mean_of(int v);
 void dbg_print();

 typedef std::chrono::milliseconds::rep TimePoint; // A value in milliseconds
-
 static_assert(sizeof(TimePoint) == sizeof(int64_t), "TimePoint should be 64 bits");
-
 inline TimePoint now() {
  return std::chrono::duration_cast<std::chrono::milliseconds>
        (std::chrono::steady_clock::now().time_since_epoch()).count();
@@ -67,6 +72,232 @@ std::ostream& operator<<(std::ostream&, SyncCout);
 #define sync_cout std::cout << IO_LOCK
 #define sync_endl std::endl << IO_UNLOCK

+// `ptr` must point to an array of size at least
+// `sizeof(T) * N + alignment` bytes, where `N` is the
+// number of elements in the array.
+template <uintptr_t Alignment, typename T>
+T* align_ptr_up(T* ptr)
+{
+  static_assert(alignof(T) < Alignment);
+
+  const uintptr_t ptrint = reinterpret_cast<uintptr_t>(reinterpret_cast<char*>(ptr));
+  return reinterpret_cast<T*>(reinterpret_cast<char*>((ptrint + (Alignment - 1)) / Alignment * Alignment));
+}
+
+// This logger allows printing many parts in a region atomically
+// but doesn't block the threads trying to append to other regions.
+// Instead, if some region tries to print while another region holds
+// the lock the messages are queued to be printed as soon as the
+// current region releases the lock.
+struct SynchronizedRegionLogger
+{
+  using RegionId = std::uint64_t;
+
+  struct Region
+  {
+    friend struct SynchronizedRegionLogger;
+
+    Region() :
+      logger(nullptr), region_id(0), is_held(false)
+    {
+    }
+
+    Region(const Region&) = delete;
+    Region& operator=(const Region&) = delete;
+
+    Region(Region&& other) :
+      logger(other.logger), region_id(other.region_id), is_held(other.is_held)
+    {
+      other.logger = nullptr;
+      other.is_held = false;
+    }
+
+    // Move-assign: gives up the region held so far (if any), then takes over `other`'s.
+    Region& operator=(Region&& other) {
+      if (this == &other)
+        return *this; // a self-move must not release and lose our own region
+
+      unlock();
+
+      logger = other.logger;
+      region_id = other.region_id;
+      is_held = other.is_held;
+      other.logger = nullptr; // same moved-from state as the move constructor
+      other.is_held = false;
+      return *this;
+    }
+
+    ~Region() { unlock(); }
+
+    void unlock() {
+      if (is_held) {
+        is_held = false;
+
+        if (logger != nullptr)
+          logger->release_region(region_id);
+      }
+    }
+
+    Region& operator << (std::ostream&(*pManip)(std::ostream&)) {
+      if (logger != nullptr)
+        logger->write(region_id, pManip);
+
+      return *this;
+    }
+
+    template <typename T>
+    Region& operator << (const T& value) {
+      if (logger != nullptr)
+        logger->write(region_id, value);
+
+      return *this;
+    }
+
+  private:
+    SynchronizedRegionLogger* logger;
+    RegionId region_id;
+    bool is_held;
+
+    Region(SynchronizedRegionLogger& log, RegionId id) :
+      logger(&log), region_id(id), is_held(true)
+    {
+    }
+  };
+
+private:
+  struct RegionBookkeeping
+  {
+    RegionBookkeeping(RegionId rid) : id(rid), is_held(true) {}
+
+    std::vector<std::string> pending_parts;
+    RegionId id;
+    bool is_held;
+  };
+
+  RegionId init_next_region()
+  {
+    static RegionId next_id = 0;
+
+    std::lock_guard lock(mutex);
+
+    const auto id = next_id++;
+    regions.emplace_back(id);
+
+    return id;
+  }
+
+  void write(RegionId id, std::ostream&(*pManip)(std::ostream&)) {
+    std::lock_guard lock(mutex);
+
+    if (regions.empty())
+      return;
+
+    if (id == regions.front().id) {
+      // We can just directly print to the output because
+      // we are at the front of the region queue.
+      out << *pManip;
+    } else {
+      // We have to schedule the print until previous regions are
+      // processed
+      auto* region = find_region_nolock(id);
+      if (region == nullptr)
+        return;
+
+      std::stringstream ss;
+      ss << *pManip;
+      region->pending_parts.emplace_back(std::move(ss).str());
+    }
+  }
+
+  template <typename T>
+  void write(RegionId id, const T& value) {
+    std::lock_guard lock(mutex);
+
+    if (regions.empty())
+      return;
+
+    if (id == regions.front().id) {
+      // We can just directly print to the output because
+      // we are at the front of the region queue.
+      out << value;
+    } else {
+      // We have to schedule the print until previous regions are
+      // processed
+      auto* region = find_region_nolock(id);
+      if (region == nullptr)
+        return;
+
+      std::stringstream ss;
+      ss << value;
+      region->pending_parts.emplace_back(std::move(ss).str());
+    }
+  }
+
+  std::ostream& out;
+
+  std::deque<RegionBookkeeping> regions;
+
+  std::mutex mutex;
+
+  RegionBookkeeping* find_region_nolock(RegionId id) {
+    // Linear search because the amount of concurrent regions should be small.
+    auto it = std::find_if(
+      regions.begin(),
+      regions.end(),
+      [id](const RegionBookkeeping& r) { return r.id == id; });
+
+    if (it == regions.end())
+      return nullptr;
+    else
+      return &*it;
+  }
+
+  void release_region(RegionId id) {
+    std::lock_guard lock(mutex);
+
+    auto* region = find_region_nolock(id);
+    if (region == nullptr)
+      return;
+
+    region->is_held = false;
+
+    process_backlog_nolock();
+  }
+
+  // Flushes queued output starting at the front region; the caller must hold `mutex`.
+  void process_backlog_nolock()
+  {
+    while(!regions.empty()) {
+      auto& region = regions.front();
+      for(auto& part : region.pending_parts)
+        out << part;
+      // Drop what was just printed so a later pass cannot print it a second time.
+      region.pending_parts.clear();
+
+      // A still-held front region keeps all later regions queued behind it.
+      if (region.is_held)
+        break;
+
+      regions.pop_front();
+    }
+  }
+
+public:
+
+  SynchronizedRegionLogger(std::ostream& s) :
+    out(s)
+  {
+  }
+
+  [[nodiscard]] Region new_region() {
+    const auto id = init_next_region();
+    return Region(*this, id);
+  }
+
+};
+
+extern SynchronizedRegionLogger sync_region_cout;
+

 /// xorshift64star Pseudo-Random Number Generator
 /// This class is based on original code written and dedicated
@@ -83,6 +314,19 @@ std::ostream& operator<<(std::ostream&, SyncCout);
 /// For further analysis see
 ///   <http://vigna.di.unimi.it/ftp/papers/xorshift.pdf>

+// Hashes `str` to 64 bits, one character at a time (xor, multiply, xor-shift).
+// `inline`, not `static`: a header function that not every includer calls.
+inline uint64_t string_hash(const std::string& str)
+{
+  uint64_t h = 525201411107845655ull;
+  for (auto c : str) {
+    h ^= static_cast<uint64_t>(c);
+    h *= 0x5bd1e9955bd1e995ull;
+    h ^= h >> 47;
+  }
+  return h;
+}
+
 class PRNG {

  uint64_t s;
@@ -94,7 +338,9 @@ class PRNG {
  }

 public:
+  PRNG() { set_seed_from_time(); }
  PRNG(uint64_t seed) : s(seed) { assert(seed); }
+  PRNG(const std::string& seed) { set_seed(seed); }

  template<typename T> T rand() { return T(rand64()); }

@@ -107,6 +353,40 @@ public:

  // Return the random seed used internally.
  uint64_t get_seed() const { return s; }
+
+  void set_seed(uint64_t seed) { s = seed; }
+
+  // Builds a new 64-bit seed by sampling one random bit of rand64() per round.
+  uint64_t next_random_seed() {
+    uint64_t seed = 0;
+    for (int i = 0; i < 64; ++i) {
+      const auto off = rand64() % 64;
+      // Shift before OR-ing so all 64 sampled bits survive (bit 0 included).
+      seed <<= 1;
+      seed |= (rand64() & (uint64_t(1) << off)) >> off;
+    }
+    return seed;
+  }
+
+  void set_seed_from_time()
+  {
+      set_seed(std::chrono::system_clock::now().time_since_epoch().count());
+  }
+
+  // Seeds from text: empty -> current time, all decimal digits -> that number,
+  // anything else -> string_hash(str).
+  void set_seed(const std::string& str)
+  {
+    // std::isdigit needs a value representable as unsigned char; passing a
+    // plain (possibly negative) char is undefined behaviour for non-ASCII bytes.
+    const auto is_digit = [](unsigned char c) { return std::isdigit(c) != 0; };
+    if (str.empty())
+      set_seed_from_time();
+    else if (std::all_of(str.begin(), str.end(), is_digit))
+      set_seed(std::stoull(str));
+    else
+      set_seed(string_hash(str));
+  }
 };

 // Display a random seed. (For debugging)
@@ -130,6 +410,74 @@ inline uint64_t mul_hi64(uint64_t a, uint64_t b) {
 #endif
 }

+// Fixed-size bitset backed by cache-line-aligned 64-bit buckets.
+// It can be accessed concurrently, provided the concurrent accesses
+// are performed on distinct instances of the underlying type. That
+// means the concurrent accesses need to be spaced by at least
+// bits_per_bucket bits, but at least best_concurrent_access_stride
+// bits is recommended to prevent false sharing.
+template <uint64_t N>
+struct LargeBitset
+{
+private:
+    constexpr static uint64_t cache_line_size = 64;
+
+public:
+    using UnderlyingType = uint64_t;
+
+    constexpr static uint64_t num_bits = N;
+    constexpr static uint64_t bits_per_bucket = 8 * sizeof(uint64_t);
+    constexpr static uint64_t num_buckets = (num_bits + bits_per_bucket - 1) / bits_per_bucket;
+    constexpr static uint64_t best_concurrent_access_stride = 8 * cache_line_size;
+
+    // Starts with every bit cleared.
+    LargeBitset()
+    {
+        std::fill(std::begin(bits), std::end(bits), 0);
+    }
+
+    // Sets bit `idx`; the caller must ensure idx < num_bits (no bounds check).
+    void set(uint64_t idx)
+    {
+        const uint64_t bucket = idx / bits_per_bucket;
+        const uint64_t bit = uint64_t(1) << (idx % bits_per_bucket);
+        bits[bucket] |= bit;
+    }
+
+    // Returns whether bit `idx` is set; `idx` is not bounds-checked either.
+    bool test(uint64_t idx) const
+    {
+        const uint64_t bucket = idx / bits_per_bucket;
+        const uint64_t bit = uint64_t(1) << (idx % bits_per_bucket);
+        return bits[bucket] & bit;
+    }
+
+    // Returns the number of set bits.
+    uint64_t count() const
+    {
+        uint64_t c = 0;
+        uint64_t i = 0;
+
+        // Four buckets per step; `i + 3 < num_buckets` cannot wrap when num_buckets < 3.
+        for (; i + 3 < num_buckets; i += 4)
+        {
+            c += popcount(bits[i+0]) + popcount(bits[i+1])
+               + popcount(bits[i+2]) + popcount(bits[i+3]);
+        }
+
+        // Remaining 0-3 buckets.
+        for (; i < num_buckets; ++i)
+        {
+            c += popcount(bits[i]);
+        }
+
+        return c;
+    }
+
+private:
+    alignas(cache_line_size) UnderlyingType bits[num_buckets];
+};
+
 /// Under Windows it is not possible for a process to run on more than one
 /// logical processor group. This usually means to be limited to use max 64
 /// cores. To overcome this, some special platform specific API should be
@@ -155,6 +503,7 @@ std::string now_string();
 // Also, if the buffer cannot be allocated in the callback function or if the file size is different from the expected file size,
 // Return nullptr. At this time, read_file_to_memory() interrupts reading and returns with an error.

+std::uint64_t get_file_size(std::fstream& fs);
 int read_file_to_memory(std::string filename, std::function<void* (uint64_t)> callback_func);
 int write_memory_to_file(std::string filename, void* ptr, uint64_t size);

@@ -165,7 +514,9 @@ int write_memory_to_file(std::string filename, void* ptr, uint64_t size);
 // async version of PRNG
 struct AsyncPRNG
 {
+  AsyncPRNG() : prng() { }
  AsyncPRNG(uint64_t seed) : prng(seed) { assert(seed); }
+  AsyncPRNG(const std::string& seed) : prng(seed) { }
  // [ASYNC] Extract one random number.
  template<typename T> T rand() {
    std::unique_lock<std::mutex> lk(mutex);
@@ -199,20 +550,51 @@ inline std::ostream& operator<<(std::ostream& os, AsyncPRNG& prng)

 // Mathematical function used for progress calculation and learning
 namespace Math {
-	// Sigmoid function
-	// = 1.0 / (1.0 + std::exp(-x))
-	double sigmoid(double x);
+    inline double sigmoid(double x)
+    {
+        return 1.0 / (1.0 + std::exp(-x));
+    }

-	// Differentiation of sigmoid function
-	// = sigmoid(x) * (1.0-sigmoid(x))
-	double dsigmoid(double x);
+    // Derivative of the sigmoid f(x) = 1 / (1 + exp(-x)).
+    inline double dsigmoid(double x)
+    {
+        // The first derivative can be written in terms of f itself:
+        //   f'(x) = df/dx = f(x) * (1 - f(x))
+        // so evaluate f (and its std::exp) once and reuse the value.
+        const double s = sigmoid(x);
+
+        return s * (1.0 - s);
+    }

 	// Clip v so that it fits between [lo,hi].
 	// * In Stockfish, this function is written in bitboard.h.
 	template<class T> constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
 		return v < lo ? lo : v > hi ? hi : v;
 	}
+}

+namespace Algo {
+    // Fisher-Yates
+    template <typename Rng, typename T>
+    void shuffle(std::vector<T>& buf, Rng&& prng)
+    {
+        const auto size = buf.size();
+        for (uint64_t i = 0; i < size; ++i)
+            std::swap(buf[i], buf[prng.rand(size - i) + i]);
+    }
+
+    // split the string
+    inline std::vector<std::string> split(const std::string& input, char delimiter) {
+        std::istringstream stream(input);
+        std::string field;
+        std::vector<std::string> fields;
+
+        while (std::getline(stream, field, delimiter)) {
+            fields.push_back(field);
+        }
+
+        return fields;
+    }
 }

 // --------------------
@@ -225,7 +607,7 @@ struct Path
 {
 	// Combine the path name and file name and return it.
 	// If the folder name is not an empty string, append it if there is no'/' or'\\' at the end.
-	static std::string Combine(const std::string& folder, const std::string& filename)
+	static std::string combine(const std::string& folder, const std::string& filename)
 	{
 		if (folder.length() >= 1 && *folder.rbegin() != '/' && *folder.rbegin() != '\\')
 			return folder + "/" + filename;
@@ -234,7 +616,7 @@ struct Path
 	}

 	// Get the file name part (excluding the folder name) from the full path expression.
-	static std::string GetFileName(const std::string& path)
+	static std::string get_file_name(const std::string& path)
 	{
 		// I don't know which "\" or "/" is used.
 		auto path_index1 = path.find_last_of("\\") + 1;
@@ -259,7 +641,24 @@ public:
  template <typename U> AlignedAllocator(const AlignedAllocator<U>&) {}

  T* allocate(std::size_t n) { return (T*)std_aligned_alloc(alignof(T), n * sizeof(T)); }
-  void deallocate(T* p, std::size_t n) { std_aligned_free(p); }
+  void deallocate(T* p, std::size_t ) { std_aligned_free(p); }
+};
+
+template <typename T>
+class CacheLineAlignedAllocator {
+public:
+    using value_type = T;
+
+    constexpr static uint64_t cache_line_size = 64;
+
+    CacheLineAlignedAllocator() {}
+    CacheLineAlignedAllocator(const CacheLineAlignedAllocator&) {}
+    CacheLineAlignedAllocator(CacheLineAlignedAllocator&&) {}
+
+    template <typename U> CacheLineAlignedAllocator(const CacheLineAlignedAllocator<U>&) {}
+
+    T* allocate(std::size_t n) { return (T*)std_aligned_alloc(cache_line_size, n * sizeof(T)); }
+    void deallocate(T* p, std::size_t) { std_aligned_free(p); }
 };

 // --------------------
@@ -273,11 +672,13 @@ namespace Dependency
  // So when calling getline() on fstream,
  // just write getline() instead of std::getline() and use this function.
  extern bool getline(std::ifstream& fs, std::string& s);
+}

-  // Create a folder.
-  // Specify relative to the current folder. Japanese is not used for dir_name.
-  // Returns 0 on success, non-zero on failure.
-  extern int mkdir(std::string dir_name);
+namespace CommandLine {
+  void init(int argc, char* argv[]);
+
+  extern std::string binaryDirectory;  // path of the executable directory
+  extern std::string workingDirectory; // path of the working directory
 }

 #endif // #ifndef MISC_H_INCLUDED
@@ -73,8 +73,9 @@ MovePicker::MovePicker(const Position& p, Move ttm, Depth d, const ButterflyHist
  assert(d <= 0);

  stage = (pos.checkers() ? EVASION_TT : QSEARCH_TT) +
-           !(ttm && (depth > DEPTH_QS_RECAPTURES || to_sq(ttm) == recaptureSquare)
-                 && pos.pseudo_legal(ttm));
+          !(   ttm
+            && (pos.checkers() || depth > DEPTH_QS_RECAPTURES || to_sq(ttm) == recaptureSquare)
+            && pos.pseudo_legal(ttm));
 }

 /// MovePicker constructor for ProbCut: we generate captures with SEE greater
@@ -0,0 +1,54 @@
+/*
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// Definition of input features and network structure used in NNUE evaluation function
+
+#ifndef NNUE_HALFKA_256X2_32_32_H_INCLUDED
+#define NNUE_HALFKA_256X2_32_32_H_INCLUDED
+
+#include "nnue/features/feature_set.h"
+#include "nnue/features/half_ka.h"
+
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"
+
+namespace Eval::NNUE {
+
+    // Input features used in evaluation function
+    using RawFeatures = Features::FeatureSet<
+        Features::HalfKA<Features::Side::kFriend>>;
+
+    // Number of input feature dimensions after conversion
+    constexpr IndexType kTransformedFeatureDimensions = 256;
+
+    namespace Layers {
+
+        // Define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+
+    }  // namespace Layers
+
+    using Network = Layers::OutputLayer;
+
+}  // namespace Eval::NNUE
+
+#endif // #ifndef NNUE_HALFKA_256X2_32_32_H_INCLUDED
@@ -1,42 +1,57 @@
+/*
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
 // Definition of input features and network structure used in NNUE evaluation function

-#ifndef HALFKP_CR_EP_256X2_32_32_H
-#define HALFKP_CR_EP_256X2_32_32_H
+#ifndef NNUE_HALFKP_CR_EP_256X2_32_32_H_INCLUDED
+#define NNUE_HALFKP_CR_EP_256X2_32_32_H_INCLUDED

-#include "../features/feature_set.h"
-#include "../features/half_kp.h"
-#include "../features/castling_right.h"
-#include "../features/enpassant.h"
+#include "nnue/features/feature_set.h"
+#include "nnue/features/half_kp.h"
+#include "nnue/features/castling_right.h"
+#include "nnue/features/enpassant.h"

-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"

-namespace Eval {
-
-  namespace NNUE {
+namespace Eval::NNUE {

    // Input features used in evaluation function
    using RawFeatures = Features::FeatureSet<
-      Features::HalfKP<Features::Side::kFriend>, Features::CastlingRight,
-      Features::EnPassant>;
+        Features::HalfKP<Features::Side::kFriend>, Features::CastlingRight,
+        Features::EnPassant>;

    // Number of input feature dimensions after conversion
    constexpr IndexType kTransformedFeatureDimensions = 256;

    namespace Layers {

-      // define network structure
-      using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-      using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-      using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-      using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+        // Define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;

    }  // namespace Layers

    using Network = Layers::OutputLayer;

-  }  // namespace NNUE
+}  // namespace Eval::NNUE

-}  // namespace Eval
-#endif // HALFKP_CR_EP_256X2_32_32_H
+#endif // #ifndef NNUE_HALFKP_CR_EP_256X2_32_32_H_INCLUDED
@@ -0,0 +1,37 @@
+// Definition of input features and network structure used in NNUE evaluation function
+
+#ifndef NNUE_HALFKP_CR_256X2_32_32_H_INCLUDED
+#define NNUE_HALFKP_CR_256X2_32_32_H_INCLUDED
+
+#include "nnue/features/feature_set.h"
+#include "nnue/features/half_kp.h"
+#include "nnue/features/castling_right.h"
+
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"
+
+namespace Eval::NNUE {
+
+    // Input features used in evaluation function
+    using RawFeatures = Features::FeatureSet<
+        Features::HalfKP<Features::Side::kFriend>, Features::CastlingRight>;
+
+    // Number of input feature dimensions after conversion
+    constexpr IndexType kTransformedFeatureDimensions = 256;
+
+    namespace Layers {
+
+        // Define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+
+    }  // namespace Layers
+
+    using Network = Layers::OutputLayer;
+
+}  // namespace Eval::NNUE
+
+#endif // #ifndef NNUE_HALFKP_CR_256X2_32_32_H_INCLUDED
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)

-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.

-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.

-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

 // Definition of input features and network structure used in NNUE evaluation function
@@ -21,33 +21,33 @@
 #ifndef NNUE_HALFKP_256X2_32_32_H_INCLUDED
 #define NNUE_HALFKP_256X2_32_32_H_INCLUDED

-#include "../features/feature_set.h"
-#include "../features/half_kp.h"
+#include "nnue/features/feature_set.h"
+#include "nnue/features/half_kp.h"

-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"

 namespace Eval::NNUE {

-// Input features used in evaluation function
-using RawFeatures = Features::FeatureSet<
-    Features::HalfKP<Features::Side::kFriend>>;
+    // Input features used in evaluation function
+    using RawFeatures = Features::FeatureSet<
+        Features::HalfKP<Features::Side::kFriend>>;

-// Number of input feature dimensions after conversion
-constexpr IndexType kTransformedFeatureDimensions = 256;
+    // Number of input feature dimensions after conversion
+    constexpr IndexType kTransformedFeatureDimensions = 256;

-namespace Layers {
+    namespace Layers {

-// Define network structure
-using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+        // Define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;

-}  // namespace Layers
+    }  // namespace Layers

-using Network = Layers::OutputLayer;
+    using Network = Layers::OutputLayer;

 }  // namespace Eval::NNUE

@@ -3,37 +3,33 @@
 #ifndef HALFKP_384X2_32_32_H
 #define HALFKP_384X2_32_32_H

-#include "../features/feature_set.h"
-#include "../features/half_kp.h"
+#include "nnue/features/feature_set.h"
+#include "nnue/features/half_kp.h"

-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"

-namespace Eval {
+namespace Eval::NNUE {

-namespace NNUE {
+    // Input features used in evaluation function
+    using RawFeatures = Features::FeatureSet<
+        Features::HalfKP<Features::Side::kFriend>>;

-// Input features used in evaluation function
-using RawFeatures = Features::FeatureSet<
-    Features::HalfKP<Features::Side::kFriend>>;
+    // Number of input feature dimensions after conversion
+    constexpr IndexType kTransformedFeatureDimensions = 384;

-// Number of input feature dimensions after conversion
-constexpr IndexType kTransformedFeatureDimensions = 384;
+    namespace Layers {

-namespace Layers {
+        // define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;

-// define network structure
-using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+    }  // namespace Layers

-}  // namespace Layers
+    using Network = Layers::OutputLayer;

-using Network = Layers::OutputLayer;
-
-}  // namespace NNUE
-
-}  // namespace Eval
+}  // namespace Eval::NNUE
 #endif // HALFKP_384X2_32_32_H
@@ -1,42 +0,0 @@
-// Definition of input features and network structure used in NNUE evaluation function
-
-#ifndef K_P_CR_EP_256X2_32_32_H
-#define K_P_CR_EP_256X2_32_32_H
-
-#include "../features/feature_set.h"
-#include "../features/k.h"
-#include "../features/p.h"
-#include "../features/castling_right.h"
-#include "../features/enpassant.h"
-
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
-
-namespace Eval {
-
-  namespace NNUE {
-
-    // Input features used in evaluation function
-    using RawFeatures = Features::FeatureSet<Features::K, Features::P,
-      Features::CastlingRight, Features::EnPassant>;
-
-    // Number of input feature dimensions after conversion
-    constexpr IndexType kTransformedFeatureDimensions = 256;
-
-    namespace Layers {
-
-      // define network structure
-      using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-      using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-      using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-      using OutputLayer = AffineTransform<HiddenLayer2, 1>;
-
-    }  // namespace Layers
-
-    using Network = Layers::OutputLayer;
-
-  }  // namespace NNUE
-
-}  // namespace Eval
-#endif // K_P_CR_EP_256X2_32_32_H
@@ -1,41 +0,0 @@
-// Definition of input features and network structure used in NNUE evaluation function
-
-#ifndef K_P_CR_256X2_32_32_H
-#define K_P_CR_256X2_32_32_H
-
-#include "../features/feature_set.h"
-#include "../features/k.h"
-#include "../features/p.h"
-#include "../features/castling_right.h"
-
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
-
-namespace Eval {
-
-  namespace NNUE {
-
-    // Input features used in evaluation function
-    using RawFeatures = Features::FeatureSet<Features::K, Features::P,
-      Features::CastlingRight>;
-
-    // Number of input feature dimensions after conversion
-    constexpr IndexType kTransformedFeatureDimensions = 256;
-
-    namespace Layers {
-
-      // define network structure
-      using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-      using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-      using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-      using OutputLayer = AffineTransform<HiddenLayer2, 1>;
-
-    }  // namespace Layers
-
-    using Network = Layers::OutputLayer;
-
-  }  // namespace NNUE
-
-}  // namespace Eval
-#endif // K_P_CR_256X2_32_32_H
@@ -1,38 +0,0 @@
-// Definition of input features and network structure used in NNUE evaluation function
-#ifndef K_P_256X2_32_32_H
-#define K_P_256X2_32_32_H
-
-#include "../features/feature_set.h"
-#include "../features/k.h"
-#include "../features/p.h"
-
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
-
-namespace Eval {
-
-namespace NNUE {
-
-// Input features used in evaluation function
-using RawFeatures = Features::FeatureSet<Features::K, Features::P>;
-
-// Number of input feature dimensions after conversion
-constexpr IndexType kTransformedFeatureDimensions = 256;
-
-namespace Layers {
-
-// define network structure
-using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-using OutputLayer = AffineTransform<HiddenLayer2, 1>;
-
-}  // namespace Layers
-
-using Network = Layers::OutputLayer;
-
-}  // namespace NNUE
-
-}  // namespace Eval
-#endif // K_P_256X2_32_32_H
@@ -18,20 +18,29 @@

 // Code for calculating NNUE evaluation function

-#include <fstream>
+#include "evaluate_nnue.h"
+
+#include "position.h"
+#include "misc.h"
+#include "uci.h"
+#include "types.h"
+
 #include <iostream>
+#include <string>
+#include <fstream>
 #include <set>

 #include "../evaluate.h"
 #include "../position.h"
 #include "../misc.h"
 #include "../uci.h"
+#include "../types.h"

 #include "evaluate_nnue.h"

 namespace Eval::NNUE {

-  uint32_t kpp_board_index[PIECE_NB][COLOR_NB] = {
+  const uint32_t kpp_board_index[PIECE_NB][COLOR_NB] = {
   // convention: W - us, B - them
   // viewed from other side, W and B are reversed
      { PS_NONE,     PS_NONE     },
@@ -53,7 +62,7 @@ namespace Eval::NNUE {
  };

  // Input feature converter
-  AlignedPtr<FeatureTransformer> feature_transformer;
+  LargePagePtr<FeatureTransformer> feature_transformer;

  // Evaluation function
  AlignedPtr<Network> network;
@@ -65,50 +74,77 @@ namespace Eval::NNUE {
  std::string savedfileName = "nn.bin";

  // Get a string that represents the structure of the evaluation function
-  std::string GetArchitectureString() {
-    return "Features=" + FeatureTransformer::GetStructureString() +
-      ",Network=" + Network::GetStructureString();
+  std::string get_architecture_string() {
+    return "Features=" + FeatureTransformer::get_structure_string() +
+        ",Network=" + Network::get_structure_string();
  }

+  std::string get_layers_info() {
+    return
+        FeatureTransformer::get_layers_info()  // feature-transformer description comes first
+        + '\n' + Network::get_layers_info();   // then the network's description after a newline
+  }
+
+  UseNNUEMode useNNUE;
+  std::string eval_file_loaded = "None";
+
  namespace Detail {

  // Initialize the evaluation function parameters
  template <typename T>
-  void Initialize(AlignedPtr<T>& pointer) {
+  void initialize(AlignedPtr<T>& pointer) {

    pointer.reset(reinterpret_cast<T*>(std_aligned_alloc(alignof(T), sizeof(T))));
    std::memset(pointer.get(), 0, sizeof(T));
  }

+  template <typename T>
+  void initialize(LargePagePtr<T>& pointer) {
+    // LargePagePtr overload: take storage for one T from aligned_large_pages_alloc() and zero-fill it.
+    static_assert(alignof(T) <= 4096, "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
+    pointer.reset(reinterpret_cast<T*>(aligned_large_pages_alloc(sizeof(T))));
+    std::memset(pointer.get(), 0, sizeof(T));  // NOTE(review): the allocation result is not null-checked before this write
+  }
+
  // Read evaluation function parameters
  template <typename T>
-  bool ReadParameters(std::istream& stream, const AlignedPtr<T>& pointer) {
+  bool ReadParameters(std::istream& stream, T& reference) {

    std::uint32_t header;
    header = read_little_endian<std::uint32_t>(stream);
    if (!stream || header != T::GetHashValue()) return false;
-    return pointer->ReadParameters(stream);
+    return reference.ReadParameters(stream);
  }

  // write evaluation function parameters
  template <typename T>
  bool WriteParameters(std::ostream& stream, const AlignedPtr<T>& pointer) {
    constexpr std::uint32_t header = T::GetHashValue();
+
    stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
+
    return pointer->WriteParameters(stream);
  }

+  template <typename T>
+  bool WriteParameters(std::ostream& stream, const LargePagePtr<T>& pointer) {
+    constexpr std::uint32_t header = T::GetHashValue();
+    // The hash goes first; Detail::ReadParameters compares it with T::GetHashValue() before reading the body.
+    stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
+    // NOTE(review): header is written as raw host-order bytes while the reader uses read_little_endian — confirm for big-endian targets.
+    return pointer->WriteParameters(stream);
+  }
  }  // namespace Detail

  // Initialize the evaluation function parameters
-  void Initialize() {
+  void initialize() {

-    Detail::Initialize(feature_transformer);
-    Detail::Initialize(network);
+    Detail::initialize(feature_transformer);
+    Detail::initialize(network);
  }

  // Read network header
-  bool ReadHeader(std::istream& stream, std::uint32_t* hash_value, std::string* architecture)
+  bool read_header(std::istream& stream, std::uint32_t* hash_value, std::string* architecture)
  {
    std::uint32_t version, size;

@@ -122,13 +158,17 @@ namespace Eval::NNUE {
  }

  // write the header
-  bool WriteHeader(std::ostream& stream,
+  bool write_header(std::ostream& stream,
    std::uint32_t hash_value, const std::string& architecture) {
+
    stream.write(reinterpret_cast<const char*>(&kVersion), sizeof(kVersion));
    stream.write(reinterpret_cast<const char*>(&hash_value), sizeof(hash_value));
+
    const std::uint32_t size = static_cast<std::uint32_t>(architecture.size());
+
    stream.write(reinterpret_cast<const char*>(&size), sizeof(size));
    stream.write(architecture.data(), size);
+
    return !stream.fail();
  }

@@ -137,81 +177,176 @@ namespace Eval::NNUE {

    std::uint32_t hash_value;
    std::string architecture;
-    if (!ReadHeader(stream, &hash_value, &architecture)) return false;
+    if (!read_header(stream, &hash_value, &architecture)) return false;
    if (hash_value != kHashValue) return false;
-    if (!Detail::ReadParameters(stream, feature_transformer)) return false;
-    if (!Detail::ReadParameters(stream, network)) return false;
+    if (!Detail::ReadParameters(stream, *feature_transformer)) return false;
+    if (!Detail::ReadParameters(stream, *network)) return false;
    return stream && stream.peek() == std::ios::traits_type::eof();
  }

  // write evaluation function parameters
  bool WriteParameters(std::ostream& stream) {
-    if (!WriteHeader(stream, kHashValue, GetArchitectureString())) return false;
-    if (!Detail::WriteParameters(stream, feature_transformer)) return false;
-    if (!Detail::WriteParameters(stream, network)) return false;
+
+    if (!write_header(stream, kHashValue, get_architecture_string()))
+        return false;
+
+    if (!Detail::WriteParameters(stream, feature_transformer))
+        return false;
+
+    if (!Detail::WriteParameters(stream, network))
+        return false;
+
    return !stream.fail();
-  }
-
-  // Proceed with the difference calculation if possible
-  static void UpdateAccumulatorIfPossible(const Position& pos) {
-
-    feature_transformer->UpdateAccumulatorIfPossible(pos);
-  }
-
-  // Calculate the evaluation value
-  static Value ComputeScore(const Position& pos, bool refresh) {
-
-    auto& accumulator = pos.state()->accumulator;
-    if (!refresh && accumulator.computed_score) {
-      return accumulator.score;
-    }
-
-    alignas(kCacheLineSize) TransformedFeatureType
-        transformed_features[FeatureTransformer::kBufferSize];
-    feature_transformer->Transform(pos, transformed_features, refresh);
-    alignas(kCacheLineSize) char buffer[Network::kBufferSize];
-    const auto output = network->Propagate(transformed_features, buffer);
-
-    auto score = static_cast<Value>(output[0] / FV_SCALE);
-
-    accumulator.score = score;
-    accumulator.computed_score = true;
-    return accumulator.score;
-  }
-
-  // Load the evaluation function file
-  bool load_eval_file(const std::string& evalFile) {
-
-    Initialize();
-
-    if (Options["SkipLoadingEval"])
-    {
-      std::cout << "info string SkipLoadingEval set to true, Net not loaded!" << std::endl;
-      return true;
-    }
-
-    fileName = evalFile;
-
-    std::ifstream stream(evalFile, std::ios::binary);
-
-    const bool result = ReadParameters(stream);
-
-    return result;
-  }
+}

  // Evaluation function. Perform differential calculation.
  Value evaluate(const Position& pos) {
-    return ComputeScore(pos, false);
+
+    // We manually align the arrays on the stack because with gcc < 9.3
+    // overaligning stack variables with alignas() doesn't work correctly.
+
+    constexpr uint64_t alignment = kCacheLineSize;
+
+#if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN)
+    TransformedFeatureType transformed_features_unaligned[
+      FeatureTransformer::kBufferSize + alignment / sizeof(TransformedFeatureType)];
+    char buffer_unaligned[Network::kBufferSize + alignment];
+
+    auto* transformed_features = align_ptr_up<alignment>(&transformed_features_unaligned[0]);
+    auto* buffer = align_ptr_up<alignment>(&buffer_unaligned[0]);
+#else
+    alignas(alignment)
+      TransformedFeatureType transformed_features[FeatureTransformer::kBufferSize];
+    alignas(alignment) char buffer[Network::kBufferSize];
+#endif
+
+    ASSERT_ALIGNED(transformed_features, alignment);
+    ASSERT_ALIGNED(buffer, alignment);
+
+    feature_transformer->Transform(pos, transformed_features);
+    const auto output = network->Propagate(transformed_features, buffer);
+
+    return static_cast<Value>(output[0] / FV_SCALE);
  }

-  // Evaluation function. Perform full calculation.
-  Value compute_eval(const Position& pos) {
-    return ComputeScore(pos, true);
+  // Load a net from any std::istream (file or memory); returns whether ReadParameters() accepted it
+  bool load_eval(std::string name, std::istream& stream) {
+    // initialize() re-allocates and zeroes both parts and fileName is overwritten first, so a failed load keeps neither.
+    initialize();
+    fileName = name;
+    return ReadParameters(stream);
+}
+
+// Translate the text of the "Use NNUE" option into a UseNNUEMode.
+// Any value other than the three known spellings falls back to False.
+static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
+{
+  if (mode == "false") return UseNNUEMode::False;
+  if (mode == "true")  return UseNNUEMode::True;
+  if (mode == "pure")  return UseNNUEMode::Pure;
+
+  // Unrecognised text: behave as if NNUE were switched off.
+  return UseNNUEMode::False;
+}
+
+void init() {
+
+  useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
+
+  if (Options["SkipLoadingEval"] || useNNUE == UseNNUEMode::False)
+  {
+    eval_file_loaded.clear();
+    return;
  }

-  // Proceed with the difference calculation if possible
-  void update_eval(const Position& pos) {
-    UpdateAccumulatorIfPossible(pos);
+  std::string eval_file = std::string(Options["EvalFile"]);
+
+#if defined(DEFAULT_NNUE_DIRECTORY)
+#define stringify2(x) #x
+#define stringify(x) stringify2(x)
+  std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory , stringify(DEFAULT_NNUE_DIRECTORY) };
+#else
+  std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory };
+#endif
+
+  for (std::string directory : dirs)
+  {
+    if (eval_file_loaded != eval_file)
+    {
+      std::ifstream stream(directory + eval_file, std::ios::binary);
+      if (load_eval(eval_file, stream))
+      {
+        sync_cout << "info string Loaded eval file " << directory + eval_file << sync_endl;
+        eval_file_loaded = eval_file;
+      }
+      else
+      {
+        sync_cout << "info string ERROR: failed to load eval file " << directory + eval_file << sync_endl;
+        eval_file_loaded.clear();
+      }
+    }
  }

+#undef stringify2
+#undef stringify
+}
+
+/// verify_eval_file_loaded() exits with EXIT_FAILURE unless NNUE is off or the net named by EvalFile is the one loaded
+void verify_eval_file_loaded() {
+
+  std::string eval_file = std::string(Options["EvalFile"]);
+  // init() sets eval_file_loaded to the option value only after that file was read successfully
+  if (useNNUE != UseNNUEMode::False && eval_file_loaded != eval_file)
+  {
+    UCI::OptionsMap defaults;
+    UCI::init(defaults);
+    // 'defaults' supplies the EvalFile name used in the download hint (msg4); presumably the built-in default
+    std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
+    std::string msg2 = "The option is set to true, but the network file " + eval_file + " was not loaded successfully.";
+    std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
+    std::string msg4 = "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/" + std::string(defaults["EvalFile"]);
+    std::string msg5 = "The engine will be terminated now.";
+
+    sync_cout << "info string ERROR: " << msg1 << sync_endl;
+    sync_cout << "info string ERROR: " << msg2 << sync_endl;
+    sync_cout << "info string ERROR: " << msg3 << sync_endl;
+    sync_cout << "info string ERROR: " << msg4 << sync_endl;
+    sync_cout << "info string ERROR: " << msg5 << sync_endl;
+
+    std::exit(EXIT_FAILURE);
+  }
+
+  if (useNNUE != UseNNUEMode::False)
+    sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
+  else
+    sync_cout << "info string classical evaluation enabled" << sync_endl;
+}
+
+/// verify_any_net_loaded() only requires that some net is loaded; in training the eval file is overridden, so no name is compared
+void verify_any_net_loaded() {
+  // An empty eval_file_loaded is what init() leaves behind after a failed or skipped load
+  if (!Options["SkipLoadingEval"] && useNNUE != UseNNUEMode::False && eval_file_loaded.empty())
+  {
+    UCI::OptionsMap defaults;
+    UCI::init(defaults);
+    // NOTE(review): 'defaults' is never read in this function (there is no msg4 here) — confirm it can be removed
+    std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
+    std::string msg2 = "The option is set to true, but the network file was not loaded successfully.";
+    std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
+    std::string msg5 = "The engine will be terminated now.";
+
+    sync_cout << "info string ERROR: " << msg1 << sync_endl;
+    sync_cout << "info string ERROR: " << msg2 << sync_endl;
+    sync_cout << "info string ERROR: " << msg3 << sync_endl;
+    sync_cout << "info string ERROR: " << msg5 << sync_endl;
+
+    std::exit(EXIT_FAILURE);
+  }
+
+  if (useNNUE != UseNNUEMode::False)
+    sync_cout << "info string NNUE evaluation using " << eval_file_loaded << " enabled" << sync_endl;
+  else
+    sync_cout << "info string classical evaluation enabled" << sync_endl;
+}
+
 } // namespace Eval::NNUE
@@ -23,10 +23,19 @@

 #include "nnue_feature_transformer.h"

+#include "misc.h"
+
 #include <memory>

 namespace Eval::NNUE {

+  enum struct UseNNUEMode  // stored in useNNUE; derived from the "Use NNUE" UCI option
+  {
+    False,  // option text "false", or any unrecognised text: classical evaluation
+    True,   // option text "true"
+    Pure    // option text "pure" — NOTE(review): how this differs from True is not visible in this header
+  };
+
  // Hash value of evaluation function structure
  constexpr std::uint32_t kHashValue =
      FeatureTransformer::GetHashValue() ^ Network::GetHashValue();
@@ -40,11 +49,22 @@ namespace Eval::NNUE {
    }
  };

+  template <typename T>
+  struct LargePageDeleter {  // deleter type used by LargePagePtr
+    void operator()(T* ptr) const {
+      ptr->~T();                      // destroy in place: the storage was not obtained with new
+      aligned_large_pages_free(ptr);  // then release the block that aligned_large_pages_alloc() returned
+    }
+  };
+
  template <typename T>
  using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;

+  template <typename T>
+  using LargePagePtr = std::unique_ptr<T, LargePageDeleter<T>>;
+
  // Input feature converter
-  extern AlignedPtr<FeatureTransformer> feature_transformer;
+  extern LargePagePtr<FeatureTransformer> feature_transformer;

  // Evaluation function
  extern AlignedPtr<Network> network;
@@ -55,16 +75,22 @@ namespace Eval::NNUE {
  // Saved evaluation function file name
  extern std::string savedfileName;

+  extern UseNNUEMode useNNUE;
+
+  extern std::string eval_file_loaded;
+
  // Get a string that represents the structure of the evaluation function
-  std::string GetArchitectureString();
+  std::string get_architecture_string();
+
+  std::string get_layers_info();

  // read the header
-  bool ReadHeader(std::istream& stream,
-    std::uint32_t* hash_value, std::string* architecture);
+  bool read_header(std::istream& stream,
+      std::uint32_t* hash_value, std::string* architecture);

  // write the header
-  bool WriteHeader(std::ostream& stream,
-    std::uint32_t hash_value, const std::string& architecture);
+  bool write_header(std::ostream& stream,
+      std::uint32_t hash_value, const std::string& architecture);

  // read evaluation function parameters
  bool ReadParameters(std::istream& stream);
@@ -72,6 +98,13 @@ namespace Eval::NNUE {
  // write evaluation function parameters
  bool WriteParameters(std::ostream& stream);

+  Value evaluate(const Position& pos);
+  bool load_eval(std::string name, std::istream& stream);
+  void init();
+
+  void verify_eval_file_loaded();
+  void verify_any_net_loaded();
+
 }  // namespace Eval::NNUE

 #endif // #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
@@ -1,231 +1,342 @@
-// Code for learning NNUE evaluation function
-
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
-
-#include <random>
+#include <random>
 #include <fstream>
-
-#include "../learn/learn.h"
-#include "../learn/learning_tools.h"
-
-#include "../position.h"
-#include "../uci.h"
-#include "../misc.h"
-#include "../thread_win32_osx.h"
-
-#include "../eval/evaluate_common.h"
+#include <filesystem>

 #include "evaluate_nnue.h"
 #include "evaluate_nnue_learner.h"
-#include "trainer/features/factorizer_feature_set.h"
-#include "trainer/features/factorizer_half_kp.h"
+
+#include "trainer/features/all_factorizers.h"
+
 #include "trainer/trainer_feature_transformer.h"
 #include "trainer/trainer_input_slice.h"
 #include "trainer/trainer_affine_transform.h"
 #include "trainer/trainer_clipped_relu.h"
 #include "trainer/trainer_sum.h"

-namespace Eval {
+#include "position.h"
+#include "uci.h"
+#include "misc.h"
+#include "thread_win32_osx.h"
+#include "thread.h"

-namespace NNUE {
+// Code for learning NNUE evaluation function
+namespace Eval::NNUE {

-namespace {
+    namespace {

-// learning data
-std::vector<Example> examples;
+        // learning data
+        std::vector<Example> examples;

-// Mutex for exclusive control of examples
-std::mutex examples_mutex;
+        // Mutex for exclusive control of examples
+        std::mutex examples_mutex;

-// number of samples in mini-batch
-uint64_t batch_size;
+        // number of samples in mini-batch
+        uint64_t batch_size;

-// random number generator
-std::mt19937 rng;
+        // random number generator
+        std::mt19937 rng;

-// learner
-std::shared_ptr<Trainer<Network>> trainer;
+        // learner
+        std::shared_ptr<Trainer<Network>> trainer;

-// Learning rate scale
-double global_learning_rate_scale;
+        // Pass each message (e.g. a hyperparameter option) to the trainer; asserts it reached at least one receiver
+        void send_messages(std::vector<Message> messages) {
+            for (auto& message : messages) {
+                trainer->send_message(&message);
+                assert(message.num_receivers > 0);
+            }
+        }

-// Get the learning rate scale
-double GetGlobalLearningRateScale() {
-  return global_learning_rate_scale;
-}
+    }  // namespace

-// Tell the learner options such as hyperparameters
-void SendMessages(std::vector<Message> messages) {
-  for (auto& message : messages) {
-    trainer->SendMessage(&message);
-    assert(message.num_receivers > 0);
-  }
-}
+    // Initialize learning
+    void initialize_training(
+        const std::string& seed,
+        SynchronizedRegionLogger::Region& out) {

-}  // namespace
+#if defined (OPENBLAS_VERSION)
+        openblas_set_num_threads(1);
+#elif defined (INTEL_MKL_VERSION)
+        mkl_set_num_threads(1);
+#endif

-// Initialize learning
-void InitializeTraining(double eta1, uint64_t eta1_epoch,
-                        double eta2, uint64_t eta2_epoch, double eta3) {
-  std::cout << "Initializing NN training for "
-            << GetArchitectureString() << std::endl;
+        out << "INFO (initialize_training): Initializing NN training for "
+            << get_architecture_string() << std::endl;

-  assert(feature_transformer);
-  assert(network);
-  trainer = Trainer<Network>::Create(network.get(), feature_transformer.get());
+        out << std::endl;

-  if (Options["SkipLoadingEval"]) {
-    trainer->Initialize(rng);
-  }
+        out << "Layers:\n"
+            << get_layers_info() << std::endl;

-  global_learning_rate_scale = 1.0;
-  EvalLearningTools::Weight::init_eta(eta1, eta2, eta3, eta1_epoch, eta2_epoch);
-}
+        out << std::endl;

-// set the number of samples in the mini-batch
-void SetBatchSize(uint64_t size) {
-  assert(size > 0);
-  batch_size = size;
-}
+        out << "Factorizers:\n"
+            << Features::Factorizer<RawFeatures>::get_factorizers_string() << std::endl;

-// set the learning rate scale
-void SetGlobalLearningRateScale(double scale) {
-  global_learning_rate_scale = scale;
-}
+        out << std::endl;

-// Set options such as hyperparameters
-void SetOptions(const std::string& options) {
-  std::vector<Message> messages;
-  for (const auto& option : Split(options, ',')) {
-    const auto fields = Split(option, '=');
-    assert(fields.size() == 1 || fields.size() == 2);
-    if (fields.size() == 1) {
-      messages.emplace_back(fields[0]);
-    } else {
-      messages.emplace_back(fields[0], fields[1]);
-    }
-  }
-  SendMessages(std::move(messages));
-}
+        assert(feature_transformer);
+        assert(network);

-// Reread the evaluation function parameters for learning from the file
-void RestoreParameters(const std::string& dir_name) {
-  const std::string file_name = Path::Combine(dir_name, NNUE::savedfileName);
-  std::ifstream stream(file_name, std::ios::binary);
-  bool result = ReadParameters(stream);
-  assert(result);
+        trainer = Trainer<Network>::create(network.get(), feature_transformer.get());
+        rng.seed(PRNG(seed).rand<uint64_t>());

-  SendMessages({{"reset"}});
-}
-
-// Add 1 sample of learning data
-void AddExample(Position& pos, Color rootColor,
-                const Learner::PackedSfenValue& psv, double weight) {
-  Example example;
-  if (rootColor == pos.side_to_move()) {
-    example.sign = 1;
-  } else {
-    example.sign = -1;
-  }
-  example.psv = psv;
-  example.weight = weight;
-
-  Features::IndexList active_indices[2];
-  for (const auto trigger : kRefreshTriggers) {
-    RawFeatures::AppendActiveIndices(pos, trigger, active_indices);
-  }
-  if (pos.side_to_move() != WHITE) {
-    active_indices[0].swap(active_indices[1]);
-  }
-  for (const auto color : Colors) {
-    std::vector<TrainingFeature> training_features;
-    for (const auto base_index : active_indices[color]) {
-      static_assert(Features::Factorizer<RawFeatures>::GetDimensions() <
-                    (1 << TrainingFeature::kIndexBits), "");
-      Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
-          base_index, &training_features);
-    }
-    std::sort(training_features.begin(), training_features.end());
-
-    auto& unique_features = example.training_features[color];
-    for (const auto& feature : training_features) {
-      if (!unique_features.empty() &&
-          feature.GetIndex() == unique_features.back().GetIndex()) {
-        unique_features.back() += feature;
-      } else {
-        unique_features.push_back(feature);
-      }
-    }
-  }
-
-  std::lock_guard<std::mutex> lock(examples_mutex);
-  examples.push_back(std::move(example));
-}
-
-// update the evaluation function parameters
-void UpdateParameters(uint64_t epoch) {
-  assert(batch_size > 0);
-
-  EvalLearningTools::Weight::calc_eta(epoch);
-  const auto learning_rate = static_cast<LearnFloatType>(
-      get_eta() / batch_size);
-
-  std::lock_guard<std::mutex> lock(examples_mutex);
-  std::shuffle(examples.begin(), examples.end(), rng);
-  while (examples.size() >= batch_size) {
-    std::vector<Example> batch(examples.end() - batch_size, examples.end());
-    examples.resize(examples.size() - batch_size);
-
-    const auto network_output = trainer->Propagate(batch);
-
-    std::vector<LearnFloatType> gradients(batch.size());
-    for (std::size_t b = 0; b < batch.size(); ++b) {
-      const auto shallow = static_cast<Value>(Round<std::int32_t>(
-          batch[b].sign * network_output[b] * kPonanzaConstant));
-      const auto& psv = batch[b].psv;
-      const double gradient = batch[b].sign * Learner::calc_grad(shallow, psv);
-      gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
+        if (Options["SkipLoadingEval"]) {
+            out << "INFO (initialize_training): Performing random net initialization.\n";
+            trainer->initialize(rng);
+        }
    }

-    trainer->Backpropagate(gradients.data(), learning_rate);
-  }
-  SendMessages({{"quantize_parameters"}});
-}
+    // Set the mini-batch size (in samples) consumed by update_parameters(); must be non-zero
+    void set_batch_size(uint64_t size) {
+        assert(size > 0);
+        batch_size = size;
+    }

-// Check if there are any problems with learning
-void CheckHealth() {
-  SendMessages({{"check_health"}});
-}
+    // Set options such as hyperparameters
+    void set_options(const std::string& options) {
+        std::vector<Message> messages;
+        for (const auto& option : Algo::split(options, ',')) {
+          const auto fields = Algo::split(option, '=');
+          assert(fields.size() == 1 || fields.size() == 2);

-}  // namespace NNUE
+          if (fields.size() == 1) {
+              messages.emplace_back(fields[0]);
+          } else {
+              messages.emplace_back(fields[0], fields[1]);
+          }
+        }

-// save merit function parameters to a file
-void save_eval(std::string dir_name) {
-  auto eval_dir = Path::Combine(Options["EvalSaveDir"], dir_name);
-  std::cout << "save_eval() start. folder = " << eval_dir << std::endl;
+        send_messages(std::move(messages));
+    }

-  // mkdir() will fail if this folder already exists, but
-  // Apart from that. If not, I just want you to make it.
-  // Also, assume that the folders up to EvalSaveDir have been dug.
-  Dependency::mkdir(eval_dir);
+    // Reread the evaluation function parameters for learning from the file
+    void restore_parameters(const std::string& dir_name) {
+        const std::string file_name = Path::combine(dir_name, NNUE::savedfileName);
+        std::ifstream stream(file_name, std::ios::binary);
+#ifndef NDEBUG
+        bool result =
+#endif
+        ReadParameters(stream);
+#ifndef NDEBUG
+        assert(result);
+#endif

-  if (Options["SkipLoadingEval"] && NNUE::trainer) {
-    NNUE::SendMessages({{"clear_unobserved_feature_weights"}});
-  }
+        send_messages({{"reset"}});
+    }

-  const std::string file_name = Path::Combine(eval_dir, NNUE::savedfileName);
-  std::ofstream stream(file_name, std::ios::binary);
-  const bool result = NNUE::WriteParameters(stream);
-  assert(result);
+    void finalize_net() {  // sends the "clear_unobserved_feature_weights" message to the trainer
+        send_messages({{"clear_unobserved_feature_weights"}});
+    }

-  std::cout << "save_eval() finished. folder = " << eval_dir << std::endl;
-}
+    // Add 1 sample of learning data
+    void add_example(
+        Position& pos,
+        Color rootColor,
+        Value discrete_nn_eval,
+        const Learner::PackedSfenValue& psv,
+        double weight) {

-// get the current eta
-double get_eta() {
-  return NNUE::GetGlobalLearningRateScale() * EvalLearningTools::Weight::eta;
-}
+        Example example;
+        if (rootColor == pos.side_to_move()) {
+            example.sign = 1;
+        } else {
+            example.sign = -1;
+        }

-}  // namespace Eval
+        example.discrete_nn_eval = discrete_nn_eval;
+        example.psv = psv;
+        example.weight = weight;

-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+        Features::IndexList active_indices[2];
+        for (const auto trigger : kRefreshTriggers) {
+            RawFeatures::append_active_indices(pos, trigger, active_indices);
+        }
+
+        if (pos.side_to_move() != WHITE) {
+            active_indices[0].swap(active_indices[1]);
+        }
+
+        static thread_local std::vector<TrainingFeature> s_training_features;
+        auto& training_features = s_training_features;
+
+        for (const auto color : Colors) {
+            training_features.clear();
+
+            for (const auto base_index : active_indices[color]) {
+                static_assert(Features::Factorizer<RawFeatures>::get_dimensions() <
+                              (1 << TrainingFeature::kIndexBits), "");
+                Features::Factorizer<RawFeatures>::append_training_features(
+                    base_index, &training_features);
+            }
+
+            std::sort(training_features.begin(), training_features.end());
+
+            auto& unique_features = example.training_features[color];
+            unique_features.reserve(training_features.size());
+            for (const auto& feature : training_features) {
+                if (!unique_features.empty() &&
+                    feature.get_index() == unique_features.back().get_index()) {
+
+                    unique_features.back() += feature;
+                } else {
+                    unique_features.push_back(feature);
+                }
+            }
+        }
+
+        std::lock_guard<std::mutex> lock(examples_mutex);
+        examples.push_back(std::move(example));
+    }
+
+    // Update the evaluation function parameters from the queued training examples
+    Learner::Loss update_parameters(
+        ThreadPool& thread_pool,
+        uint64_t epoch,
+        bool verbose,
+        double learning_rate,
+        double max_grad,
+        Learner::CalcLossFunc calc_loss)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        assert(batch_size > 0);
+
+        learning_rate /= batch_size;  // the step size is scaled down by the batch size
+
+        std::lock_guard<std::mutex> lock(examples_mutex);
+
+        double abs_eval_diff_sum = 0.0;
+        double abs_discrete_eval_sum = 0.0;
+        double gradient_norm = 0.0;
+
+        bool collect_stats = verbose;
+
+        Learner::Loss loss_sum{};
+
+        std::vector<double> abs_eval_diff_sum_local(thread_pool.size(), 0.0);  // one accumulator slot per thread
+        std::vector<double> abs_discrete_eval_sum_local(thread_pool.size(), 0.0);
+        std::vector<double> gradient_norm_local(thread_pool.size(), 0.0);
+        std::vector<Learner::Loss> loss_sum_local(thread_pool.size());
+
+        auto prev_batch_begin = examples.end();  // batches are consumed from the back of examples
+        while ((long)(prev_batch_begin - examples.begin()) >= (long)batch_size) {
+            auto batch_begin = prev_batch_begin - batch_size;
+            auto batch_end = prev_batch_begin;
+            auto size = batch_end - batch_begin;
+            const auto network_output = trainer->step_start(thread_pool, batch_begin, batch_end);
+            std::vector<LearnFloatType> gradients(size);
+
+            thread_pool.for_each_index_chunk_with_workers(
+                std::size_t(0), size,
+                [&](Thread& th, std::size_t offset, std::size_t count) {
+                    const auto thread_id = th.thread_idx();
+
+                    trainer->propagate(th, offset, count);
+
+                    for (std::size_t b = offset; b < offset + count; ++b) {
+                        const auto& e = *(batch_begin + b);
+                        const auto shallow = static_cast<Value>(round<std::int32_t>(
+                            e.sign * network_output[b] * kPonanzaConstant));
+                        const auto discrete = e.sign * e.discrete_nn_eval;
+                        const auto& psv = e.psv;
+                        auto loss = calc_loss(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
+                        loss.grad = std::clamp(
+                            loss.grad * e.sign * kPonanzaConstant * e.weight, -max_grad, max_grad);
+                        gradients[b] = static_cast<LearnFloatType>(loss.grad);
+                        loss_sum_local[thread_id] += loss;
+
+                        // The discrete eval is only valid before the first backpropagation,
+                        // that is, only for the first batch.
+                        // Similarly, we only want the gradient statistics of one batch.
+                        if (collect_stats)
+                        {
+                            abs_eval_diff_sum_local[thread_id] += std::abs(discrete - shallow);
+                            abs_discrete_eval_sum_local[thread_id] += std::abs(discrete);
+                            gradient_norm_local[thread_id] += std::abs(loss.grad);
+                        }
+                    }
+
+                    trainer->backpropagate(th, gradients.data(), offset, count);
+                }
+            );
+
+            // We can asynchronously erase the examples that were used in the previous
+            // step: they are no longer read, and erasing the tail presumably leaves
+            // batch_begin valid (assumes examples is a std::vector - TODO confirm).
+            examples.erase(prev_batch_begin, examples.end());
+            prev_batch_begin = batch_begin;
+
+            thread_pool.wait_for_workers_finished();
+
+            trainer->step_end(thread_pool, learning_rate);
+
+            collect_stats = false;  // statistics are gathered for the first processed batch only
+        }
+        examples.erase(prev_batch_begin, examples.end());  // drop the last batch; fewer than batch_size leftovers stay queued
+
+        if (verbose)
+        {
+            abs_eval_diff_sum = std::accumulate(abs_eval_diff_sum_local.begin(), abs_eval_diff_sum_local.end(), 0.0);
+            abs_discrete_eval_sum = std::accumulate(abs_discrete_eval_sum_local.begin(), abs_discrete_eval_sum_local.end(), 0.0);
+            gradient_norm = std::accumulate(gradient_norm_local.begin(), gradient_norm_local.end(), 0.0);
+
+            const double avg_abs_eval_diff = abs_eval_diff_sum / batch_size;
+            const double avg_abs_discrete_eval = abs_discrete_eval_sum / batch_size;
+
+            auto out = sync_region_cout.new_region();
+
+            out << "INFO (update_parameters):"
+                << " epoch = " << epoch
+                << " , avg_abs(trainer_eval-nnue_eval) = " << avg_abs_eval_diff
+                << " , avg_abs(nnue_eval) = " << avg_abs_discrete_eval
+                << " , avg_relative_error = " << avg_abs_eval_diff / avg_abs_discrete_eval
+                << " , batch_size = " << batch_size
+                << " , grad_norm = " << gradient_norm
+                << std::endl;
+        } else {
+            // Display some progress, but don't synchronize the output because
+            // we can't really decide when to release the output lock here.
+            std::cout << '.';
+        }
+
+        send_messages({{"quantize_parameters"}});
+
+        for(auto& loss : loss_sum_local)
+        {
+            loss_sum += loss;
+        }
+
+        return loss_sum;
+    }
+
+    // Check if there are any problems with learning (sends the "check_health" message)
+    void check_health() {
+        send_messages({{"check_health"}});
+    }
+
+    // Save the evaluation function parameters to a file in EvalSaveDir/<dir_name>
+    void save_eval(std::string dir_name) {
+        auto eval_dir = Path::combine(Options["EvalSaveDir"], dir_name);
+
+        auto out = sync_region_cout.new_region();
+
+        out << "INFO (save_eval): Saving current evaluation file in " << eval_dir << std::endl;
+
+        // create_directories() also creates any missing parent directories and
+        // does not treat an already existing directory as an error, so the
+        // save directory does not have to exist beforehand.
+        std::filesystem::create_directories(eval_dir);
+
+        const std::string file_name = Path::combine(eval_dir, NNUE::savedfileName);
+        std::ofstream stream(file_name, std::ios::binary);
+#ifndef NDEBUG
+        bool result =
+#endif
+        WriteParameters(stream);  // the result is only checked (via assert) when NDEBUG is not defined
+#ifndef NDEBUG
+        assert(result);
+#endif
+        out << "INFO (save_eval): Finished saving evaluation file in " << eval_dir << std::endl;
+    }
+}  // namespace Eval::NNUE
@@ -1,46 +1,52 @@
-// Interface used for learning NNUE evaluation function
-
-#ifndef _EVALUATE_NNUE_LEARNER_H_
+#ifndef _EVALUATE_NNUE_LEARNER_H_
 #define _EVALUATE_NNUE_LEARNER_H_

-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#include "learn/learn.h"

-#include "../learn/learn.h"
+#include "misc.h"

-namespace Eval {
+struct ThreadPool;

-namespace NNUE {
+// Interface used for learning NNUE evaluation function
+namespace Eval::NNUE {

-// Initialize learning
-void InitializeTraining(double eta1, uint64_t eta1_epoch,
-                        double eta2, uint64_t eta2_epoch, double eta3);
+    // Initialize learning
+    void initialize_training(
+        const std::string& seed,
+        SynchronizedRegionLogger::Region& out);

-// set the number of samples in the mini-batch
-void SetBatchSize(uint64_t size);
+    // Set the number of samples in the mini-batch
+    void set_batch_size(uint64_t size);

-// set the learning rate scale
-void SetGlobalLearningRateScale(double scale);
+    // Set options such as hyperparameters
+    void set_options(const std::string& options);

-// Set options such as hyperparameters
-void SetOptions(const std::string& options);
+    // Reread the evaluation function parameters for learning from the file
+    void restore_parameters(const std::string& dir_name);

-// Reread the evaluation function parameters for learning from the file
-void RestoreParameters(const std::string& dir_name);
+    // Add 1 sample of learning data
+    void add_example(
+        Position& pos,
+        Color rootColor,
+        Value discrete_nn_eval,
+        const Learner::PackedSfenValue& psv,
+        double weight);

-// Add 1 sample of learning data
-void AddExample(Position& pos, Color rootColor,
-                const Learner::PackedSfenValue& psv, double weight);
+    // Update the evaluation function parameters; returns the accumulated loss
+    Learner::Loss update_parameters(
+        ThreadPool& thread_pool,
+        uint64_t epoch,
+        bool verbose,
+        double learning_rate,
+        double max_grad,
+        Learner::CalcLossFunc calc_loss);

-// update the evaluation function parameters
-void UpdateParameters(uint64_t epoch);
+    // Check if there are any problems with learning
+    void check_health();

-// Check if there are any problems with learning
-void CheckHealth();
+    void finalize_net();

-}  // namespace NNUE
-
-}  // namespace Eval
-
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+    void save_eval(std::string dir_name);  // saves the parameters in EvalSaveDir/<dir_name>
+}  // namespace Eval::NNUE

 #endif
@@ -0,0 +1,54 @@
+#include "a.h"
+#include "index_list.h"
+
+// Definition of input feature A of NNUE evaluation function
+namespace Eval::NNUE::Features {
+
+    // Orient a square according to perspective (rotate the board 180° for black).
+    // Important note for "halfka": that arch was designed with "flip" in mind,
+    // although it is still untested which approach is better.
+    // The rotation has to stay until we find a better arch that works with "flip";
+    // it allows us to use the current master net for gensfen (primarily needed for higher quality data).
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * 63));  // for a non-zero perspective, index i in 0..63 becomes 63 - i
+    }
+
+    // Find the index of the feature quantity from the perspective, the square and the piece
+    inline IndexType A::make_index(
+        Color perspective, Square s, Piece pc) {
+        return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
+    }
+
+    // Get a list of indices with a value of 1 among the features: one index per square set in pos.pieces()
+    void A::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
+
+        Bitboard bb = pos.pieces();
+        while (bb) {
+            Square s = pop_lsb(&bb);  // presumably removes one set square from bb and returns it - verify
+            active->push_back(make_index(perspective, s, pos.piece_on(s)));
+        }
+    }
+
+    // Get a list of indices whose values have changed: per dirty piece, its `from` index is removed and its `to` index is added
+    void A::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
+
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];
+
+            if (dp.from[i] != SQ_NONE)  // SQ_NONE entries are skipped
+              removed->push_back(make_index(perspective, dp.from[i], pc));
+
+            if (dp.to[i] != SQ_NONE)
+              added->push_back(make_index(perspective, dp.to[i], pc));
+        }
+    }
+
+}  // namespace Eval::NNUE::Features
@@ -0,0 +1,54 @@
+#ifndef _NNUE_FEATURES_A_H_
+#define _NNUE_FEATURES_A_H_
+
+#include "features_common.h"
+
+#include "evaluate.h"
+
+// Definition of input feature A of NNUE evaluation function
+// A is a union of P features and K features, so technically the
+// same effect can be achieved by including both P and K features
+// but it would result in slower index appending because
+// P would conditionally exclude K features and vice versa,
+// where A doesn't have any conditionals.
+namespace Eval::NNUE::Features {
+
+    // Feature A: PieceSquare of all pieces (the union of the P and K features)
+    class A {
+    public:
+        // feature quantity name
+        static constexpr const char* kName = "A";
+
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue = 0x7A4C414Cu;
+
+        // number of feature dimensions
+        static constexpr IndexType kDimensions = PS_END2;
+
+        // The maximum number of indices whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions = 32;
+
+        // Timing of full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+
+        // Get a list of indices with a value of 1 among the features
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
+
+        // Get a list of indices whose values have changed from the previous one in the feature quantity
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+
+    private:
+        // Index of a feature for a given piece on some square
+        static IndexType make_index(Color perspective, Square s, Piece pc);
+    };
+
+}  // namespace Eval::NNUE::Features
+
+#endif // #ifndef _NNUE_FEATURES_A_H_
@@ -1,73 +1,65 @@
-//Definition of input feature quantity K of NNUE evaluation function
-
-#if defined(EVAL_NNUE)
-
 #include "castling_right.h"
 #include "index_list.h"

-namespace Eval {
+// Definition of the CastlingRight input feature of the NNUE evaluation function
+namespace Eval::NNUE::Features {

-  namespace NNUE {
+    // Get a list of indices with a value of 1 among the features: index i is active when bit i of the relative castling rights is set
+    void CastlingRight::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {

-    namespace Features {
-
-      // Get a list of indices with a value of 1 among the features
-      void CastlingRight::AppendActiveIndices(
-        const Position& pos, Color perspective, IndexList* active) {
         // do nothing if array size is small to avoid compiler warning
         if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;

         int castling_rights = pos.state()->castlingRights;
         int relative_castling_rights;
         if (perspective == WHITE) {
-          relative_castling_rights = castling_rights;
+            relative_castling_rights = castling_rights;
         }
         else {
-          // Invert the perspective.
-          relative_castling_rights = ((castling_rights & 3) << 2)
-            & ((castling_rights >> 2) & 3);
+            // Invert the perspective: swap bits 0-1 with bits 2-3 (must be OR-ed, the two halves share no bits).
+            relative_castling_rights = ((castling_rights & 3) << 2)
+                | ((castling_rights >> 2) & 3);
         }

-        for (int i = 0; i <kDimensions; ++i) {
-          if (relative_castling_rights & (i << 1)) {
-            active->push_back(i);
-          }
+        for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
+            if (relative_castling_rights & (1 << i)) {
+                active->push_back(i);
+            }
         }
-      }
+    }

-      // Get a list of indices whose values have changed from the previous one in the feature quantity
-      void CastlingRight::AppendChangedIndices(
-        const Position& pos, Color perspective,
-        IndexList* removed, IndexList* added) {
+    // Get a list of indices whose values have changed: only rights that were set before and are cleared now are reported (as removed)
+    void CastlingRight::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* /* added */) {

         int previous_castling_rights = pos.state()->previous->castlingRights;
         int current_castling_rights = pos.state()->castlingRights;
         int relative_previous_castling_rights;
         int relative_current_castling_rights;
         if (perspective == WHITE) {
-          relative_previous_castling_rights = previous_castling_rights;
-          relative_current_castling_rights = current_castling_rights;
+            relative_previous_castling_rights = previous_castling_rights;
+            relative_current_castling_rights = current_castling_rights;
         }
         else {
-          // Invert the perspective.
-          relative_previous_castling_rights = ((previous_castling_rights & 3) << 2)
-            & ((previous_castling_rights >> 2) & 3);
-          relative_current_castling_rights = ((current_castling_rights & 3) << 2)
-            & ((current_castling_rights >> 2) & 3);
+            // Invert the perspective: swap bits 0-1 with bits 2-3 (must be OR-ed, the two halves share no bits).
+            relative_previous_castling_rights = ((previous_castling_rights & 3) << 2)
+                | ((previous_castling_rights >> 2) & 3);
+            relative_current_castling_rights = ((current_castling_rights & 3) << 2)
+                | ((current_castling_rights >> 2) & 3);
         }

-        for (int i = 0; i < kDimensions; ++i) {
-          if ((relative_previous_castling_rights & (i << 1)) &&
-            (relative_current_castling_rights & (i << 1)) == 0) {
-            removed->push_back(i);
-          }
+        for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
+            if ((relative_previous_castling_rights & (1 << i)) &&
+                (relative_current_castling_rights & (1 << i)) == 0) {
+                removed->push_back(i);
+            }
         }
-      }
+    }

-    }  // namespace Features
-
-  }  // namespace NNUE
-
-}  // namespace Eval
-
-#endif  // defined(EVAL_NNUE)
+}  // namespace Eval::NNUE::Features
@@ -1,48 +1,44 @@
-//Definition of input feature quantity K of NNUE evaluation function
-
 #ifndef _NNUE_FEATURES_CASTLING_RIGHT_H_
 #define _NNUE_FEATURES_CASTLING_RIGHT_H_

-#if defined(EVAL_NNUE)
-
-#include "../../evaluate.h"
 #include "features_common.h"

-namespace Eval {
+#include "evaluate.h"

-  namespace NNUE {
+// Definition of the CastlingRight input feature of the NNUE evaluation function
+namespace Eval::NNUE::Features {

-    namespace Features {
-
-      // Feature K: Ball position
-      class CastlingRight {
-      public:
+    class CastlingRight {
+    public:
        // feature quantity name
        static constexpr const char* kName = "CastlingRight";
+
        // Hash value embedded in the evaluation function file
        static constexpr std::uint32_t kHashValue = 0x913968AAu;
+
        // number of feature dimensions
        static constexpr IndexType kDimensions = 4;
+
        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
        static constexpr IndexType kMaxActiveDimensions = 4;
+
        // Timing of full calculation instead of difference calculation
        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;

        // Get a list of indices with a value of 1 among the features
-        static void AppendActiveIndices(const Position& pos, Color perspective,
-          IndexList* active);
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);

-        // Get a list of indices whose values ??have changed from the previous one in the feature quantity
-        static void AppendChangedIndices(const Position& pos, Color perspective,
-          IndexList* removed, IndexList* added);
-      };
+        // Get a list of indices whose values have changed from the previous one in the feature quantity
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+    };

-    }  // namespace Features
-
-  }  // namespace NNUE
-
-}  // namespace Eval
-
-#endif  // defined(EVAL_NNUE)
+}  // namespace Eval::NNUE::Features

 #endif
@@ -1,47 +1,49 @@
-//Definition of input feature quantity K of NNUE evaluation function
-
-#if defined(EVAL_NNUE)
-
 #include "enpassant.h"
 #include "index_list.h"

-namespace Eval {
+// Definition of the EnPassant input feature of the NNUE evaluation function
+namespace Eval::NNUE::Features {

-  namespace NNUE {
+    // Get a list of indices with a value of 1 among the features: the file of the en-passant square, if there is one
+    void EnPassant::append_active_indices(
+        const Position& pos,
+        Color /* perspective */,
+        IndexList* active) {

-    namespace Features {
-
-      // Get a list of indices with a value of 1 among the features
-      void EnPassant::AppendActiveIndices(
-        const Position& pos, Color perspective, IndexList* active) {
         // do nothing if array size is small to avoid compiler warning
-        if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
+        if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions)
+            return;

         auto epSquare = pos.state()->epSquare;
-        if (epSquare == SQ_NONE) {
-          return;
-        }
-
-        if (perspective == BLACK) {
-          epSquare = rotate180(epSquare);
-        }
+        if (epSquare == SQ_NONE)  // no en-passant square: no active index
+            return;

         auto file = file_of(epSquare);
         active->push_back(file);
-      }
+    }

-      // Get a list of indices whose values ??have changed from the previous one in the feature quantity
-      void EnPassant::AppendChangedIndices(
-        const Position& pos, Color perspective,
-        IndexList* removed, IndexList* added) {
-        // Not implemented.
-        assert(false);
-      }
+    // Get a list of indices whose values have changed: the previous en-passant file is removed and the current one is added
+    void EnPassant::append_changed_indices(
+        const Position& pos,
+        Color /* perspective */,
+        IndexList* removed,
+        IndexList* added) {

-    }  // namespace Features
+        auto previous_epSquare = pos.state()->previous->epSquare;
+        auto epSquare = pos.state()->epSquare;

-  }  // namespace NNUE
+        if (previous_epSquare != SQ_NONE) {
+            if (epSquare != SQ_NONE && file_of(epSquare) == file_of(previous_epSquare))
+                return;  // same file before and after: nothing changed

-}  // namespace Eval
+            auto file = file_of(previous_epSquare);
+            removed->push_back(file);
+        }

-#endif  // defined(EVAL_NNUE)
+        if (epSquare != SQ_NONE) {
+            auto file = file_of(epSquare);
+            added->push_back(file);
+        }
+    }
+
+}  // namespace Eval::NNUE::Features
@@ -1,22 +1,15 @@
-//Definition of input feature quantity K of NNUE evaluation function
-
 #ifndef _NNUE_FEATURES_ENPASSANT_H_
 #define _NNUE_FEATURES_ENPASSANT_H_

-#if defined(EVAL_NNUE)
-
-#include "../../evaluate.h"
 #include "features_common.h"

-namespace Eval {
+#include "evaluate.h"

-  namespace NNUE {
+// Definition of the EnPassant input feature of the NNUE evaluation function
+namespace Eval::NNUE::Features {

-    namespace Features {
-
-      // Feature K: Ball position
-      class EnPassant {
-      public:
+    class EnPassant {
+    public:
        // feature quantity name
        static constexpr const char* kName = "EnPassant";
        // Hash value embedded in the evaluation function file
@@ -26,23 +19,22 @@ namespace Eval {
        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
        static constexpr IndexType kMaxActiveDimensions = 1;
        // Timing of full calculation instead of difference calculation
-        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kAnyPieceMoved;
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;

        // Get a list of indices with a value of 1 among the features
-        static void AppendActiveIndices(const Position& pos, Color perspective,
-          IndexList* active);
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);

-        // Get a list of indices whose values ??have changed from the previous one in the feature quantity
-        static void AppendChangedIndices(const Position& pos, Color perspective,
-          IndexList* removed, IndexList* added);
-      };
+        // Get a list of indices whose values have changed from the previous one in the feature quantity
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+    };

-    }  // namespace Features
-
-  }  // namespace NNUE
-
-}  // namespace Eval
-
-#endif  // defined(EVAL_NNUE)
+}  // namespace Eval::NNUE::Features

 #endif
@@ -26,222 +26,276 @@

 namespace Eval::NNUE::Features {

-  // Class template that represents a list of values
-  template <typename T, T... Values>
-  struct CompileTimeList;
+    // Class template that represents a list of values
+    template <typename T, T... Values>
+    struct CompileTimeList;

-  template <typename T, T First, T... Remaining>
-  struct CompileTimeList<T, First, Remaining...> {
-    static constexpr bool Contains(T value) {
-      return value == First || CompileTimeList<T, Remaining...>::Contains(value);
-    }
-    static constexpr std::array<T, sizeof...(Remaining) + 1>
-        kValues = {{First, Remaining...}};
-  };
-
-  template <typename T, T First, T... Remaining>
-  constexpr std::array<T, sizeof...(Remaining) + 1>
-    CompileTimeList<T, First, Remaining...>::kValues;
-  template <typename T>
-  struct CompileTimeList<T> {
-    static constexpr bool Contains(T /*value*/) {
-      return false;
-    }
-    static constexpr std::array<T, 0> kValues = { {} };
-  };
-
-  // Class template that adds to the beginning of the list
-  template <typename T, typename ListType, T Value>
-  struct AppendToList;
-  template <typename T, T... Values, T AnotherValue>
-  struct AppendToList<T, CompileTimeList<T, Values...>, AnotherValue> {
-    using Result = CompileTimeList<T, AnotherValue, Values...>;
-  };
-
-  // Class template for adding to a sorted, unique list
-  template <typename T, typename ListType, T Value>
-  struct InsertToSet;
-  template <typename T, T First, T... Remaining, T AnotherValue>
-  struct InsertToSet<T, CompileTimeList<T, First, Remaining...>, AnotherValue> {
-    using Result = std::conditional_t<
-      CompileTimeList<T, First, Remaining...>::Contains(AnotherValue),
-      CompileTimeList<T, First, Remaining...>,
-      std::conditional_t<(AnotherValue < First),
-      CompileTimeList<T, AnotherValue, First, Remaining...>,
-      typename AppendToList<T, typename InsertToSet<
-      T, CompileTimeList<T, Remaining...>, AnotherValue>::Result,
-      First>::Result>>;
-  };
-  template <typename T, T Value>
-  struct InsertToSet<T, CompileTimeList<T>, Value> {
-    using Result = CompileTimeList<T, Value>;
-  };
-
-  // Base class of feature set
-  template <typename Derived>
-  class FeatureSetBase {
-
-   public:
-    // Get a list of indices for active features
-    template <typename IndexListType>
-    static void AppendActiveIndices(
-        const Position& pos, TriggerEvent trigger, IndexListType active[2]) {
-
-      for (Color perspective : { WHITE, BLACK }) {
-        Derived::CollectActiveIndices(
-            pos, trigger, perspective, &active[perspective]);
-      }
-    }
-
-    // Get a list of indices for recently changed features
-    template <typename PositionType, typename IndexListType>
-    static void AppendChangedIndices(
-        const PositionType& pos, TriggerEvent trigger,
-        IndexListType removed[2], IndexListType added[2], bool reset[2]) {
-
-      const auto& dp = pos.state()->dirtyPiece;
-      if (dp.dirty_num == 0) return;
-
-      for (Color perspective : { WHITE, BLACK }) {
-        reset[perspective] = false;
-        switch (trigger) {
-          case TriggerEvent::kFriendKingMoved:
-            reset[perspective] = dp.piece[0] == make_piece(perspective, KING);
-            break;
-          default:
-            assert(false);
-            break;
+    template <typename T, T First, T... Remaining>
+    struct CompileTimeList<T, First, Remaining...> {
+        static constexpr bool contains(T value) {
+            return value == First || CompileTimeList<T, Remaining...>::contains(value);
        }
-        if (reset[perspective]) {
-          Derived::CollectActiveIndices(
-              pos, trigger, perspective, &added[perspective]);
-        } else {
-          Derived::CollectChangedIndices(
-              pos, trigger, perspective,
-              &removed[perspective], &added[perspective]);
+
+        static constexpr std::array<T, sizeof...(Remaining) + 1>
+            kValues = {{First, Remaining...}};
+    };
+
+    template <typename T, T First, T... Remaining>
+    constexpr std::array<T, sizeof...(Remaining) + 1>
+        CompileTimeList<T, First, Remaining...>::kValues;
+
+    template <typename T>
+    struct CompileTimeList<T> {
+        static constexpr bool contains(T /*value*/) {
+            return false;
        }
-      }
-    }
-  };
+        static constexpr std::array<T, 0> kValues = { {} };
+    };

-  // Class template that represents the feature set
-  // do internal processing in reverse order of template arguments in order to linearize the amount of calculation at runtime
-  template <typename FirstFeatureType, typename... RemainingFeatureTypes>
-  class FeatureSet<FirstFeatureType, RemainingFeatureTypes...> :
-    public FeatureSetBase<
-    FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
-  private:
-    using Head = FirstFeatureType;
-    using Tail = FeatureSet<RemainingFeatureTypes...>;
+    // Class template that adds to the beginning of the list
+    template <typename T, typename ListType, T Value>
+    struct AppendToList;

-  public:
-    // Hash value embedded in the evaluation function file
-    static constexpr std::uint32_t kHashValue =
-      Head::kHashValue ^ (Tail::kHashValue << 1) ^ (Tail::kHashValue >> 31);
-    // number of feature dimensions
-    static constexpr IndexType kDimensions =
-      Head::kDimensions + Tail::kDimensions;
-    // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-    static constexpr IndexType kMaxActiveDimensions =
-      Head::kMaxActiveDimensions + Tail::kMaxActiveDimensions;
-    // List of timings to perform all calculations instead of difference calculation
-    using SortedTriggerSet = typename InsertToSet<TriggerEvent,
-      typename Tail::SortedTriggerSet, Head::kRefreshTrigger>::Result;
-    static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+    template <typename T, T... Values, T AnotherValue>
+    struct AppendToList<T, CompileTimeList<T, Values...>, AnotherValue> {
+        using Result = CompileTimeList<T, AnotherValue, Values...>;
+    };

-    // Get the feature quantity name
-    static std::string GetName() {
-      return std::string(Head::kName) + "+" + Tail::GetName();
-    }
+    // Class template for adding to a sorted, unique list
+    template <typename T, typename ListType, T Value>
+    struct InsertToSet;

-  private:
-    // Get a list of indices with a value of 1 among the features
-    template <typename IndexListType>
-    static void CollectActiveIndices(
-      const Position& pos, const TriggerEvent trigger, const Color perspective,
-      IndexListType* const active) {
-      Tail::CollectActiveIndices(pos, trigger, perspective, active);
-      if (Head::kRefreshTrigger == trigger) {
-        const auto start = active->size();
-        Head::AppendActiveIndices(pos, perspective, active);
-        for (auto i = start; i < active->size(); ++i) {
-          (*active)[i] += Tail::kDimensions;
+    template <typename T, T First, T... Remaining, T AnotherValue>
+    struct InsertToSet<T, CompileTimeList<T, First, Remaining...>, AnotherValue> {
+        using Result =
+            std::conditional_t<
+                CompileTimeList<T, First, Remaining...>::contains(AnotherValue),
+                CompileTimeList<T, First, Remaining...>,
+                std::conditional_t<
+                    (AnotherValue < First),
+                    CompileTimeList<T, AnotherValue, First, Remaining...>,
+                    typename AppendToList<T, typename InsertToSet<
+                        T, CompileTimeList<T, Remaining...>, AnotherValue>::Result,
+                        First
+                    >::Result
+                >
+            >;
+    };
+
+    template <typename T, T Value>
+    struct InsertToSet<T, CompileTimeList<T>, Value> {
+        using Result = CompileTimeList<T, Value>;
+    };
+
+    // Base class of feature set; forwards to Derived::collect_active_indices / collect_changed_indices per perspective
+    template <typename Derived>
+    class FeatureSetBase {
+
+       public:
+        // Get a list of indices for active features
+        template <typename IndexListType>
+        static void append_active_indices(
+            const Position& pos, TriggerEvent trigger, IndexListType active[2]) {
+
+            for (Color perspective : { WHITE, BLACK }) {
+                Derived::collect_active_indices(
+                    pos, trigger, perspective, &active[perspective]);
+            }
         }
-      }
-    }

-    // Get a list of indices whose values have changed from the previous one in the feature quantity
-    template <typename IndexListType>
-    static void CollectChangedIndices(
-      const Position& pos, const TriggerEvent trigger, const Color perspective,
-      IndexListType* const removed, IndexListType* const added) {
-      Tail::CollectChangedIndices(pos, trigger, perspective, removed, added);
-      if (Head::kRefreshTrigger == trigger) {
-        const auto start_removed = removed->size();
-        const auto start_added = added->size();
-        Head::AppendChangedIndices(pos, perspective, removed, added);
-        for (auto i = start_removed; i < removed->size(); ++i) {
-          (*removed)[i] += Tail::kDimensions;
+        // Get a list of indices for recently changed features
+        template <typename PositionType, typename IndexListType>
+        static void append_changed_indices(
+            const PositionType& pos,
+            TriggerEvent trigger,
+            IndexListType removed[2],
+            IndexListType added[2],
+            bool reset[2]) {
+
+            const auto& dp = pos.state()->dirtyPiece;
+
+            for (Color perspective : { WHITE, BLACK }) {
+                switch (trigger) {
+                    case TriggerEvent::kNone:  // NOTE(review): reset[perspective] is not assigned here - confirm callers initialize it
+                        break;
+                    case TriggerEvent::kFriendKingMoved:
+                        if (dp.dirty_num == 0) continue;  // no dirty piece: nothing is collected for this perspective
+                        reset[perspective] = dp.piece[0] == make_piece(perspective, KING);
+                        break;
+                    case TriggerEvent::kEnemyKingMoved:
+                        if (dp.dirty_num == 0) continue;
+                        reset[perspective] = dp.piece[0] == make_piece(~perspective, KING);
+                        break;
+                    case TriggerEvent::kAnyKingMoved:
+                        if (dp.dirty_num == 0) continue;
+                        reset[perspective] = type_of(dp.piece[0]) == KING;
+                        break;
+                    case TriggerEvent::kAnyPieceMoved:
+                        reset[perspective] = true;
+                        break;
+                    default:
+                        assert(false);
+                        break;
+                }
+
+                if (reset[perspective]) {  // full refresh: all active indices go to added[perspective]
+                    Derived::collect_active_indices(
+                        pos, trigger, perspective, &added[perspective]);
+                } else {
+                    Derived::collect_changed_indices(
+                        pos, trigger, perspective,
+                        &removed[perspective], &added[perspective]);
+                }
+            }
         }
-        for (auto i = start_added; i < added->size(); ++i) {
-          (*added)[i] += Tail::kDimensions;
+    };
+
+    // Class template that represents the feature set
+    // Internal processing runs in reverse order of the template arguments (Tail first, then Head) to keep the amount of calculation at runtime linear
+    template <typename FirstFeatureType, typename... RemainingFeatureTypes>
+    class FeatureSet<FirstFeatureType, RemainingFeatureTypes...> :
+      public FeatureSetBase<
+          FeatureSet<FirstFeatureType, RemainingFeatureTypes...>
+      > {
+
+    private:
+        using Head = FirstFeatureType;
+        using Tail = FeatureSet<RemainingFeatureTypes...>;
+
+    public:
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue =
+            Head::kHashValue ^ (Tail::kHashValue << 1) ^ (Tail::kHashValue >> 31);
+
+        // number of feature dimensions
+        static constexpr IndexType kDimensions =
+            Head::kDimensions + Tail::kDimensions;
+
+        // Maximum number of simultaneously active features
+        static constexpr IndexType kMaxActiveDimensions =
+            Head::kMaxActiveDimensions + Tail::kMaxActiveDimensions;
+
+        // Sorted set of triggers for full calculation instead of difference calculation
+        using SortedTriggerSet = typename InsertToSet<TriggerEvent,
+            typename Tail::SortedTriggerSet, Head::kRefreshTrigger>::Result;
+
+        static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+
+        // Get the feature quantity name
+        static std::string get_name() {
+            return std::string(Head::kName) + "+" + Tail::get_name();
        }
-      }
-    }

-    // Make the base class and the class template that recursively uses itself a friend
-    friend class FeatureSetBase<FeatureSet>;
-    template <typename... FeatureTypes>
-    friend class FeatureSet;
-  };
+    private:
+        // Get a list of indices for active features
+        template <typename IndexListType>
+        static void collect_active_indices(
+            const Position& pos,
+            const TriggerEvent trigger,
+            const Color perspective,
+            IndexListType* const active) {

-  // Class template that represents the feature set
-  template <typename FeatureType>
-  class FeatureSet<FeatureType> : public FeatureSetBase<FeatureSet<FeatureType>> {
+            Tail::collect_active_indices(pos, trigger, perspective, active);
+            if (Head::kRefreshTrigger == trigger) {
+                const auto start = active->size();
+                Head::append_active_indices(pos, perspective, active);

-   public:
-    // Hash value embedded in the evaluation file
-    static constexpr std::uint32_t kHashValue = FeatureType::kHashValue;
-    // Number of feature dimensions
-    static constexpr IndexType kDimensions = FeatureType::kDimensions;
-    // Maximum number of simultaneously active features
-    static constexpr IndexType kMaxActiveDimensions =
-        FeatureType::kMaxActiveDimensions;
-    // Trigger for full calculation instead of difference calculation
-    using SortedTriggerSet =
-        CompileTimeList<TriggerEvent, FeatureType::kRefreshTrigger>;
-    static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+                for (auto i = start; i < active->size(); ++i) {
+                    (*active)[i] += Tail::kDimensions;
+                }
+            }
+        }

-    // Get the feature quantity name
-    static std::string GetName() {
-      return FeatureType::kName;
-    }
+        // Get a list of indices whose values have changed from the previous one in the feature quantity
+        template <typename IndexListType>
+        static void collect_changed_indices(
+            const Position& pos,
+            const TriggerEvent trigger,
+            const Color perspective,
+            IndexListType* const removed,
+            IndexListType* const added) {

-   private:
-    // Get a list of indices for active features
-    static void CollectActiveIndices(
-        const Position& pos, const TriggerEvent trigger, const Color perspective,
-        IndexList* const active) {
-      if (FeatureType::kRefreshTrigger == trigger) {
-        FeatureType::AppendActiveIndices(pos, perspective, active);
-      }
-    }
+            Tail::collect_changed_indices(pos, trigger, perspective, removed, added);
+            if (Head::kRefreshTrigger == trigger) {
+                const auto start_removed = removed->size();
+                const auto start_added = added->size();
+                Head::append_changed_indices(pos, perspective, removed, added);

-    // Get a list of indices for recently changed features
-    static void CollectChangedIndices(
-        const Position& pos, const TriggerEvent trigger, const Color perspective,
-        IndexList* const removed, IndexList* const added) {
+                for (auto i = start_removed; i < removed->size(); ++i) {
+                    (*removed)[i] += Tail::kDimensions;
+                }

-      if (FeatureType::kRefreshTrigger == trigger) {
-        FeatureType::AppendChangedIndices(pos, perspective, removed, added);
-      }
-    }
+                for (auto i = start_added; i < added->size(); ++i) {
+                    (*added)[i] += Tail::kDimensions;
+                }
+            }
+        }

-    // Make the base class and the class template that recursively uses itself a friend
-    friend class FeatureSetBase<FeatureSet>;
-    template <typename... FeatureTypes>
-    friend class FeatureSet;
-  };
+        // Make the base class and the class template that recursively uses itself a friend
+        friend class FeatureSetBase<FeatureSet>;
+
+        template <typename... FeatureTypes>
+        friend class FeatureSet;
+    };
+
+    // Specialization of the feature set for a single feature type (ends the Head/Tail recursion)
+    template <typename FeatureType>
+    class FeatureSet<FeatureType> : public FeatureSetBase<FeatureSet<FeatureType>> {
+
+    public:
+        // Hash value embedded in the evaluation file (taken directly from the feature)
+        static constexpr std::uint32_t kHashValue = FeatureType::kHashValue;
+
+        // Number of feature dimensions
+        static constexpr IndexType kDimensions = FeatureType::kDimensions;
+
+        // Maximum number of simultaneously active features
+        static constexpr IndexType kMaxActiveDimensions =
+            FeatureType::kMaxActiveDimensions;
+
+        // Trigger for full calculation instead of difference calculation (one-element list)
+        using SortedTriggerSet =
+            CompileTimeList<TriggerEvent, FeatureType::kRefreshTrigger>;
+
+        static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+
+        // Get the feature name
+        static std::string get_name() {
+            return FeatureType::kName;
+        }
+
+    private:
+        // Get a list of indices for active features; does nothing for any other trigger
+        static void collect_active_indices(
+            const Position& pos,
+            const TriggerEvent trigger,
+            const Color perspective,
+            IndexList* const active) {
+
+            if (FeatureType::kRefreshTrigger == trigger) {  // only the feature's own trigger is handled
+              FeatureType::append_active_indices(pos, perspective, active);
+            }
+        }
+
+        // Get a list of indices for recently changed features; does nothing for any other trigger
+        static void collect_changed_indices(
+            const Position& pos,
+            const TriggerEvent trigger,
+            const Color perspective,
+            IndexList* const removed,
+            IndexList* const added) {
+
+            if (FeatureType::kRefreshTrigger == trigger) {  // only the feature's own trigger is handled
+              FeatureType::append_changed_indices(pos, perspective, removed, added);
+            }
+        }
+
+        // Befriend the base class and every FeatureSet instantiation so they can call the private collectors
+        friend class FeatureSetBase<FeatureSet>;
+
+        template <typename... FeatureTypes>
+        friend class FeatureSet;
+    };

 }  // namespace Eval::NNUE::Features

@@ -34,10 +34,10 @@ namespace Eval::NNUE::Features {
  // Trigger to perform full calculations instead of difference only
  enum class TriggerEvent {
    kNone, // Calculate the difference whenever possible
-    kFriendKingMoved, // calculate all when own ball moves
-    kEnemyKingMoved, // do all calculations when enemy balls move
-    kAnyKingMoved, // do all calculations if either ball moves
-    kAnyPieceMoved, // always do all calculations
+    kFriendKingMoved, // calculate full evaluation when own king moves
+    kEnemyKingMoved, // calculate full evaluation when opponent king moves
+    kAnyKingMoved, // calculate full evaluation when any king moves
+    kAnyPieceMoved, // always calculate full evaluation
  };

  enum class Side {
@@ -0,0 +1,93 @@
+/*
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+//Definition of input features HalfKA of NNUE evaluation function
+
+#include "half_ka.h"
+#include "index_list.h"
+
+namespace Eval::NNUE::Features {
+
+    // Orient a square according to perspective (rotate the board 180° for black).
+    // Important note for "halfka": this arch was designed with "flip" in mind,
+    // although it is still untested which approach is better.
+    // The rotation has to stay until we find a better arch that works with "flip";
+    // it allows us to use the current master net for gensfen (primarily needed for higher quality data).
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * 63));  // XOR with 63 when perspective is non-zero, with 0 otherwise
+    }
+
+    // Compute the feature index from the perspective, the piece square, the piece and the king square
+    template <Side AssociatedKing>
+    inline IndexType HalfKA<AssociatedKing>::make_index(
+        Color perspective,
+        Square s,
+        Piece pc,
+        Square ksq) {  // callers in this file pass an already oriented king square
+
+        return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective] + PS_END2 * ksq);  // oriented square + piece offset + PS_END2-sized block per king square
+    }
+
+    // Get a list of indices for active features (one index per piece on the board)
+    template <Side AssociatedKing>
+    void HalfKA<AssociatedKing>::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
+
+        Square ksq = orient(  // king square as seen from `perspective`
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));  // own king for kFriend, opponent's king otherwise
+
+        Bitboard bb = pos.pieces();  // no king mask is applied, so kings produce features too
+        while (bb) {  // loop until the bitboard is empty
+            Square s = pop_lsb(&bb);  // presumably extracts and clears the lowest set square
+            active->push_back(make_index(perspective, s, pos.piece_on(s), ksq));
+        }
+    }
+
+    // Get a list of indices for recently changed features, based on the dirty-piece record of the current state
+    template <Side AssociatedKing>
+    void HalfKA<AssociatedKing>::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
+
+        Square ksq = orient(  // king square as seen from `perspective`
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));  // own king for kFriend, opponent's king otherwise
+
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];  // kings are not skipped here
+
+            if (dp.from[i] != SQ_NONE)  // SQ_NONE as origin: nothing to remove for this entry
+                removed->push_back(make_index(perspective, dp.from[i], pc, ksq));
+
+            if (dp.to[i] != SQ_NONE)  // SQ_NONE as destination: nothing to add for this entry
+                added->push_back(make_index(perspective, dp.to[i], pc, ksq));
+        }
+    }
+
+    template class HalfKA<Side::kFriend>;  // explicit instantiation: member definitions live in this file
+    template class HalfKA<Side::kEnemy>;   // explicit instantiation for the enemy-king variant
+
+}  // namespace Eval::NNUE::Features
@@ -0,0 +1,75 @@
+/*
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef NNUE_FEATURES_HALF_KA_H_INCLUDED
+#define NNUE_FEATURES_HALF_KA_H_INCLUDED
+
+#include "features_common.h"
+
+#include "evaluate.h"
+
+// Definition of input features HalfKA of NNUE evaluation function
+namespace Eval::NNUE::Features {
+
+    // Feature HalfKA: Combination of the position of the associated king (own or enemy)
+    // and the position of every piece on the board, kings included
+    template <Side AssociatedKing>
+    class HalfKA {
+
+    public:
+        // Feature name
+        static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
+            "HalfKA(Friend)" : "HalfKA(Enemy)";
+
+        // Hash value embedded in the evaluation file
+        static constexpr std::uint32_t kHashValue =
+            0x5F134CB9u ^ (AssociatedKing == Side::kFriend);  // lowest bit is flipped for the Friend variant
+
+        // Number of feature dimensions
+        static constexpr IndexType kDimensions =
+            static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_END2);  // PS_END2 indices per king square
+
+        // Maximum number of simultaneously active features
+        static constexpr IndexType kMaxActiveDimensions = 32;  // kings are not excluded from the active set
+
+        // Trigger for full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger =
+            (AssociatedKing == Side::kFriend) ?
+            TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+
+        // Get a list of indices for active features
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
+
+        // Get a list of indices for recently changed features
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+
+    private:
+        // Index of a feature for a given king position and a piece on some square
+        static IndexType make_index(Color perspective, Square s, Piece pc, Square sq_k);
+    };
+
+}  // namespace Eval::NNUE::Features
+
+#endif // #ifndef NNUE_FEATURES_HALF_KA_H_INCLUDED
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)

-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.

-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.

-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

 //Definition of input features HalfKP of NNUE evaluation function
@@ -23,50 +23,72 @@

 namespace Eval::NNUE::Features {

-  // Orient a square according to perspective (rotates by 180 for black)
-  inline Square orient(Color perspective, Square s) {
-    return Square(int(s) ^ (bool(perspective) * 63));
-  }
-
-  // Find the index of the feature quantity from the king position and PieceSquare
-  template <Side AssociatedKing>
-  inline IndexType HalfKP<AssociatedKing>::MakeIndex(
-      Color perspective, Square s, Piece pc, Square ksq) {
-
-    return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective] + PS_END * ksq);
-  }
-
-  // Get a list of indices for active features
-  template <Side AssociatedKing>
-  void HalfKP<AssociatedKing>::AppendActiveIndices(
-      const Position& pos, Color perspective, IndexList* active) {
-
-    Square ksq = orient(perspective, pos.square<KING>(perspective));
-    Bitboard bb = pos.pieces() & ~pos.pieces(KING);
-    while (bb) {
-      Square s = pop_lsb(&bb);
-      active->push_back(MakeIndex(perspective, s, pos.piece_on(s), ksq));
+    // Orient a square according to perspective (rotate the board 180° for black)
+    // this has to stay until we find a better arch that works with "flip".
+    // allows us to use current master net for gensfen (primarily needed for higher quality data)
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * 63));
    }
-  }

-  // Get a list of indices for recently changed features
-  template <Side AssociatedKing>
-  void HalfKP<AssociatedKing>::AppendChangedIndices(
-      const Position& pos, Color perspective,
-      IndexList* removed, IndexList* added) {
+    // Find the index of the feature quantity from the king position and PieceSquare
+    template <Side AssociatedKing>
+    inline IndexType HalfKP<AssociatedKing>::make_index(
+        Color perspective,
+        Square s,
+        Piece pc,
+        Square ksq) {

-    Square ksq = orient(perspective, pos.square<KING>(perspective));
-    const auto& dp = pos.state()->dirtyPiece;
-    for (int i = 0; i < dp.dirty_num; ++i) {
-      Piece pc = dp.piece[i];
-      if (type_of(pc) == KING) continue;
-      if (dp.from[i] != SQ_NONE)
-        removed->push_back(MakeIndex(perspective, dp.from[i], pc, ksq));
-      if (dp.to[i] != SQ_NONE)
-        added->push_back(MakeIndex(perspective, dp.to[i], pc, ksq));
+        return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective] + PS_END * ksq);
    }
-  }

-  template class HalfKP<Side::kFriend>;
+    // Get a list of indices for active features
+    template <Side AssociatedKing>
+    void HalfKP<AssociatedKing>::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
+
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));
+
+        Bitboard bb = pos.pieces() & ~pos.pieces(KING);
+        while (bb) {
+            Square s = pop_lsb(&bb);
+            active->push_back(make_index(perspective, s, pos.piece_on(s), ksq));
+        }
+    }
+
+    // Get a list of indices for recently changed features
+    template <Side AssociatedKing>
+    void HalfKP<AssociatedKing>::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
+
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));
+
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];
+
+            if (type_of(pc) == KING)
+                continue;
+
+            if (dp.from[i] != SQ_NONE)
+                removed->push_back(make_index(perspective, dp.from[i], pc, ksq));
+
+            if (dp.to[i] != SQ_NONE)
+                added->push_back(make_index(perspective, dp.to[i], pc, ksq));
+        }
+    }
+
+    template class HalfKP<Side::kFriend>;
+    template class HalfKP<Side::kEnemy>;

 }  // namespace Eval::NNUE::Features
@@ -1,62 +1,74 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)

-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.

-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.

-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

-//Definition of input features HalfKP of NNUE evaluation function
-
 #ifndef NNUE_FEATURES_HALF_KP_H_INCLUDED
 #define NNUE_FEATURES_HALF_KP_H_INCLUDED

-#include "../../evaluate.h"
 #include "features_common.h"

+#include "evaluate.h"
+
+//Definition of input features HalfKP of NNUE evaluation function
 namespace Eval::NNUE::Features {

-  // Feature HalfKP: Combination of the position of own king
-  // and the position of pieces other than kings
-  template <Side AssociatedKing>
-  class HalfKP {
+    // Feature HalfKP: Combination of the position of own king
+    // and the position of pieces other than kings
+    template <Side AssociatedKing>
+    class HalfKP {

-   public:
-    // Feature name
-    static constexpr const char* kName = "HalfKP(Friend)";
-    // Hash value embedded in the evaluation file
-    static constexpr std::uint32_t kHashValue =
-        0x5D69D5B9u ^ (AssociatedKing == Side::kFriend);
-    // Number of feature dimensions
-    static constexpr IndexType kDimensions =
-        static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_END);
-    // Maximum number of simultaneously active features
-    static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
-    // Trigger for full calculation instead of difference calculation
-    static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kFriendKingMoved;
+    public:
+        // Feature name
+        static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
+            "HalfKP(Friend)" : "HalfKP(Enemy)";

-    // Get a list of indices for active features
-    static void AppendActiveIndices(const Position& pos, Color perspective,
-                                    IndexList* active);
+        // Hash value embedded in the evaluation file
+        static constexpr std::uint32_t kHashValue =
+            0x5D69D5B9u ^ (AssociatedKing == Side::kFriend);

-    // Get a list of indices for recently changed features
-    static void AppendChangedIndices(const Position& pos, Color perspective,
-                                     IndexList* removed, IndexList* added);
+        // Number of feature dimensions
+        static constexpr IndexType kDimensions =
+            static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_END);

-   private:
-    // Index of a feature for a given king position and another piece on some square
-    static IndexType MakeIndex(Color perspective, Square s, Piece pc, Square sq_k);
-  };
+        // Maximum number of simultaneously active features
+        static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
+
+        // Trigger for full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger =
+            (AssociatedKing == Side::kFriend) ?
+            TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+
+        // Get a list of indices for active features
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
+
+        // Get a list of indices for recently changed features
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+
+    private:
+        // Index of a feature for a given king position and another piece on some square
+        static IndexType make_index(Color perspective, Square s, Piece pc, Square sq_k);
+    };

 }  // namespace Eval::NNUE::Features

@@ -0,0 +1,90 @@
+#include "half_relative_ka.h"
+#include "index_list.h"
+
+//Definition of input features HalfRelativeKA of NNUE evaluation function
+namespace Eval::NNUE::Features {
+
+    // Orient a square according to perspective (rotate the board 180° for black).
+    // Important note for "halfka": this arch was designed with "flip" in mind,
+    // although it is still untested which approach is better.
+    // The rotation has to stay until we find a better arch that works with "flip";
+    // it allows us to use the current master net for gensfen (primarily needed for higher quality data).
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * 63));  // XOR with 63 when perspective is non-zero, with 0 otherwise
+    }
+
+    // Find the index of the feature quantity from the king position and a piece on a square
+    template <Side AssociatedKing>
+    inline IndexType HalfRelativeKA<AssociatedKing>::make_index(
+        Color perspective,
+        Square s,
+        Piece pc,
+        Square sq_k) {  // callers in this file pass an already oriented king square
+
+        const IndexType p = IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);  // combined piece/square index
+        return make_index(sq_k, p);  // delegate to the (king square, piece-square index) overload
+    }
+
+    // Find the index of the feature quantity from the king position and a combined piece/square index
+    template <Side AssociatedKing>
+    inline IndexType HalfRelativeKA<AssociatedKing>::make_index(
+        Square sq_k,
+        IndexType p) {
+
+        constexpr IndexType W = kBoardWidth;
+        constexpr IndexType H = kBoardHeight;
+        const IndexType piece_index = (p - PS_W_PAWN) / SQUARE_NB;  // which SQUARE_NB-sized block p falls in, counted from PS_W_PAWN
+        const Square sq_p = static_cast<Square>((p - PS_W_PAWN) % SQUARE_NB);  // square inside that block
+        const IndexType relative_file = file_of(sq_p) - file_of(sq_k) + (W / 2);  // file offset from the king, shifted by W / 2
+        const IndexType relative_rank = rank_of(sq_p) - rank_of(sq_k) + (H / 2);  // rank offset from the king, shifted by H / 2
+        return H * W * piece_index + H * relative_file + relative_rank;  // layout: [piece_index][relative_file][relative_rank]
+    }
+
+    // Get a list of indices for active features (one index per piece on the board)
+    template <Side AssociatedKing>
+    void HalfRelativeKA<AssociatedKing>::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
+
+        Square ksq = orient(  // king square as seen from `perspective`
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));  // own king for kFriend, opponent's king otherwise
+
+        Bitboard bb = pos.pieces();  // no king mask is applied, so kings produce features too
+        while (bb) {  // loop until the bitboard is empty
+            Square s = pop_lsb(&bb);  // presumably extracts and clears the lowest set square
+            active->push_back(make_index(perspective, s, pos.piece_on(s), ksq));
+        }
+    }
+
+    // Get a list of indices for recently changed features, based on the dirty-piece record of the current state
+    template <Side AssociatedKing>
+    void HalfRelativeKA<AssociatedKing>::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
+
+        Square ksq = orient(  // king square as seen from `perspective`
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));  // own king for kFriend, opponent's king otherwise
+
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];  // kings are not skipped here
+
+            if (dp.from[i] != SQ_NONE)  // SQ_NONE as origin: nothing to remove for this entry
+                removed->push_back(make_index(perspective, dp.from[i], pc, ksq));
+
+            if (dp.to[i] != SQ_NONE)  // SQ_NONE as destination: nothing to add for this entry
+                added->push_back(make_index(perspective, dp.to[i], pc, ksq));
+        }
+    }
+
+    template class HalfRelativeKA<Side::kFriend>;  // explicit instantiation: member definitions live in this file
+    template class HalfRelativeKA<Side::kEnemy>;   // explicit instantiation for the enemy-king variant
+
+}  // namespace Eval::NNUE::Features
@@ -0,0 +1,68 @@
+#ifndef _NNUE_FEATURES_HALF_RELATIVE_KA_H_
+#define _NNUE_FEATURES_HALF_RELATIVE_KA_H_
+
+#include "features_common.h"
+
+#include "evaluate.h"
+
+// Definition of input features HalfRelativeKA of NNUE evaluation function
+// K - King
+// A - Any piece
+// KA - product of K and A
+namespace Eval::NNUE::Features {
+
+    // Feature HalfRelativeKA: position of every piece, kings included, relative to the own or enemy king
+    template <Side AssociatedKing>
+    class HalfRelativeKA {
+    public:
+        // feature quantity name
+        static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
+            "HalfRelativeKA(Friend)" : "HalfRelativeKA(Enemy)";
+
+        // Hash value embedded in the evaluation function file (lowest bit differs between Friend and Enemy)
+        static constexpr std::uint32_t kHashValue =
+            0xA123051Fu ^ (AssociatedKing == Side::kFriend);
+
+        static constexpr IndexType kNumPieceKinds = 6 * 2;  // presumably 6 piece types x 2 colors (kings count here)
+
+        // width of the virtual board with the king in the center
+        static constexpr IndexType kBoardWidth = FILE_NB * 2 - 1;
+
+        // height of the virtual board with the king in the center
+        static constexpr IndexType kBoardHeight = RANK_NB * 2 - 1;
+
+        // number of feature dimensions
+        static constexpr IndexType kDimensions =
+            kNumPieceKinds * kBoardHeight * kBoardWidth;
+
+        // Maximum number of features that can have value 1 at the same time
+        static constexpr IndexType kMaxActiveDimensions = 32;
+
+        // Event that requires a full recalculation instead of an incremental (difference) update
+        static constexpr TriggerEvent kRefreshTrigger =
+            (AssociatedKing == Side::kFriend) ?
+            TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+
+        // Append to `active` the indices of all features with value 1 in `pos`
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
+
+        // Append to `removed` / `added` the indices of features whose value changed since the previous position
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+
+        // Feature index from the king square `s` (the definition's body uses it as sq_k) and a PieceSquare value `p`
+        static IndexType make_index(Square s, IndexType p);
+
+        // Feature index for piece `pc` on square `s`, relative to the king on `sq_k`, as seen from `perspective`
+        static IndexType make_index(Color perspective, Square s, Piece pc, Square sq_k);
+    };
+
+}  // namespace Eval::NNUE::Features
+
+#endif // #ifndef _NNUE_FEATURES_HALF_RELATIVE_KA_H_
@@ -1,78 +1,91 @@
-//Definition of input features HalfRelativeKP of NNUE evaluation function
-
-#if defined(EVAL_NNUE)
-
-#include "half_relative_kp.h"
+#include "half_relative_kp.h"
 #include "index_list.h"

-namespace Eval {
+//Definition of input features HalfRelativeKP of NNUE evaluation function
+namespace Eval::NNUE::Features {

-namespace NNUE {
+    // Orient a square according to perspective: unchanged when perspective is zero, rotated 180° otherwise
+    // (presumably zero is white and non-zero is black). This has to stay until we find a better arch that
+    // works with "flip"; it lets us use the current master net for gensfen (needed for higher quality data).
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * 63));  // s ^ 63 == 63 - s for s in 0..63
+    }

-namespace Features {
+    // Find the index of the feature quantity from the king position and PieceSquare
+    template <Side AssociatedKing>
+    inline IndexType HalfRelativeKP<AssociatedKing>::make_index(
+        Color perspective,
+        Square s,
+        Piece pc,
+        Square sq_k) {

-// Orient a square according to perspective (rotates by 180 for black)
-inline Square orient(Color perspective, Square s) {
-  return Square(int(s) ^ (bool(perspective) * 63));
-}
+        const IndexType p = IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
+        return make_index(sq_k, p);
+    }

-// Find the index of the feature quantity from the ball position and PieceSquare
-template <Side AssociatedKing>
-inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
-  Color perspective, Square s, Piece pc, Square sq_k) {
-  const IndexType p = IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
-  return MakeIndex(sq_k, p);
-}
+    // Find the index of the feature quantity from the king position and PieceSquare
+    template <Side AssociatedKing>
+    inline IndexType HalfRelativeKP<AssociatedKing>::make_index(
+        Square sq_k,
+        IndexType p) {

-// Find the index of the feature quantity from the ball position and PieceSquare
-template <Side AssociatedKing>
-inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
-    Square sq_k, IndexType p) {
-  constexpr IndexType W = kBoardWidth;
-  constexpr IndexType H = kBoardHeight;
-  const IndexType piece_index = (p - PS_W_PAWN) / SQUARE_NB;
-  const Square sq_p = static_cast<Square>((p - PS_W_PAWN) % SQUARE_NB);
-  const IndexType relative_file = file_of(sq_p) - file_of(sq_k) + (W / 2);
-  const IndexType relative_rank = rank_of(sq_p) - rank_of(sq_k) + (H / 2);
-  return H * W * piece_index + H * relative_file + relative_rank;
-}
+        constexpr IndexType W = kBoardWidth;
+        constexpr IndexType H = kBoardHeight;
+        const IndexType piece_index = (p - PS_W_PAWN) / SQUARE_NB;
+        const Square sq_p = static_cast<Square>((p - PS_W_PAWN) % SQUARE_NB);
+        const IndexType relative_file = file_of(sq_p) - file_of(sq_k) + (W / 2);
+        const IndexType relative_rank = rank_of(sq_p) - rank_of(sq_k) + (H / 2);
+        return H * W * piece_index + H * relative_file + relative_rank;
+    }

-// Get a list of indices with a value of 1 among the features
-template <Side AssociatedKing>
-void HalfRelativeKP<AssociatedKing>::AppendActiveIndices(
-    const Position& pos, Color perspective, IndexList* active) {
-  Square ksq = orient(perspective, pos.square<KING>(perspective));
-  Bitboard bb = pos.pieces() & ~pos.pieces(KING);
-  while (bb) {
-    Square s = pop_lsb(&bb);
-    active->push_back(MakeIndex(perspective, s, pos.piece_on(s), ksq));
-  }
-}
+    // Get a list of indices with a value of 1 among the features
+    template <Side AssociatedKing>
+    void HalfRelativeKP<AssociatedKing>::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {

-// Get a list of indices whose values have changed from the previous one in the feature quantity
-template <Side AssociatedKing>
-void HalfRelativeKP<AssociatedKing>::AppendChangedIndices(
-    const Position& pos, Color perspective,
-    IndexList* removed, IndexList* added) {
-  Square ksq = orient(perspective, pos.square<KING>(perspective));
-  const auto& dp = pos.state()->dirtyPiece;
-  for (int i = 0; i < dp.dirty_num; ++i) {
-    Piece pc = dp.piece[i];
-    if (type_of(pc) == KING) continue;
-    if (dp.from[i] != SQ_NONE)
-      removed->push_back(MakeIndex(perspective, dp.from[i], pc, ksq));
-    if (dp.to[i] != SQ_NONE)
-      added->push_back(MakeIndex(perspective, dp.to[i], pc, ksq));
-  }
-}
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));

-template class HalfRelativeKP<Side::kFriend>;
-template class HalfRelativeKP<Side::kEnemy>;
+        Bitboard bb = pos.pieces() & ~pos.pieces(KING);
+        while (bb) {
+            Square s = pop_lsb(&bb);
+            active->push_back(make_index(perspective, s, pos.piece_on(s), ksq));
+        }
+    }

-}  // namespace Features
+    // Get a list of indices whose values have changed from the previous one in the feature quantity
+    template <Side AssociatedKing>
+    void HalfRelativeKP<AssociatedKing>::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {

-}  // namespace NNUE
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));

-}  // namespace Eval
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];

-#endif  // defined(EVAL_NNUE)
+            if (type_of(pc) == KING)
+                continue;
+
+            if (dp.from[i] != SQ_NONE)
+                removed->push_back(make_index(perspective, dp.from[i], pc, ksq));
+
+            if (dp.to[i] != SQ_NONE)
+                added->push_back(make_index(perspective, dp.to[i], pc, ksq));
+        }
+    }
+
+    template class HalfRelativeKP<Side::kFriend>;
+    template class HalfRelativeKP<Side::kEnemy>;
+
+}  // namespace Eval::NNUE::Features
@@ -1,65 +1,66 @@
-//Definition of input features HalfRelativeKP of NNUE evaluation function
-
-#ifndef _NNUE_FEATURES_HALF_RELATIVE_KP_H_
+#ifndef _NNUE_FEATURES_HALF_RELATIVE_KP_H_
 #define _NNUE_FEATURES_HALF_RELATIVE_KP_H_

-#if defined(EVAL_NNUE)
-
-#include "../../evaluate.h"
 #include "features_common.h"

-namespace Eval {
+#include "evaluate.h"

-namespace NNUE {
+//Definition of input features HalfRelativeKP of NNUE evaluation function
+namespace Eval::NNUE::Features {

-namespace Features {
+    // Feature HalfRelativeKP: Relative position of each piece other than the king based on own king or enemy king
+    template <Side AssociatedKing>
+    class HalfRelativeKP {
+    public:
+        // feature quantity name
+        static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
+            "HalfRelativeKP(Friend)" : "HalfRelativeKP(Enemy)";

-// Feature HalfRelativeKP: Relative position of each piece other than the ball based on own ball or enemy ball
-template <Side AssociatedKing>
-class HalfRelativeKP {
- public:
-  // feature quantity name
-  static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
-      "HalfRelativeKP(Friend)" : "HalfRelativeKP(Enemy)";
-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t kHashValue =
-      0xF9180919u ^ (AssociatedKing == Side::kFriend);
-  // Piece type excluding balls
-  static constexpr IndexType kNumPieceKinds = 5 * 2;
-  // width of the virtual board with the ball in the center
-  static constexpr IndexType kBoardWidth = FILE_NB * 2 - 1;
-  // height of a virtual board with balls in the center
-  static constexpr IndexType kBoardHeight = RANK_NB * 2 - 1;
-  // number of feature dimensions
-  static constexpr IndexType kDimensions =
-      kNumPieceKinds * kBoardHeight * kBoardWidth;
-  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-  static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
-  // Timing of full calculation instead of difference calculation
-  static constexpr TriggerEvent kRefreshTrigger =
-      (AssociatedKing == Side::kFriend) ?
-      TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue =
+            0xF9180919u ^ (AssociatedKing == Side::kFriend);

-  // Get a list of indices with a value of 1 among the features
-  static void AppendActiveIndices(const Position& pos, Color perspective,
-                                  IndexList* active);
+        // Piece types excluding kings
+        static constexpr IndexType kNumPieceKinds = 5 * 2;

-  // Get a list of indices whose values have changed from the previous one in the feature quantity
-  static void AppendChangedIndices(const Position& pos, Color perspective,
-                                   IndexList* removed, IndexList* added);
+        // width of the virtual board with the king in the center
+        static constexpr IndexType kBoardWidth = FILE_NB * 2 - 1;

-  // Find the index of the feature quantity from the ball position and PieceSquare
-  static IndexType MakeIndex(Square s, IndexType p);
-  // Find the index of the feature quantity from the ball position and PieceSquare
-  static IndexType MakeIndex(Color perspective, Square s, Piece pc, Square sq_k);
-};
+        // height of the virtual board with the king in the center
+        static constexpr IndexType kBoardHeight = RANK_NB * 2 - 1;

-}  // namespace Features
+        // number of feature dimensions
+        static constexpr IndexType kDimensions =
+            kNumPieceKinds * kBoardHeight * kBoardWidth;

-}  // namespace NNUE
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count

-}  // namespace Eval
+        // Timing of full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger =
+            (AssociatedKing == Side::kFriend) ?
+            TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;

-#endif  // defined(EVAL_NNUE)
+        // Get a list of indices with a value of 1 among the features
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
+
+        // Get a list of indices whose values have changed from the previous one in the feature quantity
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+
+        // Find the index of the feature quantity from the king position and PieceSquare
+        static IndexType make_index(Square s, IndexType p);
+
+        // Find the index of the feature quantity from the king position and PieceSquare
+        static IndexType make_index(Color perspective, Square s, Piece pc, Square sq_k);
+    };
+
+}  // namespace Eval::NNUE::Features

 #endif
@@ -1,58 +1,45 @@
-//Definition of input feature quantity K of NNUE evaluation function
-
-#if defined(EVAL_NNUE)
-
-#include "k.h"
+#include "k.h"
 #include "index_list.h"

-namespace Eval {
+//Definition of input feature quantity K of NNUE evaluation function
+namespace Eval::NNUE::Features {

-namespace NNUE {
+    // Orient a square according to perspective: unchanged when perspective is zero, rotated 180° otherwise
+    // (presumably zero is white and non-zero is black). This has to stay until we find a better arch that
+    // works with "flip"; it lets us use the current master net for gensfen (needed for higher quality data).
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * 63));  // s ^ 63 == 63 - s for s in 0..63
+    }

-namespace Features {
+    // Index of a feature for a given king position: the oriented square, plus 64 when king_color differs from perspective.
+    IndexType K::make_index(Color perspective, Square s, Color king_color) {
+        return IndexType(orient(perspective, s) + bool(perspective ^ king_color) * 64);  // XOR is non-zero only when the two colors differ
+    }

-// Orient a square according to perspective (rotates by 180 for black)
-inline Square orient(Color perspective, Square s) {
-  return Square(int(s) ^ (bool(perspective) * 63));
-}
+    // Get a list of indices with a value of 1 among the features
+    void K::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {

-// Index of a feature for a given king position.
-IndexType K::MakeIndex(Color perspective, Square s, Color king_color) {
-  return IndexType(orient(perspective, s) + bool(perspective ^ king_color) * 64);
-}
+        for (auto color : Colors) {
+          active->push_back(make_index(perspective, pos.square<KING>(color), color));
+        }
+    }

-// Get a list of indices with a value of 1 among the features
-void K::AppendActiveIndices(
-    const Position& pos, Color perspective, IndexList* active) {
-  for (auto color : Colors) {
-    active->push_back(MakeIndex(perspective, pos.square<KING>(color), color));
-  }
-}
+    // Get a list of indices whose values have changed from the previous one in the feature quantity
+    void K::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {

-// Get a list of indices whose values have changed from the previous one in the feature quantity
-void K::AppendChangedIndices(
-    const Position& pos, Color perspective,
-    IndexList* removed, IndexList* added) {
-  const auto& dp = pos.state()->dirtyPiece;
-  Color king_color;
-  if (dp.piece[0] == Piece::W_KING) {
-    king_color = WHITE;
-  }
-  else if (dp.piece[0] == Piece::B_KING) {
-    king_color = BLACK;
-  }
-  else {
-    return;
-  }
+        const auto& dp = pos.state()->dirtyPiece;
+        if (type_of(dp.piece[0]) == KING)
+        {
+            removed->push_back(make_index(perspective, dp.from[0], color_of(dp.piece[0])));
+            added->push_back(make_index(perspective, dp.to[0], color_of(dp.piece[0])));
+        }
+    }

-  removed->push_back(MakeIndex(perspective, dp.from[0], king_color));
-  added->push_back(MakeIndex(perspective, dp.to[0], king_color));
-}
-
-}  // namespace Features
-
-}  // namespace NNUE
-
-}  // namespace Eval
-
-#endif  // defined(EVAL_NNUE)
+}  // namespace Eval::NNUE::Features
@@ -1,52 +1,49 @@
-//Definition of input feature quantity K of NNUE evaluation function
-
-#ifndef _NNUE_FEATURES_K_H_
+#ifndef _NNUE_FEATURES_K_H_
 #define _NNUE_FEATURES_K_H_

-#if defined(EVAL_NNUE)
-
-#include "../../evaluate.h"
 #include "features_common.h"

-namespace Eval {
+#include "evaluate.h"

-namespace NNUE {
+//Definition of input feature quantity K of NNUE evaluation function
+namespace Eval::NNUE::Features {

-namespace Features {
+    // Feature K: King position
+    class K {
+    public:
+        // feature quantity name
+        static constexpr const char* kName = "K";

-// Feature K: Ball position
-class K {
- public:
-  // feature quantity name
-  static constexpr const char* kName = "K";
-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t kHashValue = 0xD3CEE169u;
-  // number of feature dimensions
-  static constexpr IndexType kDimensions = SQUARE_NB * 2;
-  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-  static constexpr IndexType kMaxActiveDimensions = 2;
-  // Timing of full calculation instead of difference calculation
-  static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue = 0xD3CEE169u;

-  // Get a list of indices with a value of 1 among the features
-  static void AppendActiveIndices(const Position& pos, Color perspective,
-                                  IndexList* active);
+        // number of feature dimensions
+        static constexpr IndexType kDimensions = SQUARE_NB * 2;

-  // Get a list of indices whose values have changed from the previous one in the feature quantity
-  static void AppendChangedIndices(const Position& pos, Color perspective,
-                                   IndexList* removed, IndexList* added);
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions = 2;

-private:
-  // Index of a feature for a given king position.
-  static IndexType MakeIndex(Color perspective, Square s, Color king_color);
-};
+        // Timing of full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;

-}  // namespace Features
+        // Get a list of indices with a value of 1 among the features
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);

-}  // namespace NNUE
+        // Get a list of indices whose values have changed from the previous one in the feature quantity
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);

-}  // namespace Eval
+    private:
+        // Index of a feature for a given king position.
+        static IndexType make_index(Color perspective, Square s, Color king_color);
+    };

-#endif  // defined(EVAL_NNUE)
+}  // namespace Eval::NNUE::Features

 #endif
@@ -1,56 +1,55 @@
-//Definition of input feature P of NNUE evaluation function
-
-#if defined(EVAL_NNUE)
-
-#include "p.h"
+#include "p.h"
 #include "index_list.h"

-namespace Eval {
+//Definition of input feature P of NNUE evaluation function
+namespace Eval::NNUE::Features {

-namespace NNUE {
+    // Orient a square according to perspective: unchanged when perspective is zero, rotated 180° otherwise
+    // (presumably zero is white and non-zero is black). This has to stay until we find a better arch that
+    // works with "flip"; it lets us use the current master net for gensfen (needed for higher quality data).
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * 63));  // s ^ 63 == 63 - s for s in 0..63
+    }

-namespace Features {
+    // Index of a feature for a given piece on some square (no king square is involved for feature P)
+    inline IndexType P::make_index(
+        Color perspective, Square s, Piece pc) {
+        return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);  // oriented square plus the table offset for (pc, perspective)
+    }

-// Orient a square according to perspective (rotates by 180 for black)
-inline Square orient(Color perspective, Square s) {
-  return Square(int(s) ^ (bool(perspective) * 63));
-}
+    // Get a list of indices with a value of 1 among the features
+    void P::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {

-// Find the index of the feature quantity from the king position and PieceSquare
-inline IndexType P::MakeIndex(
-  Color perspective, Square s, Piece pc) {
-  return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
-}
+        Bitboard bb = pos.pieces() & ~pos.pieces(KING);
+        while (bb) {
+            Square s = pop_lsb(&bb);
+            active->push_back(make_index(perspective, s, pos.piece_on(s)));
+        }
+    }

-// Get a list of indices with a value of 1 among the features
-void P::AppendActiveIndices(
-    const Position& pos, Color perspective, IndexList* active) {
-  Bitboard bb = pos.pieces() & ~pos.pieces(KING);
-  while (bb) {
-    Square s = pop_lsb(&bb);
-    active->push_back(MakeIndex(perspective, s, pos.piece_on(s)));
-  }
-}
+    // Get a list of indices whose values have changed from the previous one in the feature quantity
+    void P::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {

-// Get a list of indices whose values have changed from the previous one in the feature quantity
-void P::AppendChangedIndices(
-    const Position& pos, Color perspective,
-    IndexList* removed, IndexList* added) {
-  const auto& dp = pos.state()->dirtyPiece;
-  for (int i = 0; i < dp.dirty_num; ++i) {
-    Piece pc = dp.piece[i];
-    if (type_of(pc) == KING) continue;
-    if (dp.from[i] != SQ_NONE)
-      removed->push_back(MakeIndex(perspective, dp.from[i], pc));
-    if (dp.to[i] != SQ_NONE)
-      added->push_back(MakeIndex(perspective, dp.to[i], pc));
-  }
-}
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];

-}  // namespace Features
+            if (type_of(pc) == KING)
+              continue;

-}  // namespace NNUE
+            if (dp.from[i] != SQ_NONE)
+              removed->push_back(make_index(perspective, dp.from[i], pc));

-}  // namespace Eval
+            if (dp.to[i] != SQ_NONE)
+              added->push_back(make_index(perspective, dp.to[i], pc));
+        }
+    }

-#endif  // defined(EVAL_NNUE)
+}  // namespace Eval::NNUE::Features
@@ -1,52 +1,49 @@
-//Definition of input feature P of NNUE evaluation function
-
-#ifndef _NNUE_FEATURES_P_H_
+#ifndef _NNUE_FEATURES_P_H_
 #define _NNUE_FEATURES_P_H_

-#if defined(EVAL_NNUE)
-
-#include "../../evaluate.h"
 #include "features_common.h"

-namespace Eval {
+#include "evaluate.h"

-namespace NNUE {
+//Definition of input feature P of NNUE evaluation function
+namespace Eval::NNUE::Features {

-namespace Features {
+    // Feature P: PieceSquare of pieces other than kings
+    class P {
+    public:
+        // feature quantity name
+        static constexpr const char* kName = "P";

-// Feature P: PieceSquare of pieces other than balls
-class P {
- public:
-  // feature quantity name
-  static constexpr const char* kName = "P";
-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t kHashValue = 0x764CFB4Bu;
-  // number of feature dimensions
-  static constexpr IndexType kDimensions = PS_END;
-  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-  static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
-  // Timing of full calculation instead of difference calculation
-  static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue = 0x764CFB4Bu;

-  // Get a list of indices with a value of 1 among the features
-  static void AppendActiveIndices(const Position& pos, Color perspective,
-                                  IndexList* active);
+        // number of feature dimensions
+        static constexpr IndexType kDimensions = PS_END;

-  // Get a list of indices whose values have changed from the previous one in the feature quantity
-  static void AppendChangedIndices(const Position& pos, Color perspective,
-                                   IndexList* removed, IndexList* added);
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count

- private:
-  // Index of a feature for a given piece on some square
-  static IndexType MakeIndex(Color perspective, Square s, Piece pc);
-};
+        // Timing of full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;

-}  // namespace Features
+        // Get a list of indices with a value of 1 among the features
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);

-}  // namespace NNUE
+        // Get a list of indices whose values have changed from the previous one in the feature quantity
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);

-}  // namespace Eval
+    private:
+        // Index of a feature for a given piece on some square
+        static IndexType make_index(Color perspective, Square s, Piece pc);
+    };

-#endif  // defined(EVAL_NNUE)
+}  // namespace Eval::NNUE::Features

 #endif
@@ -24,6 +24,10 @@
 #include <iostream>
 #include "../nnue_common.h"

+#include <string>
+#include <type_traits>
+#include <cstdint>
+
 namespace Eval::NNUE::Layers {

  // Affine transformation layer
@@ -50,6 +54,8 @@ namespace Eval::NNUE::Layers {
    static constexpr std::size_t kBufferSize =
        PreviousLayer::kBufferSize + kSelfBufferSize;

+    static constexpr int kLayerIndex = PreviousLayer::kLayerIndex + 1;
+
    // Hash value embedded in the evaluation file
    static constexpr std::uint32_t GetHashValue() {
      std::uint32_t hash_value = 0xCC03DAE4u;
@@ -59,14 +65,27 @@ namespace Eval::NNUE::Layers {
      return hash_value;
    }

-    // A string that represents the structure from the input layer to this layer
-    static std::string GetStructureString() {
-      return "AffineTransform[" +
-        std::to_string(kOutputDimensions) + "<-" +
-        std::to_string(kInputDimensions) + "](" +
-        PreviousLayer::GetStructureString() + ")";
+    static std::string get_name() {  // label for this layer alone: "AffineTransform[<output dims><-<input dims>]"
+        return "AffineTransform[" +
+            std::to_string(kOutputDimensions) + "<-" +
+            std::to_string(kInputDimensions) + "]";
     }
-    
+
+    // Structure from the input layer to this layer: get_name() followed by the previous layer's structure in parentheses
+    static std::string get_structure_string() {
+        return get_name() + "(" +
+            PreviousLayer::get_structure_string() + ")";
+    }
+
+    static std::string get_layers_info() {  // previous layers' listing plus one "\n  - <kLayerIndex> - <get_name()>" entry
+        std::string info = PreviousLayer::get_layers_info();  // entries of all earlier layers come first
+        info += "\n  - ";
+        info += std::to_string(kLayerIndex);
+        info += " - ";
+        info += get_name();
+        return info;
+    }
+
   // Read network parameters
    bool ReadParameters(std::istream& stream) {
      if (!previous_layer_.ReadParameters(stream)) return false;
@@ -79,13 +98,17 @@ namespace Eval::NNUE::Layers {

    // write parameters
    bool WriteParameters(std::ostream& stream) const {
-      if (!previous_layer_.WriteParameters(stream)) return false;
-      stream.write(reinterpret_cast<const char*>(biases_),
-        kOutputDimensions * sizeof(BiasType));
-      stream.write(reinterpret_cast<const char*>(weights_),
-        kOutputDimensions * kPaddedInputDimensions *
-        sizeof(WeightType));
-      return !stream.fail();
+        if (!previous_layer_.WriteParameters(stream))
+            return false;
+
+        stream.write(reinterpret_cast<const char*>(biases_),
+            kOutputDimensions * sizeof(BiasType));
+
+        stream.write(reinterpret_cast<const char*>(weights_),
+            kOutputDimensions * kPaddedInputDimensions *
+            sizeof(WeightType));
+
+        return !stream.fail();
    }

    // Forward propagation
@@ -93,113 +116,606 @@ namespace Eval::NNUE::Layers {
        const TransformedFeatureType* transformed_features, char* buffer) const {
      const auto input = previous_layer_.Propagate(
          transformed_features, buffer + kSelfBufferSize);
+
+#if defined (USE_AVX512)
+
+      [[maybe_unused]] const __m512i kOnes512 = _mm512_set1_epi16(1);
+
+      [[maybe_unused]] auto m512_hadd = [](__m512i sum, int bias) -> int {
+        return _mm512_reduce_add_epi32(sum) + bias;
+      };
+
+      // This function takes
+      //   sum0 = [xmm0a, xmm0b, xmm0c, xmm0d]
+      //   sum1 = [xmm1a, xmm1b, xmm1c, xmm1d]
+      //   sum2 = [xmm2a, xmm2b, xmm2c, xmm2d]
+      //   sum3 = [xmm3a, xmm3b, xmm3c, xmm3d]
+      // and returns
+      //   ret = [
+      //     reduce_add_epi32(xmm0a), reduce_add_epi32(xmm1a), reduce_add_epi32(xmm2a), reduce_add_epi32(xmm3a),
+      //     reduce_add_epi32(xmm0b), reduce_add_epi32(xmm1b), reduce_add_epi32(xmm2b), reduce_add_epi32(xmm3b),
+      //     reduce_add_epi32(xmm0c), reduce_add_epi32(xmm1c), reduce_add_epi32(xmm2c), reduce_add_epi32(xmm3c),
+      //     reduce_add_epi32(xmm0d), reduce_add_epi32(xmm1d), reduce_add_epi32(xmm2d), reduce_add_epi32(xmm3d)
+      //   ]
+      [[maybe_unused]] auto m512_hadd128x16_interleave = [](
+        __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3) -> __m512i {
+
+        __m512i sum01a = _mm512_unpacklo_epi32(sum0, sum1);
+        __m512i sum01b = _mm512_unpackhi_epi32(sum0, sum1);
+
+        __m512i sum23a = _mm512_unpacklo_epi32(sum2, sum3);
+        __m512i sum23b = _mm512_unpackhi_epi32(sum2, sum3);
+
+        __m512i sum01 = _mm512_add_epi32(sum01a, sum01b);
+        __m512i sum23 = _mm512_add_epi32(sum23a, sum23b);
+
+        __m512i sum0123a = _mm512_unpacklo_epi64(sum01, sum23);
+        __m512i sum0123b = _mm512_unpackhi_epi64(sum01, sum23);
+
+        return _mm512_add_epi32(sum0123a, sum0123b);
+      };
+
+      [[maybe_unused]] auto m512_haddx4 = [m512_hadd128x16_interleave](
+        __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, __m128i bias) -> __m128i {
+
+        __m512i sum = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3);
+
+        __m256i sum256lo = _mm512_castsi512_si256(sum);
+        __m256i sum256hi = _mm512_extracti64x4_epi64(sum, 1);
+
+        sum256lo = _mm256_add_epi32(sum256lo, sum256hi);
+
+        __m128i sum128lo = _mm256_castsi256_si128(sum256lo);
+        __m128i sum128hi = _mm256_extracti128_si256(sum256lo, 1);
+
+        return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias);
+      };
+
+      [[maybe_unused]] auto m512_haddx8 = [m512_hadd128x16_interleave](
+        __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3,
+        __m512i sum4, __m512i sum5, __m512i sum6, __m512i sum7, __m256i bias) -> __m256i {
+
+        __m512i suma = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3);
+        __m512i sumb = m512_hadd128x16_interleave(sum4, sum5, sum6, sum7);
+
+        __m512i indices0 = _mm512_setr_epi64(0, 1, 8, 9, 4, 5, 12, 13);
+        __m512i indices1 = _mm512_setr_epi64(2, 3, 10, 11, 6, 7, 14, 15);
+        __m512i x = _mm512_add_epi32(
+          _mm512_permutex2var_epi64(suma, indices0, sumb),
+          _mm512_permutex2var_epi64(suma, indices1, sumb));
+
+        __m256i sum256lo = _mm512_castsi512_si256(x);
+        __m256i sum256hi = _mm512_extracti64x4_epi64(x, 1);
+
+        return _mm256_add_epi32(_mm256_add_epi32(sum256lo, sum256hi), bias);
+      };
+
+      [[maybe_unused]] auto m512_hadd256x8 =[m512_hadd128x16_interleave](
+        __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, __m256i bias) -> __m256i {
+
+        __m512i sum = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3);
+
+        __m512i indices = _mm512_setr_epi32(
+          0, 4, 8, 12, 2, 6, 10, 14,
+          1, 5, 9, 13, 3, 7, 11, 15);
+        sum = _mm512_permutexvar_epi32(indices, sum);
+
+        __m256i sum256lo = _mm512_castsi512_si256(sum);
+        __m256i sum256hi = _mm512_extracti64x4_epi64(sum, 1);
+
+        return _mm256_add_epi32(_mm256_hadd_epi32(sum256lo, sum256hi), bias);
+      };
+
+      [[maybe_unused]] auto m512_hadd256x16 = [m512_hadd128x16_interleave](
+        __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3,
+        __m512i sum4, __m512i sum5, __m512i sum6, __m512i sum7, __m512i bias) -> __m512i {
+
+        __m512i suma = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3);
+        __m512i sumb = m512_hadd128x16_interleave(sum4, sum5, sum6, sum7);
+
+        __m512i indices0 = _mm512_setr_epi64(0, 1, 8, 9, 4, 5, 12, 13);
+        __m512i indices1 = _mm512_setr_epi64(2, 3, 10, 11, 6, 7, 14, 15);
+        __m512i x = _mm512_add_epi32(
+          _mm512_permutex2var_epi64(suma, indices0, sumb),
+          _mm512_permutex2var_epi64(suma, indices1, sumb));
+
+        __m512i indices = _mm512_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15);
+        return _mm512_add_epi32(_mm512_permutexvar_epi32(indices, x), bias);
+      };
+
+#if defined (USE_VNNI)
+      [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) {
+        acc = _mm512_dpbusd_epi32(acc, a, b);
+#else
+      [[maybe_unused]] auto m512_dpbusd_epi32 = [=](__m512i a, __m512i b) -> __m512i {
+        __m512i product0 = _mm512_maddubs_epi16(a, b);
+        return _mm512_madd_epi16(product0, kOnes512);
+#endif
+      };
+
+#endif
+#if defined (USE_AVX2)
+
+      [[maybe_unused]] const __m256i kOnes256 = _mm256_set1_epi16(1);
+
+      [[maybe_unused]] auto m256_hadd = [](__m256i sum, int bias) -> int {
+        __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
+        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
+        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
+        return _mm_cvtsi128_si32(sum128) + bias;
+      };
+
+      [[maybe_unused]] auto m256_haddx4 = [](__m256i sum0, __m256i sum1, __m256i sum2, __m256i sum3, __m128i bias) -> __m128i {
+        sum0 = _mm256_hadd_epi32(sum0, sum1);
+        sum2 = _mm256_hadd_epi32(sum2, sum3);
+
+        sum0 = _mm256_hadd_epi32(sum0, sum2);
+
+        __m128i sum128lo = _mm256_castsi256_si128(sum0);
+        __m128i sum128hi = _mm256_extracti128_si256(sum0, 1);
+
+        return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias);
+      };
+#if defined (USE_VNNI)
+      [[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) {
+        acc = _mm256_dpbusd_epi32(acc, a, b);
+#else
+      [[maybe_unused]] auto m256_dpbusd_epi32 = [=](__m256i a, __m256i b) -> __m256i {
+        __m256i product0 = _mm256_maddubs_epi16(a, b);
+        return _mm256_madd_epi16(product0, kOnes256);
+#endif
+      };
+
+#endif
+
+#if defined (USE_SSSE3)
+
+      [[maybe_unused]] const __m128i kOnes128 = _mm_set1_epi16(1);
+
+      [[maybe_unused]] auto m128_hadd = [](__m128i sum, int bias) -> int {
+        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
+        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
+        return _mm_cvtsi128_si32(sum) + bias;
+      };
+
+      [[maybe_unused]] auto m128_haddx4 = [](__m128i sum0, __m128i sum1, __m128i sum2, __m128i sum3, __m128i bias) -> __m128i {
+        sum0 = _mm_hadd_epi32(sum0, sum1);
+        sum2 = _mm_hadd_epi32(sum2, sum3);
+
+        sum0 = _mm_hadd_epi32(sum0, sum2);
+
+        return _mm_add_epi32(sum0, bias);
+      };
+
+      [[maybe_unused]] auto m128_dpbusd_epi32 = [=](__m128i a, __m128i b) -> __m128i {
+        __m128i product0 = _mm_maddubs_epi16(a, b);
+        return _mm_madd_epi16(product0, kOnes128);
+      };
+
+#endif
+
+#if defined (USE_AVX512)
+
+      constexpr IndexType kNumChunks512 = kPaddedInputDimensions / (kSimdWidth * 2);
+      constexpr IndexType kNumChunks256 = kPaddedInputDimensions / kSimdWidth;
+
      const auto output = reinterpret_cast<OutputType*>(buffer);

-  #if defined(USE_AVX512)
-      constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2);
-      const auto input_vector = reinterpret_cast<const __m512i*>(input);
-  #if !defined(USE_VNNI)
-      const __m512i kOnes = _mm512_set1_epi16(1);
-  #endif
+      // Since it takes 64 bytes to saturate a zmm register, we
+      // cannot use AVX512 for the smaller affine transforms.
+      // Instead we fall back to an AVX2 implementation if
+      // kInputDimensions isn't a multiple of 64.
+      // Note that this means that, for example, for
+      // kInputDimensions of 96 we fall back to AVX2 even though
+      // the first 64 elements could be processed with AVX512.
+      // Handling that case better would require mixing __m256i
+      // and __m512i variables, and it would also
+      // require handling more cases statically so as not to lose performance.
+      // This should be revisited if such input dimensions are to be considered.
+      [[maybe_unused]] const auto input_vector512 = reinterpret_cast<const __m512i*>(input);
+      [[maybe_unused]] const auto input_vector256 = reinterpret_cast<const __m256i*>(input);
+
+      // kOutputDimensions is either 1 or a multiple of kSimdWidth,
+      // since in the latter case it is also an input dimension.
+      if constexpr (kOutputDimensions % 16 == 0 && kNumChunks256 == 1)
+      {
+        for (IndexType i = 0; i < kOutputDimensions; i += 16)
+        {
+          const IndexType offset01a = (i + 0) * kPaddedInputDimensions;
+          const IndexType offset23a = (i + 2) * kPaddedInputDimensions;
+          const IndexType offset45a = (i + 4) * kPaddedInputDimensions;
+          const IndexType offset67a = (i + 6) * kPaddedInputDimensions;
+          const IndexType offset01b = (i + 8) * kPaddedInputDimensions;
+          const IndexType offset23b = (i + 10) * kPaddedInputDimensions;
+          const IndexType offset45b = (i + 12) * kPaddedInputDimensions;
+          const IndexType offset67b = (i + 14) * kPaddedInputDimensions;
+
+          const __m512i bias = *reinterpret_cast<const __m512i*>(&biases_[i]);
+          __m512i* outptr = reinterpret_cast<__m512i*>(&output[i]);
+
+          const auto row01a = *reinterpret_cast<const __m512i*>(&weights_[offset01a]);
+          const auto row23a = *reinterpret_cast<const __m512i*>(&weights_[offset23a]);
+          const auto row45a = *reinterpret_cast<const __m512i*>(&weights_[offset45a]);
+          const auto row67a = *reinterpret_cast<const __m512i*>(&weights_[offset67a]);
+          const auto row01b = *reinterpret_cast<const __m512i*>(&weights_[offset01b]);
+          const auto row23b = *reinterpret_cast<const __m512i*>(&weights_[offset23b]);
+          const auto row45b = *reinterpret_cast<const __m512i*>(&weights_[offset45b]);
+          const auto row67b = *reinterpret_cast<const __m512i*>(&weights_[offset67b]);
+
+          const __m256i in256 = input_vector256[0];
+          const __m512i in = _mm512_inserti64x4(_mm512_castsi256_si512(in256), in256, 1);
+
+#if defined (USE_VNNI)
+          __m512i sum01a = _mm512_setzero_si512();
+          __m512i sum23a = _mm512_setzero_si512();
+          __m512i sum45a = _mm512_setzero_si512();
+          __m512i sum67a = _mm512_setzero_si512();
+          __m512i sum01b = _mm512_setzero_si512();
+          __m512i sum23b = _mm512_setzero_si512();
+          __m512i sum45b = _mm512_setzero_si512();
+          __m512i sum67b = _mm512_setzero_si512();
+
+          m512_add_dpbusd_epi32(sum01a, in, row01a);
+          m512_add_dpbusd_epi32(sum23a, in, row23a);
+          m512_add_dpbusd_epi32(sum45a, in, row45a);
+          m512_add_dpbusd_epi32(sum67a, in, row67a);
+          m512_add_dpbusd_epi32(sum01b, in, row01b);
+          m512_add_dpbusd_epi32(sum23b, in, row23b);
+          m512_add_dpbusd_epi32(sum45b, in, row45b);
+          m512_add_dpbusd_epi32(sum67b, in, row67b);
+#else
+          __m512i sum01a = m512_dpbusd_epi32(in, row01a);
+          __m512i sum23a = m512_dpbusd_epi32(in, row23a);
+          __m512i sum45a = m512_dpbusd_epi32(in, row45a);
+          __m512i sum67a = m512_dpbusd_epi32(in, row67a);
+          __m512i sum01b = m512_dpbusd_epi32(in, row01b);
+          __m512i sum23b = m512_dpbusd_epi32(in, row23b);
+          __m512i sum45b = m512_dpbusd_epi32(in, row45b);
+          __m512i sum67b = m512_dpbusd_epi32(in, row67b);
+#endif
+
+          *outptr = m512_hadd256x16(
+            sum01a, sum23a, sum45a, sum67a,
+            sum01b, sum23b, sum45b, sum67b, bias);
+        }
+      }
+      else if constexpr (kOutputDimensions % 4 == 0)
+      {
+        for (IndexType i = 0; i < kOutputDimensions; i += 4)
+        {
+          const IndexType offset0 = (i + 0) * kPaddedInputDimensions;
+          const IndexType offset1 = (i + 1) * kPaddedInputDimensions;
+          const IndexType offset2 = (i + 2) * kPaddedInputDimensions;
+          const IndexType offset3 = (i + 3) * kPaddedInputDimensions;
+
+          const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
+          __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
+
+          if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0)
+          {
+            const auto row0 = reinterpret_cast<const __m512i*>(&weights_[offset0]);
+            const auto row1 = reinterpret_cast<const __m512i*>(&weights_[offset1]);
+            const auto row2 = reinterpret_cast<const __m512i*>(&weights_[offset2]);
+            const auto row3 = reinterpret_cast<const __m512i*>(&weights_[offset3]);
+
+#if defined (USE_VNNI)
+            __m512i sum0 = _mm512_setzero_si512();
+            __m512i sum1 = _mm512_setzero_si512();
+            __m512i sum2 = _mm512_setzero_si512();
+            __m512i sum3 = _mm512_setzero_si512();
+            const IndexType kStart = 0;
+#else
+            __m512i sum0 = m512_dpbusd_epi32(input_vector512[0], row0[0]);
+            __m512i sum1 = m512_dpbusd_epi32(input_vector512[0], row1[0]);
+            __m512i sum2 = m512_dpbusd_epi32(input_vector512[0], row2[0]);
+            __m512i sum3 = m512_dpbusd_epi32(input_vector512[0], row3[0]);
+            const IndexType kStart = 1;
+#endif
+
+            for (IndexType j = kStart; j < kNumChunks512; ++j)
+            {
+              const __m512i in = input_vector512[j];
+
+#if defined (USE_VNNI)
+              m512_add_dpbusd_epi32(sum0, in, row0[j]);
+              m512_add_dpbusd_epi32(sum1, in, row1[j]);
+              m512_add_dpbusd_epi32(sum2, in, row2[j]);
+              m512_add_dpbusd_epi32(sum3, in, row3[j]);
+#else
+              sum0 = _mm512_add_epi32(sum0, m512_dpbusd_epi32(in, row0[j]));
+              sum1 = _mm512_add_epi32(sum1, m512_dpbusd_epi32(in, row1[j]));
+              sum2 = _mm512_add_epi32(sum2, m512_dpbusd_epi32(in, row2[j]));
+              sum3 = _mm512_add_epi32(sum3, m512_dpbusd_epi32(in, row3[j]));
+#endif
+            }
+
+            *outptr = m512_haddx4(sum0, sum1, sum2, sum3, bias);
+          }
+          else
+          {
+            const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]);
+            const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]);
+            const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]);
+            const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]);
+
+#if defined (USE_VNNI)
+            __m256i sum0 = _mm256_setzero_si256();
+            __m256i sum1 = _mm256_setzero_si256();
+            __m256i sum2 = _mm256_setzero_si256();
+            __m256i sum3 = _mm256_setzero_si256();
+            const IndexType kStart = 0;
+#else
+            __m256i sum0 = m256_dpbusd_epi32(input_vector256[0], row0[0]);
+            __m256i sum1 = m256_dpbusd_epi32(input_vector256[0], row1[0]);
+            __m256i sum2 = m256_dpbusd_epi32(input_vector256[0], row2[0]);
+            __m256i sum3 = m256_dpbusd_epi32(input_vector256[0], row3[0]);
+            const IndexType kStart = 1;
+#endif
+
+            for (IndexType j = kStart; j < kNumChunks256; ++j)
+            {
+              const __m256i in = input_vector256[j];
+
+#if defined (USE_VNNI)
+              m256_add_dpbusd_epi32(sum0, in, row0[j]);
+              m256_add_dpbusd_epi32(sum1, in, row1[j]);
+              m256_add_dpbusd_epi32(sum2, in, row2[j]);
+              m256_add_dpbusd_epi32(sum3, in, row3[j]);
+#else
+              sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
+              sum1 = _mm256_add_epi32(sum1, m256_dpbusd_epi32(in, row1[j]));
+              sum2 = _mm256_add_epi32(sum2, m256_dpbusd_epi32(in, row2[j]));
+              sum3 = _mm256_add_epi32(sum3, m256_dpbusd_epi32(in, row3[j]));
+#endif
+            }
+
+            *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias);
+          }
+        }
+      }
+      else if constexpr (kOutputDimensions == 1)
+      {
+        if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0)
+        {
+          const auto row0 = reinterpret_cast<const __m512i*>(&weights_[0]);
+
+#if defined (USE_VNNI)
+          __m512i sum0 = _mm512_setzero_si512();
+          const IndexType kStart = 0;
+#else
+          __m512i sum0 = m512_dpbusd_epi32(input_vector512[0], row0[0]);
+          const IndexType kStart = 1;
+#endif
+
+          for (IndexType j = kStart; j < kNumChunks512; ++j)
+          {
+            const __m512i in = input_vector512[j];
+
+#if defined (USE_VNNI)
+            m512_add_dpbusd_epi32(sum0, in, row0[j]);
+#else
+            sum0 = _mm512_add_epi32(sum0, m512_dpbusd_epi32(in, row0[j]));
+#endif
+          }
+
+          output[0] = m512_hadd(sum0, biases_[0]);
+        }
+        else
+        {
+          const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]);
+
+#if defined (USE_VNNI)
+          __m256i sum0 = _mm256_setzero_si256();
+          const IndexType kStart = 0;
+#else
+          __m256i sum0 = m256_dpbusd_epi32(input_vector256[0], row0[0]);
+          const IndexType kStart = 1;
+#endif
+
+          for (IndexType j = kStart; j < kNumChunks256; ++j)
+          {
+            const __m256i in = input_vector256[j];
+
+#if defined (USE_VNNI)
+            m256_add_dpbusd_epi32(sum0, in, row0[j]);
+#else
+            sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
+#endif
+          }
+
+          output[0] = m256_hadd(sum0, biases_[0]);
+        }
+      }
+      else
+      {
+        // This case can never happen because kOutputDimensions
+        // is always 1 or a multiple of kSimdWidth.
+        assert(false);
+      }
+
+#elif defined (USE_AVX2)

-  #elif defined(USE_AVX2)
      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+
+      const auto output = reinterpret_cast<OutputType*>(buffer);
      const auto input_vector = reinterpret_cast<const __m256i*>(input);
-  #if !defined(USE_VNNI)
-      const __m256i kOnes = _mm256_set1_epi16(1);
-  #endif

-  #elif defined(USE_SSE2)
+      // kOutputDimensions is either 1 or a multiple of kSimdWidth,
+      // since in the latter case it is also an input dimension.
+      if constexpr (kOutputDimensions % 4 == 0)
+      {
+        for (IndexType i = 0; i < kOutputDimensions; i += 4)
+        {
+          const IndexType offset0 = (i + 0) * kPaddedInputDimensions;
+          const IndexType offset1 = (i + 1) * kPaddedInputDimensions;
+          const IndexType offset2 = (i + 2) * kPaddedInputDimensions;
+          const IndexType offset3 = (i + 3) * kPaddedInputDimensions;
+
+          const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
+          __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
+
+          const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]);
+          const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]);
+          const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]);
+          const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]);
+
+#if defined (USE_VNNI)
+          __m256i sum0 = _mm256_setzero_si256();
+          __m256i sum1 = _mm256_setzero_si256();
+          __m256i sum2 = _mm256_setzero_si256();
+          __m256i sum3 = _mm256_setzero_si256();
+          const IndexType kStart = 0;
+#else
+          __m256i sum0 = m256_dpbusd_epi32(input_vector[0], row0[0]);
+          __m256i sum1 = m256_dpbusd_epi32(input_vector[0], row1[0]);
+          __m256i sum2 = m256_dpbusd_epi32(input_vector[0], row2[0]);
+          __m256i sum3 = m256_dpbusd_epi32(input_vector[0], row3[0]);
+          const IndexType kStart = 1;
+#endif
+
+          for (IndexType j = kStart; j < kNumChunks; ++j)
+          {
+            const __m256i in = input_vector[j];
+
+#if defined (USE_VNNI)
+            m256_add_dpbusd_epi32(sum0, in, row0[j]);
+            m256_add_dpbusd_epi32(sum1, in, row1[j]);
+            m256_add_dpbusd_epi32(sum2, in, row2[j]);
+            m256_add_dpbusd_epi32(sum3, in, row3[j]);
+#else
+            sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
+            sum1 = _mm256_add_epi32(sum1, m256_dpbusd_epi32(in, row1[j]));
+            sum2 = _mm256_add_epi32(sum2, m256_dpbusd_epi32(in, row2[j]));
+            sum3 = _mm256_add_epi32(sum3, m256_dpbusd_epi32(in, row3[j]));
+#endif
+          }
+
+          *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias);
+        }
+      }
+      else if constexpr (kOutputDimensions == 1)
+      {
+        const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]);
+
+#if defined (USE_VNNI)
+        __m256i sum0 = _mm256_setzero_si256();
+        const IndexType kStart = 0;
+#else
+        __m256i sum0 = m256_dpbusd_epi32(input_vector[0], row0[0]);
+        const IndexType kStart = 1;
+#endif
+
+        for (IndexType j = kStart; j < kNumChunks; ++j)
+        {
+          const __m256i in = input_vector[j];
+
+#if defined (USE_VNNI)
+          m256_add_dpbusd_epi32(sum0, in, row0[j]);
+#else
+          sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
+#endif
+        }
+
+        output[0] = m256_hadd(sum0, biases_[0]);
+      }
+      else
+      {
+        // This case can never happen because kOutputDimensions
+        // is always 1 or a multiple of kSimdWidth.
+        assert(false);
+      }
+
+#elif defined (USE_SSSE3)
+
      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
-  #ifndef USE_SSSE3
-      const __m128i kZeros = _mm_setzero_si128();
-  #else
-      const __m128i kOnes = _mm_set1_epi16(1);
-  #endif
+
+      auto output = reinterpret_cast<OutputType*>(buffer);
      const auto input_vector = reinterpret_cast<const __m128i*>(input);

-  #elif defined(USE_MMX)
+      // kOutputDimensions is either 1 or a multiple of kSimdWidth,
+      // since in the latter case it is also an input dimension.
+      if constexpr (kOutputDimensions % 4 == 0)
+      {
+        for (IndexType i = 0; i < kOutputDimensions; i += 4)
+        {
+          const IndexType offset0 = (i + 0) * kPaddedInputDimensions;
+          const IndexType offset1 = (i + 1) * kPaddedInputDimensions;
+          const IndexType offset2 = (i + 2) * kPaddedInputDimensions;
+          const IndexType offset3 = (i + 3) * kPaddedInputDimensions;
+
+          const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
+          __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
+
+          const auto row0 = reinterpret_cast<const __m128i*>(&weights_[offset0]);
+          const auto row1 = reinterpret_cast<const __m128i*>(&weights_[offset1]);
+          const auto row2 = reinterpret_cast<const __m128i*>(&weights_[offset2]);
+          const auto row3 = reinterpret_cast<const __m128i*>(&weights_[offset3]);
+
+          __m128i sum0 = m128_dpbusd_epi32(input_vector[0], row0[0]);
+          __m128i sum1 = m128_dpbusd_epi32(input_vector[0], row1[0]);
+          __m128i sum2 = m128_dpbusd_epi32(input_vector[0], row2[0]);
+          __m128i sum3 = m128_dpbusd_epi32(input_vector[0], row3[0]);
+
+          for (int j = 1; j < (int)kNumChunks; ++j)
+          {
+            const __m128i in = input_vector[j];
+
+            sum0 = _mm_add_epi32(sum0, m128_dpbusd_epi32(in, row0[j]));
+            sum1 = _mm_add_epi32(sum1, m128_dpbusd_epi32(in, row1[j]));
+            sum2 = _mm_add_epi32(sum2, m128_dpbusd_epi32(in, row2[j]));
+            sum3 = _mm_add_epi32(sum3, m128_dpbusd_epi32(in, row3[j]));
+          }
+
+          *outptr = m128_haddx4(sum0, sum1, sum2, sum3, bias);
+        }
+      }
+      else if constexpr (kOutputDimensions == 1)
+      {
+        const auto row0 = reinterpret_cast<const __m128i*>(&weights_[0]);
+
+        __m128i sum0 = m128_dpbusd_epi32(input_vector[0], row0[0]);
+
+        for (int j = 1; j < (int)kNumChunks; ++j)
+          sum0 = _mm_add_epi32(sum0, m128_dpbusd_epi32(input_vector[j], row0[j]));
+
+        output[0] = m128_hadd(sum0, biases_[0]);
+      }
+      else
+      {
+        // This case can never happen because kOutputDimensions
+        // is always 1 or a multiple of kSimdWidth.
+        assert(false);
+      }
+
+#else
+
+// Use old implementation for the other architectures.
+
+      auto output = reinterpret_cast<OutputType*>(buffer);
+
+#if defined(USE_SSE2)
+      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+#ifndef USE_SSSE3
+      const __m128i kZeros = _mm_setzero_si128();
+#else
+      const __m128i kOnes = _mm_set1_epi16(1);
+#endif
+      const auto input_vector = reinterpret_cast<const __m128i*>(input);
+
+#elif defined(USE_MMX)
      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
      const __m64 kZeros = _mm_setzero_si64();
      const auto input_vector = reinterpret_cast<const __m64*>(input);

-  #elif defined(USE_NEON)
+#elif defined(USE_NEON)
      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
      const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
-  #endif
+#endif

      for (IndexType i = 0; i < kOutputDimensions; ++i) {
        const IndexType offset = i * kPaddedInputDimensions;

-  #if defined(USE_AVX512)
-        __m512i sum = _mm512_setzero_si512();
-        const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-  #if defined(USE_VNNI)
-            sum = _mm512_dpbusd_epi32(sum, _mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
-  #else
-            __m512i product = _mm512_maddubs_epi16(_mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
-            product = _mm512_madd_epi16(product, kOnes);
-            sum = _mm512_add_epi32(sum, product);
-  #endif
-        }
-
-        // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks.
-        // As a result kPaddedInputDimensions may not be an even multiple of 64(512bit)
-        // and we have to do one more 256bit chunk.
-        if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2)
-        {
-            const auto iv256  = reinterpret_cast<const __m256i*>(&input_vector[kNumChunks]);
-            const auto row256 = reinterpret_cast<const __m256i*>(&row[kNumChunks]);
-  #if defined(USE_VNNI)
-            __m256i product256 = _mm256_dpbusd_epi32(
-                _mm512_castsi512_si256(sum), _mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
-            sum = _mm512_inserti32x8(sum, product256, 0);
-  #else
-            __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
-            sum = _mm512_add_epi32(sum, _mm512_cvtepi16_epi32(product256));
-  #endif
-        }
-        output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
-
-  #elif defined(USE_AVX2)
-        __m256i sum = _mm256_setzero_si256();
-        const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-  #if defined(USE_VNNI)
-          sum = _mm256_dpbusd_epi32(sum, _mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
-  #else
-          __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
-          product = _mm256_madd_epi16(product, kOnes);
-          sum = _mm256_add_epi32(sum, product);
-  #endif
-        }
-        __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
-        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
-        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
-        output[i] = _mm_cvtsi128_si32(sum128) + biases_[i];
-
-  #elif defined(USE_SSSE3)
-        __m128i sum = _mm_setzero_si128();
-        const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
-        for (int j = 0; j < (int)kNumChunks - 1; j += 2) {
-          __m128i product0 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
-          product0 = _mm_madd_epi16(product0, kOnes);
-          sum = _mm_add_epi32(sum, product0);
-          __m128i product1 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j+1]), _mm_load_si128(&row[j+1]));
-          product1 = _mm_madd_epi16(product1, kOnes);
-          sum = _mm_add_epi32(sum, product1);
-        }
-        if (kNumChunks & 0x1) {
-          __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[kNumChunks-1]), _mm_load_si128(&row[kNumChunks-1]));
-          product = _mm_madd_epi16(product, kOnes);
-          sum = _mm_add_epi32(sum, product);
-        }
-        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
-        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
-        output[i] = _mm_cvtsi128_si32(sum) + biases_[i];
-
-  #elif defined(USE_SSE2)
+#if defined(USE_SSE2)
        __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]);
        __m128i sum_hi = kZeros;
        const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
@@ -223,7 +739,7 @@ namespace Eval::NNUE::Layers {
        sum = _mm_add_epi32(sum, sum_second_32);
        output[i] = _mm_cvtsi128_si32(sum);

-  #elif defined(USE_MMX)
+#elif defined(USE_MMX)
        __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]);
        __m64 sum_hi = kZeros;
        const auto row = reinterpret_cast<const __m64*>(&weights_[offset]);
@@ -244,7 +760,7 @@ namespace Eval::NNUE::Layers {
        sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
        output[i] = _mm_cvtsi64_si32(sum);

-  #elif defined(USE_NEON)
+#elif defined(USE_NEON)
        int32x4_t sum = {biases_[i]};
        const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
        for (IndexType j = 0; j < kNumChunks; ++j) {
@@ -254,18 +770,21 @@ namespace Eval::NNUE::Layers {
        }
        output[i] = sum[0] + sum[1] + sum[2] + sum[3];

-  #else
+#else
        OutputType sum = biases_[i];
        for (IndexType j = 0; j < kInputDimensions; ++j) {
          sum += weights_[offset + j] * input[j];
        }
        output[i] = sum;
-  #endif
+#endif

      }
-  #if defined(USE_MMX)
+#if defined(USE_MMX)
      _mm_empty();
-  #endif
+#endif
+
+#endif
+
      return output;
    }

@@ -23,6 +23,10 @@

 #include "../nnue_common.h"

+#include <string>
+#include <cstdint>
+#include <type_traits>
+
 namespace Eval::NNUE::Layers {

  // Clipped ReLU
@@ -47,6 +51,8 @@ namespace Eval::NNUE::Layers {
    static constexpr std::size_t kBufferSize =
        PreviousLayer::kBufferSize + kSelfBufferSize;

+    static constexpr int kLayerIndex = PreviousLayer::kLayerIndex + 1;
+
    // Hash value embedded in the evaluation file
    static constexpr std::uint32_t GetHashValue() {
      std::uint32_t hash_value = 0x538D24C7u;
@@ -54,11 +60,24 @@ namespace Eval::NNUE::Layers {
      return hash_value;
    }

+    static std::string get_name() {
+        return "ClippedReLU[" +
+            std::to_string(kOutputDimensions) + "]";
+    }
+
    // A string that represents the structure from the input layer to this layer
-    static std::string GetStructureString() {
-      return "ClippedReLU[" +
-        std::to_string(kOutputDimensions) + "](" +
-        PreviousLayer::GetStructureString() + ")";
+    static std::string get_structure_string() {
+        return get_name() + "(" +
+            PreviousLayer::get_structure_string() + ")";
+    }
+
+    static std::string get_layers_info() {
+        std::string info = PreviousLayer::get_layers_info();
+        info += "\n  - ";
+        info += std::to_string(kLayerIndex);
+        info += " - ";
+        info += get_name();
+        return info;
    }

    // Read network parameters
@@ -68,7 +87,7 @@ namespace Eval::NNUE::Layers {

    // write parameters
    bool WriteParameters(std::ostream& stream) const {
-      return previous_layer_.WriteParameters(stream);
+        return previous_layer_.WriteParameters(stream);
    }

    // Forward propagation
@@ -86,12 +105,12 @@ namespace Eval::NNUE::Layers {
      const auto out = reinterpret_cast<__m256i*>(output);
      for (IndexType i = 0; i < kNumChunks; ++i) {
        const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
-            _mm256_loadA_si256(&in[i * 4 + 0]),
-            _mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits);
+            _mm256_load_si256(&in[i * 4 + 0]),
+            _mm256_load_si256(&in[i * 4 + 1])), kWeightScaleBits);
        const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
-            _mm256_loadA_si256(&in[i * 4 + 2]),
-            _mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits);
-        _mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+            _mm256_load_si256(&in[i * 4 + 2]),
+            _mm256_load_si256(&in[i * 4 + 3])), kWeightScaleBits);
+        _mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
            _mm256_packs_epi16(words0, words1), kZero), kOffsets));
      }
      constexpr IndexType kStart = kNumChunks * kSimdWidth;
@@ -170,9 +189,9 @@ namespace Eval::NNUE::Layers {
    }

   private:
-     // Make the learning class a friend
-     friend class Trainer<ClippedReLU>;
-     
+    // Make the learning class a friend
+    friend class Trainer<ClippedReLU>;
+
    PreviousLayer previous_layer_;
  };

@@ -41,6 +41,8 @@ class InputSlice {
  // Size of forward propagation buffer used from the input layer to this layer
  static constexpr std::size_t kBufferSize = 0;

+  static constexpr int kLayerIndex = 1;
+
  // Hash value embedded in the evaluation file
  static constexpr std::uint32_t GetHashValue() {
    std::uint32_t hash_value = 0xEC42E90Du;
@@ -48,12 +50,24 @@ class InputSlice {
    return hash_value;
  }

-  // A string that represents the structure from the input layer to this layer
-  static std::string GetStructureString() {
-    return "InputSlice[" + std::to_string(kOutputDimensions) + "(" +
-      std::to_string(Offset) + ":" +
-      std::to_string(Offset + kOutputDimensions) + ")]";
-  }
+    static std::string get_name() {
+        return "InputSlice[" + std::to_string(kOutputDimensions) + "(" +
+            std::to_string(Offset) + ":" +
+            std::to_string(Offset + kOutputDimensions) + ")]";
+    }
+
+    // A string that represents the structure from the input layer to this layer
+    static std::string get_structure_string() {
+        return get_name();
+    }
+
+    static std::string get_layers_info() {
+        std::string info = "  - ";
+        info += std::to_string(kLayerIndex);
+        info += " - ";
+        info += get_name();
+        return info;
+    }

  // Read network parameters
  bool ReadParameters(std::istream& /*stream*/) {
@@ -62,7 +76,7 @@ class InputSlice {

  // write parameters
  bool WriteParameters(std::ostream& /*stream*/) const {
-    return true;
+      return true;
  }

  // Forward propagation
@@ -1,163 +1,196 @@
-// Definition of layer Sum of NNUE evaluation function
-
-#ifndef _NNUE_LAYERS_SUM_H_
+#ifndef _NNUE_LAYERS_SUM_H_
 #define _NNUE_LAYERS_SUM_H_

-#if defined(EVAL_NNUE)
+#include "nnue/nnue_common.h"

-#include "../nnue_common.h"
+// Definition of layer Sum of NNUE evaluation function
+namespace Eval::NNUE::Layers {

-namespace Eval {
+    // Layer that sums the output of multiple layers
+    template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
+    class Sum : public Sum<RemainingPreviousLayers...> {
+    private:
+        using Head = FirstPreviousLayer;
+        using Tail = Sum<RemainingPreviousLayers...>;

-namespace NNUE {
+     public:
+        // Input/output type
+        using InputType = typename Head::OutputType;

-namespace Layers {
+        using OutputType = InputType;

-// Layer that sums the output of multiple layers
-template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
-class Sum : public Sum<RemainingPreviousLayers...> {
- private:
-  using Head = FirstPreviousLayer;
-  using Tail = Sum<RemainingPreviousLayers...>;
+        static_assert(std::is_same<InputType, typename Tail::InputType>::value, "");

- public:
-  // Input/output type
-  using InputType = typename Head::OutputType;
-  using OutputType = InputType;
-  static_assert(std::is_same<InputType, typename Tail::InputType>::value, "");
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions = Head::kOutputDimensions;

-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions = Head::kOutputDimensions;
-  static constexpr IndexType kOutputDimensions = kInputDimensions;
-  static_assert(kInputDimensions == Tail::kInputDimensions ,"");
+        static constexpr IndexType kOutputDimensions = kInputDimensions;

-  // Size of forward propagation buffer used in this layer
-  static constexpr std::size_t kSelfBufferSize =
-      CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+        static_assert(kInputDimensions == Tail::kInputDimensions ,"");

-  // Size of the forward propagation buffer used from the input layer to this layer
-  static constexpr std::size_t kBufferSize =
-      std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize);
+        // Size of forward propagation buffer used in this layer
+        static constexpr std::size_t kSelfBufferSize =
+            CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);

-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t GetHashValue() {
-    std::uint32_t hash_value = 0xBCE400B4u;
-    hash_value ^= Head::GetHashValue() >> 1;
-    hash_value ^= Head::GetHashValue() << 31;
-    hash_value ^= Tail::GetHashValue() >> 2;
-    hash_value ^= Tail::GetHashValue() << 30;
-    return hash_value;
-  }
+        // Size of the forward propagation buffer used from the input layer to this layer
+        static constexpr std::size_t kBufferSize =
+            std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize);

-  // A string that represents the structure from the input layer to this layer
-  static std::string GetStructureString() {
-    return "Sum[" +
-        std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
-  }
+        static constexpr int kLayerIndex = Tail::kLayerIndex + 1;

-  // read parameters
-  bool ReadParameters(std::istream& stream) {
-    if (!Tail::ReadParameters(stream)) return false;
-    return previous_layer_.ReadParameters(stream);
-  }
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t GetHashValue() {
+            std::uint32_t hash_value = 0xBCE400B4u;
+            hash_value ^= Head::GetHashValue() >> 1;
+            hash_value ^= Head::GetHashValue() << 31;
+            hash_value ^= Tail::GetHashValue() >> 2;
+            hash_value ^= Tail::GetHashValue() << 30;
+            return hash_value;
+        }

-  // write parameters
-  bool WriteParameters(std::ostream& stream) const {
-    if (!Tail::WriteParameters(stream)) return false;
-    return previous_layer_.WriteParameters(stream);
-  }
+        static std::string get_name() {
+             return "Sum[" +
+                std::to_string(kOutputDimensions) + "]";
+        }

-  // forward propagation
-  const OutputType* Propagate(
-      const TransformedFeatureType* transformed_features, char* buffer) const {
-    Tail::Propagate(transformed_features, buffer);
-    const auto head_output = previous_layer_.Propagate(
-        transformed_features, buffer + kSelfBufferSize);
-    const auto output = reinterpret_cast<OutputType*>(buffer);
-    for (IndexType i = 0; i <kOutputDimensions; ++i) {
-      output[i] += head_output[i];
-    }
-    return output;
-  }
+        // A string that represents the structure from the input layer to this layer
+        static std::string get_structure_string() {
+            return get_name() + "(" + get_summands_string() + ")";
+        }

- protected:
-  // A string that represents the list of layers to be summed
-  static std::string GetSummandsString() {
-    return Head::GetStructureString() + "," + Tail::GetSummandsString();
-  }
+        static std::string get_layers_info() {
+            std::string info = Tail::get_layers_info();
+            info += "\n  - ";
+            info += std::to_string(kLayerIndex);
+            info += " - ";
+            info += get_name();
+            return info;
+        }

-  // Make the learning class a friend
-  friend class Trainer<Sum>;
+        // read parameters
+        bool ReadParameters(std::istream& stream) {
+            if (!Tail::ReadParameters(stream))
+                return false;

-  // the layer immediately before this layer
-  FirstPreviousLayer previous_layer_;
-};
+            return previous_layer_.ReadParameters(stream);
+        }

-// Layer that sums the output of multiple layers (when there is one template argument)
-template <typename PreviousLayer>
-class Sum<PreviousLayer> {
- public:
-  // Input/output type
-  using InputType = typename PreviousLayer::OutputType;
-  using OutputType = InputType;
+        // write parameters
+        bool WriteParameters(std::ostream& stream) const {
+            if (!Tail::WriteParameters(stream))
+                return false;

-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions =
-      PreviousLayer::kOutputDimensions;
-  static constexpr IndexType kOutputDimensions = kInputDimensions;
+            return previous_layer_.WriteParameters(stream);
+        }

-  // Size of the forward propagation buffer used from the input layer to this layer
-  static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize;
+        // Forward propagation: adds the head layer's output onto the tail's sum (entry point is Propagate, as in Sum<PreviousLayer>)
+        const OutputType* Propagate(
+            const TransformedFeatureType* transformed_features, char* buffer) const {
 
-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t GetHashValue() {
-    std::uint32_t hash_value = 0xBCE400B4u;
-    hash_value ^= PreviousLayer::GetHashValue() >> 1;
-    hash_value ^= PreviousLayer::GetHashValue() << 31;
-    return hash_value;
-  }
+            Tail::Propagate(transformed_features, buffer);  // return value unused; the sum is accumulated via `buffer` below

-  // A string that represents the structure from the input layer to this layer
-  static std::string GetStructureString() {
-    return "Sum[" +
-        std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
-  }
+            const auto head_output = previous_layer_.Propagate(
+                transformed_features, buffer + kSelfBufferSize);

-  // read parameters
-  bool ReadParameters(std::istream& stream) {
-    return previous_layer_.ReadParameters(stream);
-  }
+            const auto output = reinterpret_cast<OutputType*>(buffer);

-  // write parameters
-  bool WriteParameters(std::ostream& stream) const {
-    return previous_layer_.WriteParameters(stream);
-  }
+            for (IndexType i = 0; i <kOutputDimensions; ++i) {
+                output[i] += head_output[i];
+            }

-  // forward propagation
-  const OutputType* Propagate(
-      const TransformedFeatureType* transformed_features, char* buffer) const {
-    return previous_layer_.Propagate(transformed_features, buffer);
-  }
+            return output;
+        }

- protected:
-  // A string that represents the list of layers to be summed
-  static std::string GetSummandsString() {
-    return PreviousLayer::GetStructureString();
-  }
+    protected:
+        // A string that represents the list of layers to be summed
+        static std::string get_summands_string() {
+            return Head::get_structure_string() + "," + Tail::get_summands_string();
+        }

-  // Make the learning class a friend
-  friend class Trainer<Sum>;
+        // Make the learning class a friend
+        friend class Trainer<Sum>;

-  // the layer immediately before this layer
-  PreviousLayer previous_layer_;
-};
+        // the layer immediately before this layer
+        FirstPreviousLayer previous_layer_;
+    };

-}  // namespace Layers
+    // Layer that sums the output of multiple layers (when there is one template argument)
+    template <typename PreviousLayer>
+    class Sum<PreviousLayer> {
+    public:
+        // Input/output type
+        using InputType = typename PreviousLayer::OutputType;

-}  // namespace NNUE
+        using OutputType = InputType;

-}  // namespace Eval
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions =
+            PreviousLayer::kOutputDimensions;

-#endif  // defined(EVAL_NNUE)
+        static constexpr IndexType kOutputDimensions = kInputDimensions;
+
+        // Size of the forward propagation buffer used from the input layer to this layer
+        static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize;
+
+        static constexpr int kLayerIndex = PreviousLayer::kLayerIndex + 1;
+
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t GetHashValue() {
+            std::uint32_t hash_value = 0xBCE400B4u;
+            hash_value ^= PreviousLayer::GetHashValue() >> 1;
+            hash_value ^= PreviousLayer::GetHashValue() << 31;
+            return hash_value;
+        }
+
+        static std::string get_name() {
+             return "Sum[" +
+                std::to_string(kOutputDimensions) + "]";
+        }
+
+        // A string that represents the structure from the input layer to this layer
+        static std::string get_structure_string() {
+            return get_name() + "(" + get_summands_string() + ")";
+        }
+
+        static std::string get_layers_info() {  // previous layers' listing plus one "  - <index> - <name>" line for this layer
+            std::string info = PreviousLayer::get_layers_info();
+            info += "\n  - ";
+            info += std::to_string(kLayerIndex);
+            info += " - ";
+            info += get_name();
+            return info;
+        }
+
+        // read parameters
+        bool ReadParameters(std::istream& stream) {
+            return previous_layer_.ReadParameters(stream);
+        }
+
+        // write parameters
+        bool WriteParameters(std::ostream& stream) const {
+            return previous_layer_.WriteParameters(stream);
+        }
+
+        // forward propagation
+        const OutputType* Propagate(
+            const TransformedFeatureType* transformed_features, char* buffer) const {
+
+            return previous_layer_.Propagate(transformed_features, buffer);
+        }
+
+    protected:
+        // A string that represents the list of layers to be summed
+        static std::string get_summands_string() {
+            return PreviousLayer::get_structure_string();
+        }
+
+        // Make the learning class a friend
+        friend class Trainer<Sum>;
+
+        // the layer immediately before this layer
+        PreviousLayer previous_layer_;
+    };
+
+}  // namespace Eval::NNUE::Layers

 #endif
@@ -27,11 +27,8 @@ namespace Eval::NNUE {

  // Class that holds the result of affine transformation of input features
  struct alignas(kCacheLineSize) Accumulator {
-    std::int16_t
-        accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
-    Value score;
-    bool computed_accumulation;
-    bool computed_score;
+      std::int16_t accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
+      bool computed_accumulation;
  };

 }  // namespace Eval::NNUE
@@ -21,6 +21,8 @@
 #ifndef NNUE_COMMON_H_INCLUDED
 #define NNUE_COMMON_H_INCLUDED

+#include "types.h"
+
 #include <cstring>
 #include <iostream>

@@ -43,29 +45,6 @@
 #include <arm_neon.h>
 #endif

-// HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Otherwise a binary
-//       compiled with older g++ crashes because the output memory is not aligned
-//       even though alignas is specified.
-#if defined(USE_AVX2)
-#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__)
-#define _mm256_loadA_si256  _mm256_loadu_si256
-#define _mm256_storeA_si256 _mm256_storeu_si256
-#else
-#define _mm256_loadA_si256  _mm256_load_si256
-#define _mm256_storeA_si256 _mm256_store_si256
-#endif
-#endif
-
-#if defined(USE_AVX512)
-#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__)
-#define _mm512_loadA_si512   _mm512_loadu_si512
-#define _mm512_storeA_si512  _mm512_storeu_si512
-#else
-#define _mm512_loadA_si512   _mm512_load_si512
-#define _mm512_storeA_si512  _mm512_store_si512
-#endif
-#endif
-
 namespace Eval::NNUE {

  // Version of the evaluation file
@@ -113,7 +92,7 @@ namespace Eval::NNUE {
    PS_END2     = 12 * SQUARE_NB + 1
  };

-  extern uint32_t kpp_board_index[PIECE_NB][COLOR_NB];
+  extern const uint32_t kpp_board_index[PIECE_NB][COLOR_NB];

  // Type of input feature after conversion
  using TransformedFeatureType = std::uint8_t;
@@ -25,10 +25,66 @@
 #include "nnue_architecture.h"
 #include "features/index_list.h"

-#include <cstring> // std::memset()
+#include <cstring>
+#include <string>

 namespace Eval::NNUE {

+  // If vector instructions are enabled, we update and refresh the
+  // accumulator tile by tile such that each tile fits in the CPU's
+  // vector registers.
+  #define VECTOR
+
+  #ifdef USE_AVX512
+  typedef __m512i vec_t;
+  #define vec_load(a) _mm512_load_si512(a)
+  #define vec_store(a,b) _mm512_store_si512(a,b)
+  #define vec_add_16(a,b) _mm512_add_epi16(a,b)
+  #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
+  #define vec_zero _mm512_setzero_si512()
+  static constexpr IndexType kNumRegs = 8; // only 8 are needed
+
+  #elif defined(USE_AVX2)
+  typedef __m256i vec_t;
+  #define vec_load(a) _mm256_load_si256(a)
+  #define vec_store(a,b) _mm256_store_si256(a,b)
+  #define vec_add_16(a,b) _mm256_add_epi16(a,b)
+  #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
+  #define vec_zero _mm256_setzero_si256()
+  static constexpr IndexType kNumRegs = 16;
+
+  #elif defined(USE_SSE2)
+  typedef __m128i vec_t;
+  #define vec_load(a) (*(a))
+  #define vec_store(a,b) *(a)=(b)
+  #define vec_add_16(a,b) _mm_add_epi16(a,b)
+  #define vec_sub_16(a,b) _mm_sub_epi16(a,b)
+  #define vec_zero _mm_setzero_si128()
+  static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;
+
+  #elif defined(USE_MMX)
+  typedef __m64 vec_t;
+  #define vec_load(a) (*(a))
+  #define vec_store(a,b) *(a)=(b)
+  #define vec_add_16(a,b) _mm_add_pi16(a,b)
+  #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
+  #define vec_zero _mm_setzero_si64()
+  static constexpr IndexType kNumRegs = 8;
+
+  #elif defined(USE_NEON)
+  typedef int16x8_t vec_t;
+  #define vec_load(a) (*(a))
+  #define vec_store(a,b) *(a)=(b)
+  #define vec_add_16(a,b) vaddq_s16(a,b)
+  #define vec_sub_16(a,b) vsubq_s16(a,b)
+  #define vec_zero {0}
+  static constexpr IndexType kNumRegs = 16;
+
+  #else
+  #undef VECTOR  // no SIMD backend selected: the VECTOR-guarded tiled code is compiled out
+
+  #endif
+
  // Input feature converter
  class FeatureTransformer {

@@ -36,6 +92,11 @@ namespace Eval::NNUE {
    // Number of output dimensions for one side
    static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;

+    #ifdef VECTOR
+    static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;
+    static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
+    #endif
+
   public:
    // Output type
    using OutputType = TransformedFeatureType;
@@ -48,20 +109,36 @@ namespace Eval::NNUE {
    static constexpr std::size_t kBufferSize =
        kOutputDimensions * sizeof(OutputType);

+    static constexpr int kLayerIndex = 0;
+
    // Hash value embedded in the evaluation file
    static constexpr std::uint32_t GetHashValue() {
+
      return RawFeatures::kHashValue ^ kOutputDimensions;
    }

+    static std::string get_name() {
+      return RawFeatures::get_name() + "[" +
+          std::to_string(kInputDimensions) + "->" +
+          std::to_string(kHalfDimensions) + "x2]";
+    }
+
    // a string representing the structure
-    static std::string GetStructureString() {
-      return RawFeatures::GetName() + "[" +
-        std::to_string(kInputDimensions) + "->" +
-        std::to_string(kHalfDimensions) + "x2]";
+    static std::string get_structure_string() {
+      return get_name();
+    }
+
+    static std::string get_layers_info() {
+      std::string info = "  - ";
+      info += std::to_string(kLayerIndex);
+      info += " - ";
+      info += get_name();
+      return info;
    }

    // Read network parameters
    bool ReadParameters(std::istream& stream) {
+
      for (std::size_t i = 0; i < kHalfDimensions; ++i)
        biases_[i] = read_little_endian<BiasType>(stream);
      for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
@@ -72,34 +149,45 @@ namespace Eval::NNUE {
    // write parameters
    bool WriteParameters(std::ostream& stream) const {
      stream.write(reinterpret_cast<const char*>(biases_),
-        kHalfDimensions * sizeof(BiasType));
+          kHalfDimensions * sizeof(BiasType));
+
      stream.write(reinterpret_cast<const char*>(weights_),
-        kHalfDimensions * kInputDimensions * sizeof(WeightType));
+          kHalfDimensions * kInputDimensions * sizeof(WeightType));
+
      return !stream.fail();
    }

    // Proceed with the difference calculation if possible
-    bool UpdateAccumulatorIfPossible(const Position& pos) const {
+    bool update_accumulator_if_possible(const Position& pos) const {
+
      const auto now = pos.state();
-      if (now->accumulator.computed_accumulation) {
+      if (now->accumulator.computed_accumulation)
        return true;
-      }
+
      const auto prev = now->previous;
      if (prev && prev->accumulator.computed_accumulation) {
-        UpdateAccumulator(pos);
+        update_accumulator(pos);
        return true;
      }
+
      return false;
    }

    // Convert input features
-    void Transform(const Position& pos, OutputType* output, bool refresh) const {
-      if (refresh || !UpdateAccumulatorIfPossible(pos)) {
-        RefreshAccumulator(pos);
-      }
+    void Transform(const Position& pos, OutputType* output) const {
+
+      if (!update_accumulator_if_possible(pos))
+        refresh_accumulator(pos);
+
      const auto& accumulation = pos.state()->accumulator.accumulation;

-  #if defined(USE_AVX2)
+  #if defined(USE_AVX512)
+      constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth * 2);
+      static_assert(kHalfDimensions % (kSimdWidth * 2) == 0);
+      const __m512i kControl = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
+      const __m512i kZero = _mm512_setzero_si512();
+
+  #elif defined(USE_AVX2)
      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
      constexpr int kControl = 0b11011000;
      const __m256i kZero = _mm256_setzero_si256();
@@ -126,14 +214,39 @@ namespace Eval::NNUE {
      for (IndexType p = 0; p < 2; ++p) {
        const IndexType offset = kHalfDimensions * p;

-  #if defined(USE_AVX2)
+  #if defined(USE_AVX512)
+        auto out = reinterpret_cast<__m512i*>(&output[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m512i sum0 = _mm512_load_si512(
+              &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
+          __m512i sum1 = _mm512_load_si512(
+              &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+              sum0 = _mm512_add_epi16(sum0, reinterpret_cast<const __m512i*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 0]);
+              sum1 = _mm512_add_epi16(sum1, reinterpret_cast<const __m512i*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 1]);
+          }
+
+          _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(kControl,
+              _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), kZero)));
+        }
+
+  #elif defined(USE_AVX2)
        auto out = reinterpret_cast<__m256i*>(&output[offset]);
        for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m256i sum0 = _mm256_loadA_si256(
+          __m256i sum0 = _mm256_load_si256(
              &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
-          __m256i sum1 = _mm256_loadA_si256(
-            &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
-          _mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
+          __m256i sum1 = _mm256_load_si256(
+              &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+              sum0 = _mm256_add_epi16(sum0, reinterpret_cast<const __m256i*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 0]);
+              sum1 = _mm256_add_epi16(sum1, reinterpret_cast<const __m256i*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 1]);
+          }
+
+          _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
              _mm256_packs_epi16(sum0, sum1), kZero), kControl));
        }

@@ -144,14 +257,21 @@ namespace Eval::NNUE {
              accumulation[perspectives[p]][0])[j * 2 + 0]);
          __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
              accumulation[perspectives[p]][0])[j * 2 + 1]);
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+            sum0 = _mm_add_epi16(sum0, reinterpret_cast<const __m128i*>(
+                accumulation[perspectives[p]][i])[j * 2 + 0]);
+            sum1 = _mm_add_epi16(sum1, reinterpret_cast<const __m128i*>(
+                accumulation[perspectives[p]][i])[j * 2 + 1]);
+          }
+
      const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);

          _mm_store_si128(&out[j],

  #ifdef USE_SSE41
-            _mm_max_epi8(packedbytes, kZero)
+              _mm_max_epi8(packedbytes, kZero)
  #else
-            _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
+              _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
  #endif

          );
@@ -164,6 +284,13 @@ namespace Eval::NNUE {
              accumulation[perspectives[p]][0])[j * 2 + 0]);
          __m64 sum1 = *(&reinterpret_cast<const __m64*>(
              accumulation[perspectives[p]][0])[j * 2 + 1]);
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+              sum0 = _mm_add_pi16(sum0, reinterpret_cast<const __m64*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 0]);
+              sum1 = _mm_add_pi16(sum1, reinterpret_cast<const __m64*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 1]);
+          }
+
          const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
          out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
        }
@@ -173,12 +300,22 @@ namespace Eval::NNUE {
        for (IndexType j = 0; j < kNumChunks; ++j) {
          int16x8_t sum = reinterpret_cast<const int16x8_t*>(
              accumulation[perspectives[p]][0])[j];
+
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+              sum = vaddq_s16(sum, reinterpret_cast<const int16x8_t*>(
+                  accumulation[perspectives[p]][i])[j]);
+          }
+
          out[j] = vmax_s8(vqmovn_s16(sum), kZero);
        }

  #else
        for (IndexType j = 0; j < kHalfDimensions; ++j) {
          BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+              sum += accumulation[static_cast<int>(perspectives[p])][i][j];
+          }
+
          output[offset + j] = static_cast<OutputType>(
              std::max<int>(0, std::min<int>(127, sum)));
        }
@@ -192,108 +329,150 @@ namespace Eval::NNUE {

   private:
    // Calculate cumulative value without using difference calculation
-    void RefreshAccumulator(const Position& pos) const {
+    void refresh_accumulator(const Position& pos) const {
+
+  #ifdef VECTOR
+      // Gcc-10.2 unnecessarily spills AVX2 registers if this array
+      // is defined in the VECTOR code below, once in each branch
+      vec_t acc[kNumRegs];
+  #endif
      auto& accumulator = pos.state()->accumulator;
-      IndexType i = 0;
-      Features::IndexList active_indices[2];
-      RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
-                                       active_indices);
-      for (Color perspective : { WHITE, BLACK }) {
-        std::memcpy(accumulator.accumulation[perspective][i], biases_,
-                   kHalfDimensions * sizeof(BiasType));
-        for (const auto index : active_indices[perspective]) {
-          const IndexType offset = kHalfDimensions * index;
-  #if defined(USE_AVX512)
-          auto accumulation = reinterpret_cast<__m512i*>(
-              &accumulator.accumulation[perspective][i][0]);
-          auto column = reinterpret_cast<const __m512i*>(&weights_[offset]);
-          constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
-          for (IndexType j = 0; j < kNumChunks; ++j)
-            _mm512_storeA_si512(&accumulation[j], _mm512_add_epi16(_mm512_loadA_si512(&accumulation[j]), column[j]));
+      for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+        Features::IndexList active_indices[2];
+        RawFeatures::append_active_indices(pos, kRefreshTriggers[i],
+                                           active_indices);
+          for (Color perspective : { WHITE, BLACK }) {
+#ifdef VECTOR
+            for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
+              auto accTile = reinterpret_cast<vec_t*>(
+                  &accumulator.accumulation[perspective][i][j * kTileHeight]);

-  #elif defined(USE_AVX2)
-          auto accumulation = reinterpret_cast<__m256i*>(
-              &accumulator.accumulation[perspective][i][0]);
-          auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j)
-            _mm256_storeA_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadA_si256(&accumulation[j]), column[j]));
+              if (i == 0) {
+                auto biasesTile = reinterpret_cast<const vec_t*>(
+                    &biases_[j * kTileHeight]);
+                for (IndexType k = 0; k < kNumRegs; ++k)
+                  acc[k] = biasesTile[k];
+              } else {
+                for (IndexType k = 0; k < kNumRegs; ++k)
+                  acc[k] = vec_zero;
+              }

-  #elif defined(USE_SSE2)
-          auto accumulation = reinterpret_cast<__m128i*>(
-              &accumulator.accumulation[perspective][i][0]);
-          auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j)
-            accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
+              for (const auto index : active_indices[perspective]) {
+                const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+                auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);

-  #elif defined(USE_MMX)
-          auto accumulation = reinterpret_cast<__m64*>(
-              &accumulator.accumulation[perspective][i][0]);
-          auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j) {
-            accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
+                for (IndexType k = 0; k < kNumRegs; ++k)
+                  acc[k] = vec_add_16(acc[k], column[k]);
+              }
+
+              for (IndexType k = 0; k < kNumRegs; k++)
+                vec_store(&accTile[k], acc[k]);
+            }
+#else
+            if (i == 0) {
+              std::memcpy(accumulator.accumulation[perspective][i], biases_,
+                          kHalfDimensions * sizeof(BiasType));
+            } else {
+              std::memset(accumulator.accumulation[perspective][i], 0,
+                          kHalfDimensions * sizeof(BiasType));
+            }
+
+            for (const auto index : active_indices[perspective]) {
+              const IndexType offset = kHalfDimensions * index;
+
+              for (IndexType j = 0; j < kHalfDimensions; ++j)
+                accumulator.accumulation[perspective][i][j] += weights_[offset + j];
+            }
+#endif
          }

-  #elif defined(USE_NEON)
-          auto accumulation = reinterpret_cast<int16x8_t*>(
-              &accumulator.accumulation[perspective][i][0]);
-          auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j)
-            accumulation[j] = vaddq_s16(accumulation[j], column[j]);
-
-  #else
-          for (IndexType j = 0; j < kHalfDimensions; ++j)
-            accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-  #endif
-
        }
-      }
-  #if defined(USE_MMX)
-      _mm_empty();
-  #endif

-      accumulator.computed_accumulation = true;
-      accumulator.computed_score = false;
+#if defined(USE_MMX)
+        _mm_empty();
+#endif
+
+        accumulator.computed_accumulation = true;
    }

    // Calculate cumulative value using difference calculation
-    void UpdateAccumulator(const Position& pos) const {
-      const auto prev_accumulator = pos.state()->previous->accumulator;
-      auto& accumulator = pos.state()->accumulator;
-      IndexType i = 0;
+    void update_accumulator(const Position& pos) const {
+
+  #ifdef VECTOR
+      // Gcc-10.2 unnecessarily spills AVX2 registers if this array
+      // is defined in the VECTOR code below, once in each branch
+      vec_t acc[kNumRegs];
+  #endif
+    const auto& prev_accumulator = pos.state()->previous->accumulator;
+    auto& accumulator = pos.state()->accumulator;
+    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
      Features::IndexList removed_indices[2], added_indices[2];
-      bool reset[2];
-      RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
-                                        removed_indices, added_indices, reset);
+      bool reset[2] = { false, false };
+      RawFeatures::append_changed_indices(pos, kRefreshTriggers[i],
+                                          removed_indices, added_indices, reset);
+
+#ifdef VECTOR
+      for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
+        for (Color perspective : { WHITE, BLACK }) {
+          auto accTile = reinterpret_cast<vec_t*>(
+              &accumulator.accumulation[perspective][i][j * kTileHeight]);
+
+          if (reset[perspective]) {
+            if (i == 0) {
+              auto biasesTile = reinterpret_cast<const vec_t*>(
+                  &biases_[j * kTileHeight]);
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = biasesTile[k];
+            } else {
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = vec_zero;
+            }
+          } else {
+            auto prevAccTile = reinterpret_cast<const vec_t*>(
+                &prev_accumulator.accumulation[perspective][i][j * kTileHeight]);
+
+            for (IndexType k = 0; k < kNumRegs; ++k)
+              acc[k] = vec_load(&prevAccTile[k]);
+
+            // Difference calculation for the deactivated features
+            for (const auto index : removed_indices[perspective]) {
+              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = vec_sub_16(acc[k], column[k]);
+            }
+          }
+
+          { // Difference calculation for the activated features
+            for (const auto index : added_indices[perspective]) {
+              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = vec_add_16(acc[k], column[k]);
+            }
+          }
+
+          for (IndexType k = 0; k < kNumRegs; ++k)
+            vec_store(&accTile[k], acc[k]);
+        }
+      }
+#if defined(USE_MMX)
+      _mm_empty();
+#endif
+
+#else
      for (Color perspective : { WHITE, BLACK }) {

-  #if defined(USE_AVX2)
-        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-        auto accumulation = reinterpret_cast<__m256i*>(
-            &accumulator.accumulation[perspective][i][0]);
-
-  #elif defined(USE_SSE2)
-        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-        auto accumulation = reinterpret_cast<__m128i*>(
-            &accumulator.accumulation[perspective][i][0]);
-
-  #elif defined(USE_MMX)
-        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-        auto accumulation = reinterpret_cast<__m64*>(
-            &accumulator.accumulation[perspective][i][0]);
-
-  #elif defined(USE_NEON)
-        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-        auto accumulation = reinterpret_cast<int16x8_t*>(
-            &accumulator.accumulation[perspective][i][0]);
-  #endif
-
        if (reset[perspective]) {
-          std::memcpy(accumulator.accumulation[perspective][i], biases_,
-                      kHalfDimensions * sizeof(BiasType));
+          if (i == 0) {
+            std::memcpy(accumulator.accumulation[perspective][i], biases_,
+                        kHalfDimensions * sizeof(BiasType));
+          } else {
+            std::memset(accumulator.accumulation[perspective][i], 0,
+                        kHalfDimensions * sizeof(BiasType));
+          }
        } else {
          std::memcpy(accumulator.accumulation[perspective][i],
                      prev_accumulator.accumulation[perspective][i],
@@ -302,83 +481,22 @@ namespace Eval::NNUE {
          for (const auto index : removed_indices[perspective]) {
            const IndexType offset = kHalfDimensions * index;

-  #if defined(USE_AVX2)
-            auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
-              accumulation[j] = _mm256_sub_epi16(accumulation[j], column[j]);
-            }
-
-  #elif defined(USE_SSE2)
-            auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
-              accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
-            }
-
-  #elif defined(USE_MMX)
-            auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
-              accumulation[j] = _mm_sub_pi16(accumulation[j], column[j]);
-            }
-
-  #elif defined(USE_NEON)
-            auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
-              accumulation[j] = vsubq_s16(accumulation[j], column[j]);
-            }
-
-  #else
-            for (IndexType j = 0; j < kHalfDimensions; ++j) {
-              accumulator.accumulation[perspective][i][j] -=
-                  weights_[offset + j];
-            }
-  #endif
-
+            for (IndexType j = 0; j < kHalfDimensions; ++j)
+              accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
          }
        }
        { // Difference calculation for the activated features
          for (const auto index : added_indices[perspective]) {
            const IndexType offset = kHalfDimensions * index;

-  #if defined(USE_AVX2)
-            auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
-              accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
-            }
-
-  #elif defined(USE_SSE2)
-            auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
-              accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
-            }
-
-  #elif defined(USE_MMX)
-            auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
-              accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
-            }
-
-  #elif defined(USE_NEON)
-            auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
-              accumulation[j] = vaddq_s16(accumulation[j], column[j]);
-            }
-
-  #else
-            for (IndexType j = 0; j < kHalfDimensions; ++j) {
-              accumulator.accumulation[perspective][i][j] +=
-                  weights_[offset + j];
-            }
-  #endif
-
+            for (IndexType j = 0; j < kHalfDimensions; ++j)
+              accumulator.accumulation[perspective][i][j] += weights_[offset + j];
          }
        }
      }
-  #if defined(USE_MMX)
-      _mm_empty();
-  #endif
-
+#endif
+      }
      accumulator.computed_accumulation = true;
-      accumulator.computed_score = false;
    }

    using BiasType = std::int16_t;
@@ -1,201 +1,215 @@
-// USI extended command for NNUE evaluation function
-
-#if defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
-
-#include "../thread.h"
-#include "../uci.h"
-#include "evaluate_nnue.h"
+#include "evaluate_nnue.h"
 #include "nnue_test_command.h"

+#include "thread.h"
+#include "uci.h"
+
 #include <set>
 #include <fstream>

-#define ASSERT(X) { if (!(X)) { std::cout << "\nError : ASSERT(" << #X << "), " << __FILE__ << "(" << __LINE__ << "): " << __func__ << std::endl; \
- std::this_thread::sleep_for(std::chrono::microseconds(3000)); *(int*)1 =0;} }
-
-namespace Eval {
-
-namespace NNUE {
-
-namespace {
-
-// Testing RawFeatures mainly for difference calculation
-void TestFeatures(Position& pos) {
-  const std::uint64_t num_games = 1000;
-  StateInfo si;
-  pos.set(StartFEN, false, &si, Threads.main());
-  const int MAX_PLY = 256; // test up to 256 hands
-
-  StateInfo state[MAX_PLY]; // StateInfo only for the maximum number of steps
-  int ply; // Trouble from the initial phase
-
-  PRNG prng(20171128);
-
-  std::uint64_t num_moves = 0;
-  std::vector<std::uint64_t> num_updates(kRefreshTriggers.size() + 1);
-  std::vector<std::uint64_t> num_resets(kRefreshTriggers.size());
-  constexpr IndexType kUnknown = -1;
-  std::vector<IndexType> trigger_map(RawFeatures::kDimensions, kUnknown);
-  auto make_index_sets = [&](const Position& pos) {
-    std::vector<std::vector<std::set<IndexType>>> index_sets(
-        kRefreshTriggers.size(), std::vector<std::set<IndexType>>(2));
-    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-      Features::IndexList active_indices[2];
-      RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
-                                       active_indices);
-      for (const auto perspective : Colors) {
-        for (const auto index : active_indices[perspective]) {
-          ASSERT(index < RawFeatures::kDimensions);
-          ASSERT(index_sets[i][perspective].count(index) == 0);
-          ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
-          index_sets[i][perspective].insert(index);
-          trigger_map[index] = i;
-        }
-      }
-    }
-    return index_sets;
-  };
-  auto update_index_sets = [&](const Position& pos, auto* index_sets) {
-    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-      Features::IndexList removed_indices[2], added_indices[2];
-      bool reset[2];
-      RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
-                                        removed_indices, added_indices, reset);
-      for (const auto perspective : Colors) {
-        if (reset[perspective]) {
-          (*index_sets)[i][perspective].clear();
-          ++num_resets[i];
-        } else {
-          for (const auto index : removed_indices[perspective]) {
-            ASSERT(index < RawFeatures::kDimensions);
-            ASSERT((*index_sets)[i][perspective].count(index) == 1);
-            ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
-            (*index_sets)[i][perspective].erase(index);
-            ++num_updates.back();
-            ++num_updates[i];
-            trigger_map[index] = i;
-          }
-        }
-        for (const auto index : added_indices[perspective]) {
-          ASSERT(index < RawFeatures::kDimensions);
-          ASSERT((*index_sets)[i][perspective].count(index) == 0);
-          ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
-          (*index_sets)[i][perspective].insert(index);
-          ++num_updates.back();
-          ++num_updates[i];
-          trigger_map[index] = i;
-        }
-      }
-    }
-  };
-
-  std::cout << "feature set: " << RawFeatures::GetName()
-            << "[" << RawFeatures::kDimensions << "]" << std::endl;
-  std::cout << "start testing with random games";
-
-  for (std::uint64_t i = 0; i < num_games; ++i) {
-    auto index_sets = make_index_sets(pos);
-    for (ply = 0; ply < MAX_PLY; ++ply) {
-      MoveList<LEGAL> mg(pos); // Generate all legal hands
-
-      // There was no legal move == Clog
-      if (mg.size() == 0)
-        break;
-
-      // Randomly choose from the generated moves and advance the phase with the moves.
-      Move m = mg.begin()[prng.rand(mg.size())];
-      pos.do_move(m, state[ply]);
-
-      ++num_moves;
-      update_index_sets(pos, &index_sets);
-      ASSERT(index_sets == make_index_sets(pos));
-    }
-
-    pos.set(StartFEN, false, &si, Threads.main());
-
-    // Output'.' every 100 times (so you can see that it's progressing)
-    if ((i % 100) == 0)
-      std::cout << "." << std::flush;
-  }
-  std::cout << "passed." << std::endl;
-  std::cout << num_games << " games, " << num_moves << " moves, "
-            << num_updates.back() << " updates, "
-            << (1.0 * num_updates.back() / num_moves)
-            << " updates per move" << std::endl;
-  std::size_t num_observed_indices = 0;
-  for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-    const auto count = std::count(trigger_map.begin(), trigger_map.end(), i);
-    num_observed_indices += count;
-    std::cout << "TriggerEvent(" << static_cast<int>(kRefreshTriggers[i])
-              << "): " << count << " features ("
-              << (100.0 * count / RawFeatures::kDimensions) << "%), "
-              << num_updates[i] << " updates ("
-              << (1.0 * num_updates[i] / num_moves) << " per move), "
-              << num_resets[i] << " resets ("
-              << (100.0 * num_resets[i] / num_moves) << "%)"
-              << std::endl;
-  }
-  std::cout << "observed " << num_observed_indices << " ("
-            << (100.0 * num_observed_indices / RawFeatures::kDimensions)
-            << "% of " << RawFeatures::kDimensions
-            << ") features" << std::endl;
+#define ASSERT(X) { \
+    if (!(X)) { \
+        std::cout \
+            << "\nError : ASSERT(" << #X << "), " \
+            << __FILE__ << "(" << __LINE__ << "): " \
+            << __func__ << std::endl; \
+            std::this_thread::sleep_for(std::chrono::microseconds(3000)); \
+            *(int*)1 =0; \
+    } \
 }

-// Output a string that represents the structure of the evaluation function
-void PrintInfo(std::istream& stream) {
-  std::cout << "network architecture: " << GetArchitectureString() << std::endl;
-
-  while (true) {
-    std::string file_name;
-    stream >> file_name;
-    if (file_name.empty()) break;
-
-    std::uint32_t hash_value;
-    std::string architecture;
-    const bool success = [&]() {
-      std::ifstream file_stream(file_name, std::ios::binary);
-      if (!file_stream) return false;
-      if (!ReadHeader(file_stream, &hash_value, &architecture)) return false;
-      return true;
-    }();
-
-    std::cout << file_name << ": ";
-    if (success) {
-      if (hash_value == kHashValue) {
-        std::cout << "matches with this binary";
-        if (architecture != GetArchitectureString()) {
-          std::cout << ", but architecture string differs: " << architecture;
-        }
-        std::cout << std::endl;
-      } else {
-        std::cout << architecture << std::endl;
-      }
-    } else {
-      std::cout << "failed to read header" << std::endl;
-    }
-  }
-}
-
-}  // namespace
-
 // USI extended command for NNUE evaluation function
-void TestCommand(Position& pos, std::istream& stream) {
-  std::string sub_command;
-  stream >> sub_command;
+namespace Eval::NNUE {

-  if (sub_command == "test_features") {
-    TestFeatures(pos);
-  } else if (sub_command == "info") {
-    PrintInfo(stream);
-  } else {
-    std::cout << "usage:" << std::endl;
-    std::cout << " test nnue test_features" << std::endl;
-    std::cout << " test nnue info [path/to/" << fileName << "...]" << std::endl;
-  }
-}
+    namespace {

-}  // namespace NNUE
+        // Testing RawFeatures mainly for difference calculation
+        void test_features(Position& pos) {
+            const std::uint64_t num_games = 1000;
+            StateInfo si;
+            pos.set(StartFEN, false, &si, Threads.main());
+            const int MAX_PLY = 256; // play at most 256 plies per random game

-}  // namespace Eval
+            StateInfo state[MAX_PLY]; // one StateInfo per ply, up to MAX_PLY
+            int ply; // number of plies played from the starting position

-#endif  // defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+            PRNG prng(20171128);
+
+            std::uint64_t num_moves = 0;
+            std::vector<std::uint64_t> num_updates(kRefreshTriggers.size() + 1);
+            std::vector<std::uint64_t> num_resets(kRefreshTriggers.size());
+            constexpr IndexType kUnknown = -1;
+            std::vector<IndexType> trigger_map(RawFeatures::kDimensions, kUnknown);
+
+            auto make_index_sets = [&](const Position& position) {
+                std::vector<std::vector<std::set<IndexType>>> index_sets(
+                    kRefreshTriggers.size(), std::vector<std::set<IndexType>>(2));
+
+                for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+                    Features::IndexList active_indices[2];
+                    RawFeatures::append_active_indices(position, kRefreshTriggers[i],
+                                                     active_indices);
+
+                    for (const auto perspective : Colors) {
+                        for (const auto index : active_indices[perspective]) {
+                            ASSERT(index < RawFeatures::kDimensions);
+                            ASSERT(index_sets[i][perspective].count(index) == 0);
+                            ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
+                            index_sets[i][perspective].insert(index);
+                            trigger_map[index] = i;
+                        }
+                    }
+                }
+
+                return index_sets;
+            };
+
+            auto update_index_sets = [&](const Position& position, auto* index_sets) {
+                for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+                    Features::IndexList removed_indices[2], added_indices[2];
+                    bool reset[2] = { false, false };
+                    RawFeatures::append_changed_indices(position, kRefreshTriggers[i],
+                                                      removed_indices, added_indices, reset);
+                    for (const auto perspective : Colors) {
+                        if (reset[perspective]) {
+                            (*index_sets)[i][perspective].clear();
+                            ++num_resets[i];
+                        } else {
+                            for (const auto index : removed_indices[perspective]) {
+                                ASSERT(index < RawFeatures::kDimensions);
+                                ASSERT((*index_sets)[i][perspective].count(index) == 1);
+                                ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
+                                (*index_sets)[i][perspective].erase(index);
+                                ++num_updates.back();
+                                ++num_updates[i];
+                                trigger_map[index] = i;
+                            }
+                        }
+
+                        for (const auto index : added_indices[perspective]) {
+                            ASSERT(index < RawFeatures::kDimensions);
+                            ASSERT((*index_sets)[i][perspective].count(index) == 0);
+                            ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
+                            (*index_sets)[i][perspective].insert(index);
+                            ++num_updates.back();
+                            ++num_updates[i];
+                            trigger_map[index] = i;
+                        }
+                    }
+                }
+            };
+
+            std::cout << "feature set: " << RawFeatures::get_name()
+                      << "[" << RawFeatures::kDimensions << "]" << std::endl;
+            std::cout << "start testing with random games";
+
+            for (std::uint64_t i = 0; i < num_games; ++i) {
+                auto index_sets = make_index_sets(pos);
+                for (ply = 0; ply < MAX_PLY; ++ply) {
+                    MoveList<LEGAL> mg(pos); // Generate all legal moves
+
+                    // No legal moves left, so this game is over
+                    if (mg.size() == 0)
+                        break;
+
+                    // Pick one of the generated moves at random and play it on the board.
+                    Move m = mg.begin()[prng.rand(mg.size())];
+                    pos.do_move(m, state[ply]);
+
+                    ++num_moves;
+                    update_index_sets(pos, &index_sets);
+                    ASSERT(index_sets == make_index_sets(pos));
+                }
+
+                pos.set(StartFEN, false, &si, Threads.main());
+
+                // Print a '.' every 100 games so that progress is visible
+                if ((i % 100) == 0)
+                    std::cout << "." << std::flush;
+            }
+
+            std::cout << "passed." << std::endl;
+            std::cout << num_games << " games, " << num_moves << " moves, "
+                      << num_updates.back() << " updates, "
+                      << (1.0 * num_updates.back() / num_moves)
+                      << " updates per move" << std::endl;
+            std::size_t num_observed_indices = 0;
+
+            for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+                const auto count = std::count(trigger_map.begin(), trigger_map.end(), i);
+                num_observed_indices += count;
+                std::cout << "TriggerEvent(" << static_cast<int>(kRefreshTriggers[i])
+                          << "): " << count << " features ("
+                          << (100.0 * count / RawFeatures::kDimensions) << "%), "
+                          << num_updates[i] << " updates ("
+                          << (1.0 * num_updates[i] / num_moves) << " per move), "
+                          << num_resets[i] << " resets ("
+                          << (100.0 * num_resets[i] / num_moves) << "%)"
+                          << std::endl;
+            }
+            std::cout << "observed " << num_observed_indices << " ("
+                      << (100.0 * num_observed_indices / RawFeatures::kDimensions)
+                      << "% of " << RawFeatures::kDimensions
+                      << ") features" << std::endl;
+        }
+
+        // Output a string that represents the structure of the evaluation function
+        void print_info(std::istream& stream) {
+            std::cout << "network architecture: " << get_architecture_string() << std::endl;
+
+            while (true) {
+                std::string file_name;
+                stream >> file_name;
+                if (file_name.empty())
+                    break;
+
+                std::uint32_t hash_value;
+                std::string architecture;
+                const bool success = [&]() {
+                    std::ifstream file_stream(file_name, std::ios::binary);
+
+                    if (!file_stream)
+                        return false;
+                    if (!read_header(file_stream, &hash_value, &architecture))
+                        return false;
+
+                    return true;
+                }();
+
+                std::cout << file_name << ": ";
+                if (success) {
+                    if (hash_value == kHashValue) {
+                        std::cout << "matches with this binary";
+                        if (architecture != get_architecture_string()) {
+                            std::cout << ", but architecture string differs: " << architecture;
+                        }
+
+                        std::cout << std::endl;
+                    } else {
+                        std::cout << architecture << std::endl;
+                    }
+                } else {
+                    std::cout << "failed to read header" << std::endl;
+                }
+            }
+        }
+
+    }  // namespace
+
+    // USI extended command for NNUE evaluation function
+    void test_command(Position& pos, std::istream& stream) {
+        std::string sub_command;
+        stream >> sub_command;
+
+        if (sub_command == "test_features") {
+            test_features(pos);
+        } else if (sub_command == "info") {
+            print_info(stream);
+        } else {
+            std::cout << "usage:" << std::endl;
+            std::cout << " test nnue test_features" << std::endl;
+            std::cout << " test nnue info [path/to/" << fileName << "...]" << std::endl;
+        }
+    }
+
+}  // namespace Eval::NNUE
@@ -1,21 +1,12 @@
-// USI extended command interface for NNUE evaluation function
-
-#ifndef _NNUE_TEST_COMMAND_H_
+#ifndef _NNUE_TEST_COMMAND_H_
 #define _NNUE_TEST_COMMAND_H_

-#if defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+// USI extended command interface for NNUE evaluation function
+namespace Eval::NNUE {

-namespace Eval {
+    // USI extended command for NNUE evaluation function
+    void test_command(Position& pos, std::istream& stream);

-namespace NNUE {
-
-// USI extended command for NNUE evaluation function
-void TestCommand(Position& pos, std::istream& stream);
-
-}  // namespace NNUE
-
-}  // namespace Eval
-
-#endif  // defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+}  // namespace Eval::NNUE

 #endif
@@ -0,0 +1,10 @@
+#ifndef _NNUE_TRAINER_FEATURES_ALL_FACTORIZERS_H_
+#define _NNUE_TRAINER_FEATURES_ALL_FACTORIZERS_H_
+
+#include "factorizer.h"
+#include "factorizer_feature_set.h"
+
+#include "factorizer_half_kp.h"
+#include "factorizer_half_ka.h"
+
+#endif
@@ -1,110 +1,117 @@
-// NNUE evaluation function feature conversion class template
-
-#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_H_
+#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_H_

-#if defined(EVAL_NNUE)
+#include "nnue/nnue_common.h"

-#include "../../nnue_common.h"
-#include "../trainer.h"
+#include "nnue/trainer/trainer.h"

-namespace Eval {
+// NNUE evaluation function feature conversion class template
+namespace Eval::NNUE::Features {

-namespace NNUE {
+    // Class template that converts input features into learning features
+    // By default, the learning feature is the same as the original input feature, and specialized as necessary
+    template <typename FeatureType>
+    class Factorizer {
+    public:
+        static constexpr std::string get_name() {
+            return "Factorizer<" + FeatureType::get_name() + "> -> " + std::string("No factorizer");
+        }

-namespace Features {
+        static constexpr std::string get_factorizers_string() {
+            return "  - " + get_name();
+        }

-// Class template that converts input features into learning features
-// By default, the learning feature is the same as the original input feature, and specialized as necessary
-template <typename FeatureType>
-class Factorizer {
- public:
-  // Get the dimensionality of the learning feature
-  static constexpr IndexType GetDimensions() {
-    return FeatureType::kDimensions;
-  }
+        // Get the dimensionality of the learning feature
+        static constexpr IndexType get_dimensions() {
+            return FeatureType::kDimensions;
+        }

-  // Get index of learning feature and scale of learning rate
-  static void AppendTrainingFeatures(
-      IndexType base_index, std::vector<TrainingFeature>* training_features) {
-    assert(base_index <FeatureType::kDimensions);
-    training_features->emplace_back(base_index);
-  }
-};
+        // Get index of learning feature and scale of learning rate
+        static void append_training_features(
+            IndexType base_index, std::vector<TrainingFeature>* training_features) {

-// Learning feature information
-struct FeatureProperties {
-  bool active;
-  IndexType dimensions;
-};
+            assert(base_index <FeatureType::kDimensions);
+            training_features->emplace_back(base_index);
+        }
+    };

-// Add the original input features to the learning features
-template <typename FeatureType>
-IndexType AppendBaseFeature(
-    FeatureProperties properties, IndexType base_index,
-    std::vector<TrainingFeature>* training_features) {
-  assert(properties.dimensions == FeatureType::kDimensions);
-  assert(base_index < FeatureType::kDimensions);
-  training_features->emplace_back(base_index);
-  return properties.dimensions;
-}
+    // Learning feature information (one entry per kind of learning feature a factorizer can emit)
+    struct FeatureProperties {
+        bool active;            // inactive entries are skipped: the inherit/skip helpers return 0 for them
+        IndexType dimensions;   // size of the index range the entry occupies; returned as the offset increment when active
+    };

-// If the learning rate scale is not 0, inherit other types of learning features
-template <typename FeatureType>
-IndexType InheritFeaturesIfRequired(
-    IndexType index_offset, FeatureProperties properties, IndexType base_index,
-    std::vector<TrainingFeature>* training_features) {
-  if (!properties.active) {
-    return 0;
-  }
-  assert(properties.dimensions == Factorizer<FeatureType>::GetDimensions());
-  assert(base_index < FeatureType::kDimensions);
-  const auto start = training_features->size();
-  Factorizer<FeatureType>::AppendTrainingFeatures(
-      base_index, training_features);
-  for (auto i = start; i < training_features->size(); ++i) {
-    auto& feature = (*training_features)[i];
-    assert(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
-    feature.ShiftIndex(index_offset);
-  }
-  return properties.dimensions;
-}
+    // Add the original input features to the learning features
+    template <typename FeatureType>
+    IndexType append_base_feature(
+        FeatureProperties properties, IndexType base_index,
+        std::vector<TrainingFeature>* training_features) {

-// Return the index difference as needed, without adding learning features
-// Call instead of InheritFeaturesIfRequired() if there are no corresponding features
-IndexType SkipFeatures(FeatureProperties properties) {
-  if (!properties.active) {
-    return 0;
-  }
-  return properties.dimensions;
-}
-
-// Get the dimensionality of the learning feature
-template <std::size_t N>
-constexpr IndexType GetActiveDimensions(
-    const FeatureProperties (&properties)[N]) {
-  static_assert(N > 0, "");
-  IndexType dimensions = properties[0].dimensions;
-  for (std::size_t i = 1; i < N; ++i) {
-    if (properties[i].active) {
-      dimensions += properties[i].dimensions;
+        assert(properties.dimensions == FeatureType::kDimensions);
+        assert(base_index < FeatureType::kDimensions);
+        training_features->emplace_back(base_index);
+        return properties.dimensions;
    }
-  }
-  return dimensions;
-}

-// get the number of elements in the array
-template <typename T, std::size_t N>
-constexpr std::size_t GetArrayLength(const T (&/*array*/)[N]) {
-  return N;
-}
+    // If the learning rate scale is not 0, inherit other types of learning features
+    template <typename FeatureType>
+    IndexType inherit_features_if_required(
+        IndexType index_offset, FeatureProperties properties, IndexType base_index,
+        std::vector<TrainingFeature>* training_features) {

-}  // namespace Features
+        if (!properties.active) {
+            return 0;
+        }

-}  // namespace NNUE
+        assert(properties.dimensions == Factorizer<FeatureType>::get_dimensions());
+        assert(base_index < FeatureType::kDimensions);

-}  // namespace Eval
+        const auto start = training_features->size();
+        Factorizer<FeatureType>::append_training_features(
+            base_index, training_features);

-#endif  // defined(EVAL_NNUE)
+        for (auto i = start; i < training_features->size(); ++i) {
+            auto& feature = (*training_features)[i];
+            assert(feature.get_index() < Factorizer<FeatureType>::get_dimensions());
+            feature.shift_index(index_offset);
+        }
+
+        return properties.dimensions;
+    }
+
+    // Return the index offset a feature occupies (0 when inactive) without adding any learning features.
+    // Call instead of inherit_features_if_required() if there are no corresponding features
+    inline IndexType skip_features(FeatureProperties properties) {  // inline: defined in a header, avoids multiple-definition (ODR) errors
+        if (!properties.active)
+            return 0;
+
+        return properties.dimensions;
+    }
+
+    // Get the dimensionality of the learning feature: properties[0] is always counted, later entries only when active
+    template <std::size_t N>
+    constexpr IndexType get_active_dimensions(
+        const FeatureProperties (&properties)[N]) {
+
+        static_assert(N > 0, "");
+
+        IndexType dimensions = properties[0].dimensions;
+
+        for (std::size_t i = 1; i < N; ++i) {
+            if (properties[i].active) {
+                dimensions += properties[i].dimensions;
+            }
+        }
+
+        return dimensions;
+    }
+
+    // Get the number of elements of a built-in array (N is deduced from the argument's type; the array itself is not read)
+    template <typename T, std::size_t N>
+    constexpr std::size_t get_array_length(const T (&/*array*/)[N]) {
+        return N;
+    }
+
+}  // namespace Eval::NNUE::Features

 #endif
@@ -1,104 +1,121 @@
-// Specialization for feature set of feature conversion class template of NNUE evaluation function
-
-#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
+#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_

-#if defined(EVAL_NNUE)
-
-#include "../../features/feature_set.h"
 #include "factorizer.h"

-namespace Eval {
+#include "nnue/features/feature_set.h"

-namespace NNUE {
+// Specialization for feature set of feature conversion class template of NNUE evaluation function
+namespace Eval::NNUE::Features {

-namespace Features {
+    // Class template that converts input features into learning features
+    // Specialization for FeatureSet
+    template <typename FirstFeatureType, typename... RemainingFeatureTypes>
+    class Factorizer<FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
+    private:
+        using Head = Factorizer<FeatureSet<FirstFeatureType>>;
+        using Tail = Factorizer<FeatureSet<RemainingFeatureTypes...>>;

-// Class template that converts input features into learning features
-// Specialization for FeatureSet
-template <typename FirstFeatureType, typename... RemainingFeatureTypes>
-class Factorizer<FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
- private:
-  using Head = Factorizer<FeatureSet<FirstFeatureType>>;
-  using Tail = Factorizer<FeatureSet<RemainingFeatureTypes...>>;
+    public:
+        // number of dimensions of original input features
+        static constexpr IndexType kBaseDimensions =
+            FeatureSet<FirstFeatureType, RemainingFeatureTypes...>::kDimensions;

- public:
-  // number of dimensions of original input features
-  static constexpr IndexType kBaseDimensions =
-      FeatureSet<FirstFeatureType, RemainingFeatureTypes...>::kDimensions;
-
-  // Get the dimensionality of the learning feature
-  static constexpr IndexType GetDimensions() {
-    return Head::GetDimensions() + Tail::GetDimensions();
-  }
-
-  // Get index of learning feature and scale of learning rate
-  static void AppendTrainingFeatures(
-      IndexType base_index, std::vector<TrainingFeature>* training_features,
-      IndexType base_dimensions = kBaseDimensions) {
-    assert(base_index < kBaseDimensions);
-    constexpr auto boundary = FeatureSet<RemainingFeatureTypes...>::kDimensions;
-    if (base_index < boundary) {
-      Tail::AppendTrainingFeatures(
-          base_index, training_features, base_dimensions);
-    } else {
-      const auto start = training_features->size();
-      Head::AppendTrainingFeatures(
-          base_index - boundary, training_features, base_dimensions);
-      for (auto i = start; i < training_features->size(); ++i) {
-        auto& feature = (*training_features)[i];
-        const auto index = feature.GetIndex();
-        assert(index < Head::GetDimensions() ||
-                   (index >= base_dimensions &&
-                    index < base_dimensions +
-                            Head::GetDimensions() - Head::kBaseDimensions));
-        if (index < Head::kBaseDimensions) {
-          feature.ShiftIndex(Tail::kBaseDimensions);
-        } else {
-          feature.ShiftIndex(Tail::GetDimensions() - Tail::kBaseDimensions);
+        static constexpr std::string get_factorizers_string() {
+            std::string str = "  - ";
+            str += Head::get_name();
+            str += '\n';
+            str += Tail::get_factorizers_string();
+            return str;
        }
-      }
-    }
-  }
-};

-// Class template that converts input features into learning features
-// Specialization when FeatureSet has one template argument
-template <typename FeatureType>
-class Factorizer<FeatureSet<FeatureType>> {
-public:
-  // number of dimensions of original input features
-  static constexpr IndexType kBaseDimensions = FeatureType::kDimensions;
+        // Get the dimensionality of the learning feature
+        static constexpr IndexType get_dimensions() {
+            return Head::get_dimensions() + Tail::get_dimensions();
+        }

-  // Get the dimensionality of the learning feature
-  static constexpr IndexType GetDimensions() {
-    return Factorizer<FeatureType>::GetDimensions();
-  }
+        // Get index of learning feature and scale of learning rate
+        static void append_training_features(
+            IndexType base_index, std::vector<TrainingFeature>* training_features,
+            IndexType base_dimensions = kBaseDimensions) {

-  // Get index of learning feature and scale of learning rate
-  static void AppendTrainingFeatures(
-      IndexType base_index, std::vector<TrainingFeature>* training_features,
-      IndexType base_dimensions = kBaseDimensions) {
-    assert(base_index < kBaseDimensions);
-    const auto start = training_features->size();
-    Factorizer<FeatureType>::AppendTrainingFeatures(
-        base_index, training_features);
-    for (auto i = start; i < training_features->size(); ++i) {
-      auto& feature = (*training_features)[i];
-      assert(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
-      if (feature.GetIndex() >= kBaseDimensions) {
-        feature.ShiftIndex(base_dimensions - kBaseDimensions);
-      }
-    }
-  }
-};
+            assert(base_index < kBaseDimensions);

-}  // namespace Features
+            constexpr auto boundary = FeatureSet<RemainingFeatureTypes...>::kDimensions;

-}  // namespace NNUE
+            if (base_index < boundary) {
+                Tail::append_training_features(
+                    base_index, training_features, base_dimensions);
+            }
+            else {
+                const auto start = training_features->size();

-}  // namespace Eval
+                Head::append_training_features(
+                    base_index - boundary, training_features, base_dimensions);

-#endif  // defined(EVAL_NNUE)
+                for (auto i = start; i < training_features->size(); ++i) {
+                    auto& feature = (*training_features)[i];
+                    const auto index = feature.get_index();
+
+                    assert(index < Head::get_dimensions() ||
+                               (index >= base_dimensions &&
+                                index < base_dimensions +
+                                        Head::get_dimensions() - Head::kBaseDimensions));
+
+                    if (index < Head::kBaseDimensions) {
+                        feature.shift_index(Tail::kBaseDimensions);
+                    }
+                    else {
+                        feature.shift_index(Tail::get_dimensions() - Tail::kBaseDimensions);
+                    }
+                }
+            }
+        }
+    };
+
+    // Class template that converts input features into learning features
+    // Specialization when FeatureSet has one template argument: forwards to Factorizer<FeatureType>
+    template <typename FeatureType>
+    class Factorizer<FeatureSet<FeatureType>> {
+    public:
+        // Number of dimensions of original input features
+        static constexpr IndexType kBaseDimensions = FeatureType::kDimensions;
+
+        static constexpr std::string get_name() {  // NOTE(review): std::string is not a literal type before C++20, so constexpr has no practical effect there
+            return Factorizer<FeatureType>::get_name();
+        }
+
+        static constexpr std::string get_factorizers_string() {
+            return "  - " + get_name();
+        }
+
+        // Get the dimensionality of the learning feature
+        static constexpr IndexType get_dimensions() {
+            return Factorizer<FeatureType>::get_dimensions();
+        }
+
+        // Append the learning features for base_index; indices at or above kBaseDimensions are shifted so they start at base_dimensions
+        static void append_training_features(
+            IndexType base_index, std::vector<TrainingFeature>* training_features,
+            IndexType base_dimensions = kBaseDimensions) {
+
+            assert(base_index < kBaseDimensions);
+
+            const auto start = training_features->size();  // only the entries appended by the call below are post-processed
+
+            Factorizer<FeatureType>::append_training_features(
+                base_index, training_features);
+
+            for (auto i = start; i < training_features->size(); ++i) {
+                auto& feature = (*training_features)[i];
+                assert(feature.get_index() < Factorizer<FeatureType>::get_dimensions());
+                if (feature.get_index() >= kBaseDimensions) {
+                    feature.shift_index(base_dimensions - kBaseDimensions);
+                }
+            }
+        }
+    };
+
+}  // namespace Eval::NNUE::Features

 #endif
@@ -0,0 +1,93 @@
+#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_
+#define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_
+
+#include "factorizer.h"
+
+#include "nnue/features/half_ka.h"
+#include "nnue/features/a.h"
+#include "nnue/features/half_relative_ka.h"
+
+// Specialization of NNUE evaluation function feature conversion class template for HalfKA
+namespace Eval::NNUE::Features {
+
+    // Class template that converts input features into learning features
+    // Specialization for HalfKA: emits the base HalfKA index plus the factorized A and HalfRelativeKA indices
+    template <Side AssociatedKing>
+    class Factorizer<HalfKA<AssociatedKing>> {
+    private:
+        using FeatureType = HalfKA<AssociatedKing>;
+
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions =
+            FeatureType::kMaxActiveDimensions;
+
+        // Type of learning feature (enumerator order must match the order of kProperties below)
+        enum TrainingFeatureType {
+            kFeaturesHalfKA,
+            kFeaturesA,
+            kFeaturesHalfRelativeKA,
+            kNumTrainingFeatureTypes,
+        };
+
+        // Learning feature information
+        static constexpr FeatureProperties kProperties[] = {
+            // kFeaturesHalfKA
+            {true, FeatureType::kDimensions},
+            // kFeaturesA
+            {true, Factorizer<A>::get_dimensions()},
+            // kFeaturesHalfRelativeKA
+            {true, Factorizer<HalfRelativeKA<AssociatedKing>>::get_dimensions()},
+        };
+
+        static_assert(get_array_length(kProperties) == kNumTrainingFeatureTypes, "");
+
+    public:
+        static constexpr std::string get_name() {
+            return std::string("Factorizer<") + FeatureType::kName + "> -> " + "A, HalfRelativeKA";
+        }
+
+        static constexpr std::string get_factorizers_string() {
+            return "  - " + get_name();
+        }
+
+        // Get the dimensionality of the learning feature
+        static constexpr IndexType get_dimensions() {
+            return get_active_dimensions(kProperties);
+        }
+
+        // Get index of learning feature and scale of learning rate
+        static void append_training_features(
+            IndexType base_index, std::vector<TrainingFeature>* training_features) {
+
+            // kFeaturesHalfKA
+            IndexType index_offset = append_base_feature<FeatureType>(
+                kProperties[kFeaturesHalfKA], base_index, training_features);
+
+            const auto sq_k = static_cast<Square>(base_index / PS_END2);  // base_index == sq_k * PS_END2 + a
+            const auto a = static_cast<IndexType>(base_index % PS_END2);
+
+            // kFeaturesA
+            index_offset += inherit_features_if_required<A>(
+                index_offset, kProperties[kFeaturesA], a, training_features);
+
+            // kFeaturesHalfRelativeKA
+            if (a >= PS_W_PAWN) {
+                index_offset += inherit_features_if_required<HalfRelativeKA<AssociatedKing>>(
+                    index_offset, kProperties[kFeaturesHalfRelativeKA],
+                    HalfRelativeKA<AssociatedKing>::make_index(sq_k, a),
+                    training_features);
+            }
+            else {
+                index_offset += skip_features(kProperties[kFeaturesHalfRelativeKA]);
+            }
+
+            assert(index_offset == get_dimensions());
+        }
+    };
+
+    template <Side AssociatedKing>
+    constexpr FeatureProperties Factorizer<HalfKA<AssociatedKing>>::kProperties[];  // out-of-class definition; redundant (and deprecated) since C++17, where static constexpr members are implicitly inline
+
+}  // namespace Eval::NNUE::Features
+
+#endif // #ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_
@@ -1,103 +1,104 @@
-// Specialization of NNUE evaluation function feature conversion class template for HalfKP
-
-#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
+#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_

-#if defined(EVAL_NNUE)
-
-#include "../../features/half_kp.h"
-#include "../../features/p.h"
-#include "../../features/half_relative_kp.h"
 #include "factorizer.h"

-namespace Eval {
+#include "nnue/features/half_kp.h"
+#include "nnue/features/p.h"
+#include "nnue/features/half_relative_kp.h"

-namespace NNUE {
+// Specialization of NNUE evaluation function feature conversion class template for HalfKP
+namespace Eval::NNUE::Features {

-namespace Features {
+    // Class template that converts input features into learning features
+    // Specialization for HalfKP
+    template <Side AssociatedKing>
+    class Factorizer<HalfKP<AssociatedKing>> {
+    private:
+        using FeatureType = HalfKP<AssociatedKing>;

-// Class template that converts input features into learning features
-// Specialization for HalfKP
-template <Side AssociatedKing>
-class Factorizer<HalfKP<AssociatedKing>> {
- private:
-  using FeatureType = HalfKP<AssociatedKing>;
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions =
+            FeatureType::kMaxActiveDimensions;

-  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-  static constexpr IndexType kMaxActiveDimensions =
-      FeatureType::kMaxActiveDimensions;
+        // Type of learning feature
+        enum TrainingFeatureType {
+            kFeaturesHalfKP,
+            kFeaturesHalfK,
+            kFeaturesP,
+            kFeaturesHalfRelativeKP,
+            kNumTrainingFeatureTypes,
+        };

-  // Type of learning feature
-  enum TrainingFeatureType {
-    kFeaturesHalfKP,
-    kFeaturesHalfK,
-    kFeaturesP,
-    kFeaturesHalfRelativeKP,
-    kNumTrainingFeatureTypes,
-  };
+        // Learning feature information
+        static constexpr FeatureProperties kProperties[] = {
+            // kFeaturesHalfKP
+            {true, FeatureType::kDimensions},
+            // kFeaturesHalfK
+            {true, SQUARE_NB},
+            // kFeaturesP
+            {true, Factorizer<P>::get_dimensions()},
+            // kFeaturesHalfRelativeKP
+            {true, Factorizer<HalfRelativeKP<AssociatedKing>>::get_dimensions()},
+        };

-  // Learning feature information
-  static constexpr FeatureProperties kProperties[] = {
-    // kFeaturesHalfKP
-    {true, FeatureType::kDimensions},
-    // kFeaturesHalfK
-    {true, SQUARE_NB},
-    // kFeaturesP
-    {true, Factorizer<P>::GetDimensions()},
-    // kFeaturesHalfRelativeKP
-    {true, Factorizer<HalfRelativeKP<AssociatedKing>>::GetDimensions()},
-  };
-  static_assert(GetArrayLength(kProperties) == kNumTrainingFeatureTypes, "");
+        static_assert(get_array_length(kProperties) == kNumTrainingFeatureTypes, "");

- public:
-  // Get the dimensionality of the learning feature
-  static constexpr IndexType GetDimensions() {
-    return GetActiveDimensions(kProperties);
-  }
+    public:
+        static constexpr std::string get_name() {
+            return std::string("Factorizer<") + FeatureType::kName + "> -> " + "HalfK, P, HalfRelativeKP";
+        }

-  // Get index of learning feature and scale of learning rate
-  static void AppendTrainingFeatures(
-      IndexType base_index, std::vector<TrainingFeature>* training_features) {
-    // kFeaturesHalfKP
-    IndexType index_offset = AppendBaseFeature<FeatureType>(
-        kProperties[kFeaturesHalfKP], base_index, training_features);
+        static constexpr std::string get_factorizers_string() {
+            return "  - " + get_name();
+        }

-    const auto sq_k = static_cast<Square>(base_index / PS_END);
-    const auto p = static_cast<IndexType>(base_index % PS_END);
-    // kFeaturesHalfK
-    {
-      const auto& properties = kProperties[kFeaturesHalfK];
-      if (properties.active) {
-        training_features->emplace_back(index_offset + sq_k);
-        index_offset += properties.dimensions;
-      }
-    }
-    // kFeaturesP
-    index_offset += InheritFeaturesIfRequired<P>(
-        index_offset, kProperties[kFeaturesP], p, training_features);
-    // kFeaturesHalfRelativeKP
-    if (p >= PS_W_PAWN) {
-      index_offset += InheritFeaturesIfRequired<HalfRelativeKP<AssociatedKing>>(
-          index_offset, kProperties[kFeaturesHalfRelativeKP],
-          HalfRelativeKP<AssociatedKing>::MakeIndex(sq_k, p),
-          training_features);
-    } else {
-      index_offset += SkipFeatures(kProperties[kFeaturesHalfRelativeKP]);
-    }
+        // Get the dimensionality of the learning feature
+        static constexpr IndexType get_dimensions() {
+            return get_active_dimensions(kProperties);
+        }

-    assert(index_offset == GetDimensions());
-  }
-};
+        // Get index of learning feature and scale of learning rate
+        static void append_training_features(
+            IndexType base_index, std::vector<TrainingFeature>* training_features) {

-template <Side AssociatedKing>
-constexpr FeatureProperties Factorizer<HalfKP<AssociatedKing>>::kProperties[];
+            // kFeaturesHalfKP
+            IndexType index_offset = append_base_feature<FeatureType>(
+                kProperties[kFeaturesHalfKP], base_index, training_features);

-}  // namespace Features
+            const auto sq_k = static_cast<Square>(base_index / PS_END);
+            const auto p = static_cast<IndexType>(base_index % PS_END);

-}  // namespace NNUE
+            // kFeaturesHalfK
+            {
+                const auto& properties = kProperties[kFeaturesHalfK];
+                if (properties.active) {
+                    training_features->emplace_back(index_offset + sq_k);
+                    index_offset += properties.dimensions;
+                }
+            }

-}  // namespace Eval
+            // kFeaturesP
+            index_offset += inherit_features_if_required<P>(
+                index_offset, kProperties[kFeaturesP], p, training_features);
+            // kFeaturesHalfRelativeKP
+            if (p >= PS_W_PAWN) {
+                index_offset += inherit_features_if_required<HalfRelativeKP<AssociatedKing>>(
+                    index_offset, kProperties[kFeaturesHalfRelativeKP],
+                    HalfRelativeKP<AssociatedKing>::make_index(sq_k, p),
+                    training_features);
+            }
+            else {
+                index_offset += skip_features(kProperties[kFeaturesHalfRelativeKP]);
+            }

-#endif  // defined(EVAL_NNUE)
+            assert(index_offset == get_dimensions());
+        }
+    };
+
+    template <Side AssociatedKing>
+    constexpr FeatureProperties Factorizer<HalfKP<AssociatedKing>>::kProperties[];
+
+}  // namespace Eval::NNUE::Features

 #endif
@@ -1,125 +1,122 @@
-// Common header of class template for learning NNUE evaluation function
-
-#ifndef _NNUE_TRAINER_H_
+#ifndef _NNUE_TRAINER_H_
 #define _NNUE_TRAINER_H_

-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
-
-#include "../nnue_common.h"
-#include "../features/index_list.h"
+#include "nnue/nnue_common.h"
+#include "nnue/features/index_list.h"

 #include <sstream>
+
 #if defined(USE_BLAS)
 static_assert(std::is_same<LearnFloatType, float>::value, "");
 #include <cblas.h>
 #endif

-namespace Eval {
+// Common header of class template for learning NNUE evaluation function
+namespace Eval::NNUE {

-namespace NNUE {
+    // Ponanza constant used in the relation between evaluation value and winning percentage
+    constexpr double kPonanzaConstant = 600.0;

-// Ponanza constant used in the relation between evaluation value and winning percentage
-constexpr double kPonanzaConstant = 600.0;
+    // Class that represents one index of learning feature
+    class TrainingFeature {
+        using StorageType = std::uint32_t;
+        static_assert(std::is_unsigned<StorageType>::value, "");

-// Class that represents one index of learning feature
-class TrainingFeature {
-  using StorageType = std::uint32_t;
-  static_assert(std::is_unsigned<StorageType>::value, "");
+    public:
+        static constexpr std::uint32_t kIndexBits = 24;

- public:
-  static constexpr std::uint32_t kIndexBits = 24;
-  static_assert(kIndexBits < std::numeric_limits<StorageType>::digits, "");
-  static constexpr std::uint32_t kCountBits =
-      std::numeric_limits<StorageType>::digits - kIndexBits;
+        static_assert(kIndexBits < std::numeric_limits<StorageType>::digits, "");

-  explicit TrainingFeature(IndexType index) :
-      index_and_count_((index << kCountBits) | 1) {
-    assert(index < (1 << kIndexBits));
-  }
-  TrainingFeature& operator+=(const TrainingFeature& other) {
-    assert(other.GetIndex() == GetIndex());
-    assert(other.GetCount() + GetCount() < (1 << kCountBits));
-    index_and_count_ += other.GetCount();
-    return *this;
-  }
-  IndexType GetIndex() const {
-    return static_cast<IndexType>(index_and_count_ >> kCountBits);
-  }
-  void ShiftIndex(IndexType offset) {
-    assert(GetIndex() + offset < (1 << kIndexBits));
-    index_and_count_ += offset << kCountBits;
-  }
-  IndexType GetCount() const {
-    return static_cast<IndexType>(index_and_count_ & ((1 << kCountBits) - 1));
-  }
-  bool operator<(const TrainingFeature& other) const {
-    return index_and_count_ < other.index_and_count_;
-  }
+        static constexpr std::uint32_t kCountBits =
+            std::numeric_limits<StorageType>::digits - kIndexBits;

- private:
-  StorageType index_and_count_;
-};
+        explicit TrainingFeature(IndexType index) :
+            index_and_count_((index << kCountBits) | 1) {

-// Structure that represents one sample of training data
-struct Example {
-  std::vector<TrainingFeature> training_features[2];
-  Learner::PackedSfenValue psv;
-  int sign;
-  double weight;
-};
+            assert(index < (1 << kIndexBits));
+        }

-// Message used for setting hyperparameters
-struct Message {
-  Message(const std::string& name, const std::string& value = ""):
-      name(name), value(value), num_peekers(0), num_receivers(0) {}
-  const std::string name;
-  const std::string value;
-  std::uint32_t num_peekers;
-  std::uint32_t num_receivers;
-};
+        TrainingFeature& operator+=(const TrainingFeature& other) {
+            assert(other.get_index() == get_index());
+            assert(other.get_count() + get_count() < (1 << kCountBits));
+            index_and_count_ += other.get_count();
+            return *this;
+        }

-// determine whether to accept the message
-bool ReceiveMessage(const std::string& name, Message* message) {
-  const auto subscript = "[" + std::to_string(message->num_peekers) + "]";
-  if (message->name.substr(0, name.size() + 1) == name + "[") {
-    ++message->num_peekers;
-  }
-  if (message->name == name || message->name == name + subscript) {
-    ++message->num_receivers;
-    return true;
-  }
-  return false;
-}
+        IndexType get_index() const {
+            return static_cast<IndexType>(index_and_count_ >> kCountBits);
+        }

-// split the string
-std::vector<std::string> Split(const std::string& input, char delimiter) {
-  std::istringstream stream(input);
-  std::string field;
-  std::vector<std::string> fields;
-  while (std::getline(stream, field, delimiter)) {
-    fields.push_back(field);
-  }
-  return fields;
-}
+        void shift_index(IndexType offset) {
+            assert(get_index() + offset < (1 << kIndexBits));
+            index_and_count_ += offset << kCountBits;
+        }

-// round a floating point number to an integer
-template <typename IntType>
-IntType Round(double value) {
-  return static_cast<IntType>(std::floor(value + 0.5));
-}
+        IndexType get_count() const {
+            return static_cast<IndexType>(index_and_count_ & ((1 << kCountBits) - 1));
+        }

-// make_shared with alignment
-template <typename T, typename... ArgumentTypes>
-std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments) {
-  const auto ptr = new(std_aligned_alloc(alignof(T), sizeof(T)))
-      T(std::forward<ArgumentTypes>(arguments)...);
-  return std::shared_ptr<T>(ptr, AlignedDeleter<T>());
-}
+        bool operator<(const TrainingFeature& other) const {
+            return index_and_count_ < other.index_and_count_;
+        }

-}  // namespace NNUE
+    private:
+        StorageType index_and_count_;
+    };

-}  // namespace Eval
+    // Structure that represents one sample of training data
+    struct Example {
+        std::vector<TrainingFeature> training_features[2];  // presumably one feature list per perspective — TODO confirm
+        Learner::PackedSfenValue psv;
+        Value discrete_nn_eval;
+        int sign;
+        double weight;
+    };

-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+    // Message used for setting hyperparameters (matched against receiver names by receive_message())
+    struct Message {
+        Message(const std::string& message_name, const std::string& message_value = "") :
+            name(message_name), value(message_value), num_peekers(0), num_receivers(0)
+        {
+        }
+
+        const std::string name;       // option name, optionally suffixed with "[k]" to address a single receiver
+        const std::string value;      // option value as text
+        std::uint32_t num_peekers;    // incremented by receive_message() whenever this name starts with "<receiver>["
+        std::uint32_t num_receivers;  // incremented by receive_message() each time the message is accepted
+    };
+
+    // Determine whether the receiver called `name` accepts the message: true on an exact name match, or on "name[k]" where k is the peeker count so far
+    inline bool receive_message(const std::string& name, Message* message) {  // inline: defined in a header, avoids multiple-definition (ODR) errors
+        const auto subscript = "[" + std::to_string(message->num_peekers) + "]";  // built before num_peekers is incremented below
+
+        if (message->name.substr(0, name.size() + 1) == name + "[") {
+            ++message->num_peekers;
+        }
+
+        if (message->name == name || message->name == name + subscript) {
+            ++message->num_receivers;
+            return true;
+        }
+
+        return false;
+    }
+
+    // Round a floating point number to the nearest integer (halves round up: floor(value + 0.5)) and cast the result to IntType
+    template <typename IntType>
+    IntType round(double value) {
+        return static_cast<IntType>(std::floor(value + 0.5));
+    }
+
+    // make_shared with alignment. NOTE(review): the std_aligned_alloc() result is passed to placement-new unchecked — confirm it cannot return null
+    template <typename T, typename... ArgumentTypes>
+    std::shared_ptr<T> make_aligned_shared_ptr(ArgumentTypes&&... arguments) {
+        const auto ptr = new(std_aligned_alloc(alignof(T), sizeof(T)))
+            T(std::forward<ArgumentTypes>(arguments)...);
+
+        return std::shared_ptr<T>(ptr, AlignedDeleter<T>());
+    }
+
+}  // namespace Eval::NNUE

 #endif
@@ -1,301 +1,476 @@
-// Specialization of NNUE evaluation function learning class template for AffineTransform
-
-#ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_
+#ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_
 #define _NNUE_TRAINER_AFFINE_TRANSFORM_H_

-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
-
-#include "../../learn/learn.h"
-#include "../layers/affine_transform.h"
 #include "trainer.h"

+#include "extra/stockfish_blas.h"
+
+#include "learn/learn.h"
+
+#include "nnue/layers/affine_transform.h"
+
+#include "thread.h"
+
 #include <random>

-namespace Eval {
+// Specialization of NNUE evaluation function learning class template for AffineTransform
+namespace Eval::NNUE {

-namespace NNUE {
+    // Learning: Affine transformation layer
+    template <typename PreviousLayer, IndexType OutputDimensions>
+    class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
+    private:
+        // Type of layer to learn
+        using LayerType = Layers::AffineTransform<PreviousLayer, OutputDimensions>;

-// Learning: Affine transformation layer
-template <typename PreviousLayer, IndexType OutputDimensions>
-class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
- private:
-  // Type of layer to learn
-  using LayerType = Layers::AffineTransform<PreviousLayer, OutputDimensions>;
+    public:
+        // factory function
+        static std::shared_ptr<Trainer> create(
+            LayerType* target_layer, FeatureTransformer* ft) {

- public:
-  // factory function
-  static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* feature_transformer) {
-    return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, feature_transformer));
-  }
-
-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    previous_layer_trainer_->SendMessage(message);
-    if (ReceiveMessage("momentum", message)) {
-      momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
-    }
-    if (ReceiveMessage("learning_rate_scale", message)) {
-      learning_rate_scale_ =
-          static_cast<LearnFloatType>(std::stod(message->value));
-    }
-    if (ReceiveMessage("reset", message)) {
-      DequantizeParameters();
-    }
-    if (ReceiveMessage("quantize_parameters", message)) {
-      QuantizeParameters();
-    }
-  }
-
-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    previous_layer_trainer_->Initialize(rng);
-    if (kIsOutputLayer) {
-      // Initialize output layer with 0
-      std::fill(std::begin(biases_), std::end(biases_),
-                static_cast<LearnFloatType>(0.0));
-      std::fill(std::begin(weights_), std::end(weights_),
-                static_cast<LearnFloatType>(0.0));
-    } else {
-      // Assuming that the input distribution is unit-mean 0.5, equal variance,
-      // Initialize the output distribution so that each unit has a mean of 0.5 and the same variance as the input
-      const double kSigma = 1.0 / std::sqrt(kInputDimensions);
-      auto distribution = std::normal_distribution<double>(0.0, kSigma);
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        double sum = 0.0;
-        for (IndexType j = 0; j < kInputDimensions; ++j) {
-          const auto weight = static_cast<LearnFloatType>(distribution(rng));
-          weights_[kInputDimensions * i + j] = weight;
-          sum += weight;
+            return std::shared_ptr<Trainer>(
+                new Trainer(target_layer, ft));
        }
-        biases_[i] = static_cast<LearnFloatType>(0.5 - 0.5 * sum);
-      }
-    }
-    QuantizeParameters();
-  }

-  // forward propagation
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    if (output_.size() < kOutputDimensions * batch.size()) {
-      output_.resize(kOutputDimensions * batch.size());
-      gradients_.resize(kInputDimensions * batch.size());
-    }
-    batch_size_ = static_cast<IndexType>(batch.size());
-    batch_input_ = previous_layer_trainer_->Propagate(batch);
+        // Forward the message to the previous layer's trainer first, then apply
+        // every option (hyperparameter or command) this trainer recognises
+        void send_message(Message* message) {
+            previous_layer_trainer_->send_message(message);
+
+            // Reads the message payload as a floating point hyperparameter
+            const auto parse_value = [message]() {
+                return static_cast<LearnFloatType>(std::stod(message->value));
+            };
+
+            if (receive_message("momentum", message))
+                momentum_ = parse_value();
+
+            if (receive_message("learning_rate_scale", message))
+                learning_rate_scale_ = parse_value();
+
+            if (receive_message("reset", message))
+                dequantize_parameters();
+
+            if (receive_message("quantize_parameters", message))
+                quantize_parameters();
+
+            if (receive_message("check_health", message))
+                check_health();
+        }
+
+        // Initialize the parameters with random numbers
+        // (the previous layer's trainer is initialized first, with the same RNG)
+        template <typename RNG>
+        void initialize(RNG& rng) {
+            previous_layer_trainer_->initialize(rng);
+
+            if (!kIsOutputLayer) {
+                // Assuming each input unit has mean 0.5 and a common variance,
+                // draw the weights from a zero-mean normal distribution with
+                // standard deviation 1 / sqrt(kInputDimensions) and set each bias
+                // to 0.5 - 0.5 * (sum of that output's weights), so that each
+                // output unit is intended to keep a mean of 0.5
+                const double sigma = 1.0 / std::sqrt(kInputDimensions);
+                std::normal_distribution<double> gaussian(0.0, sigma);
+
+                for (IndexType row = 0; row < kOutputDimensions; ++row) {
+                    LearnFloatType* const row_weights = &weights_[kInputDimensions * row];
+                    double row_sum = 0.0;
+
+                    for (IndexType col = 0; col < kInputDimensions; ++col) {
+                        row_weights[col] = static_cast<LearnFloatType>(gaussian(rng));
+                        row_sum += row_weights[col];
+                    }
+
+                    biases_[row] = static_cast<LearnFloatType>(0.5 - 0.5 * row_sum);
+                }
+            }
+            else {
+                // Output layer: start from all-zero biases and weights
+                std::fill(std::begin(biases_), std::end(biases_),
+                          static_cast<LearnFloatType>(0.0));
+                std::fill(std::begin(weights_), std::end(weights_),
+                          static_cast<LearnFloatType>(0.0));
+            }
+
+            // Publish the freshly initialized parameters to the target layer
+            quantize_parameters();
+        }
+
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
+        {
+            const auto size = batch_end - batch_begin;
+
+            if ((long)output_.size() < (long)kOutputDimensions * size) {
+                output_.resize(kOutputDimensions * size);
+                gradients_.resize(kInputDimensions * size);
+            }
+
+            if (thread_states_.size() < thread_pool.size())
+            {
+                thread_states_.resize(thread_pool.size());
+            }
+
+            combined_batch_size_ = size;
+            combined_batch_input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);
+
+            auto& main_thread_state = thread_states_[0];
+
 #if defined(USE_BLAS)
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      cblas_scopy(kOutputDimensions, biases_, 1, &output_[batch_offset], 1);
-    }
-    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-                kOutputDimensions, batch_size_, kInputDimensions, 1.0,
-                weights_, kInputDimensions,
-                batch_input_, kInputDimensions,
-                1.0, &output_[0], kOutputDimensions);
-#else
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType input_batch_offset = kInputDimensions * b;
-      const IndexType output_batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        double sum = biases_[i];
-        for (IndexType j = 0; j < kInputDimensions; ++j) {
-          const IndexType index = kInputDimensions * i + j;
-          sum += weights_[index] * batch_input_[input_batch_offset + j];
-        }
-        output_[output_batch_offset + i] = static_cast<LearnFloatType>(sum);
-      }
-    }
-#endif
-    return output_.data();
-  }

-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    const LearnFloatType local_learning_rate =
-        learning_rate * learning_rate_scale_;
+            // update: scale the main thread's accumulated bias gradient by the momentum factor
+            cblas_sscal(
+                kOutputDimensions, momentum_, main_thread_state.biases_diff_, 1
+            );
+
+#else
+
+            Blas::sscal(
+                kOutputDimensions, momentum_, main_thread_state.biases_diff_, 1
+            );
+
+#endif
+
+            for (IndexType i = 1; i < thread_states_.size(); ++i)
+                thread_states_[i].reset_biases();
+
+            return output_.data();
+        }
+
+        // forward propagation
+        void propagate(Thread& th, const uint64_t offset, const uint64_t count) {
+
+            previous_layer_trainer_->propagate(th, offset, count);
+
 #if defined(USE_BLAS)
-    // backpropagate
-    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
-                kInputDimensions, batch_size_, kOutputDimensions, 1.0,
+
+            for (IndexType b = offset; b < offset + count; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                cblas_scopy(
+                    kOutputDimensions, biases_, 1, &output_[batch_offset], 1
+                );
+            }
+
+            cblas_sgemm(
+                CblasColMajor, CblasTrans, CblasNoTrans,
+                kOutputDimensions, count, kInputDimensions,
+                1.0,
                weights_, kInputDimensions,
-                gradients, kOutputDimensions,
-                0.0, &gradients_[0], kInputDimensions);
-    // update
-    cblas_sscal(kOutputDimensions, momentum_, biases_diff_, 1);
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      cblas_saxpy(kOutputDimensions, 1.0,
-                  &gradients[batch_offset], 1, biases_diff_, 1);
-    }
-    cblas_saxpy(kOutputDimensions, -local_learning_rate,
-                biases_diff_, 1, biases_, 1);
-    cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
-                kOutputDimensions, kInputDimensions, batch_size_, 1.0,
-                gradients, kOutputDimensions,
-                batch_input_, kInputDimensions,
-                momentum_, weights_diff_, kInputDimensions);
-    cblas_saxpy(kOutputDimensions * kInputDimensions, -local_learning_rate,
-                weights_diff_, 1, weights_, 1);
+                combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
+                1.0,
+                &output_[offset * kOutputDimensions], kOutputDimensions
+            );
 #else
-    // backpropagate
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType input_batch_offset = kInputDimensions * b;
-      const IndexType output_batch_offset = kOutputDimensions * b;
-      for (IndexType j = 0; j < kInputDimensions; ++j) {
-        double sum = 0.0;
-        for (IndexType i = 0; i < kOutputDimensions; ++i) {
-          const IndexType index = kInputDimensions * i + j;
-          sum += weights_[index] * gradients[output_batch_offset + i];
-        }
-        gradients_[input_batch_offset + j] = static_cast<LearnFloatType>(sum);
-      }
-    }
-    // update
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      biases_diff_[i] *= momentum_;
-    }
-    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-      weights_diff_[i] *= momentum_;
-    }
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType input_batch_offset = kInputDimensions * b;
-      const IndexType output_batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        biases_diff_[i] += gradients[output_batch_offset + i];
-      }
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        for (IndexType j = 0; j < kInputDimensions; ++j) {
-          const IndexType index = kInputDimensions * i + j;
-          weights_diff_[index] += gradients[output_batch_offset + i] *
-              batch_input_[input_batch_offset + j];
-        }
-      }
-    }
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      biases_[i] -= local_learning_rate * biases_diff_[i];
-    }
-    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-      weights_[i] -= local_learning_rate * weights_diff_[i];
-    }
-#endif
-    previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
-  }

- private:
-  // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
-      batch_size_(0),
-      batch_input_(nullptr),
-      previous_layer_trainer_(Trainer<PreviousLayer>::Create(
-          &target_layer->previous_layer_, feature_transformer)),
-      target_layer_(target_layer),
-      biases_(),
-      weights_(),
-      biases_diff_(),
-      weights_diff_(),
-      momentum_(0.0),
-      learning_rate_scale_(1.0) {
-    DequantizeParameters();
-  }
+            for (IndexType b = offset; b < offset + count; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                Blas::scopy(
+                    kOutputDimensions, biases_, 1, &output_[batch_offset], 1
+                );
+            }

-  // Weight saturation and parameterization
-  void QuantizeParameters() {
-    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-      weights_[i] = std::max(-kMaxWeightMagnitude,
-                             std::min(+kMaxWeightMagnitude, weights_[i]));
-    }
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      target_layer_->biases_[i] =
-          Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
-    }
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      const auto offset = kInputDimensions * i;
-      const auto padded_offset = LayerType::kPaddedInputDimensions * i;
-      for (IndexType j = 0; j < kInputDimensions; ++j) {
-        target_layer_->weights_[padded_offset + j] =
-            Round<typename LayerType::WeightType>(
-                weights_[offset + j] * kWeightScale);
-      }
-    }
-  }
-
-  // read parameterized integer
-  void DequantizeParameters() {
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      biases_[i] = static_cast<LearnFloatType>(
-          target_layer_->biases_[i] / kBiasScale);
-    }
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      const auto offset = kInputDimensions * i;
-      const auto padded_offset = LayerType::kPaddedInputDimensions * i;
-      for (IndexType j = 0; j < kInputDimensions; ++j) {
-        weights_[offset + j] = static_cast<LearnFloatType>(
-            target_layer_->weights_[padded_offset + j] / kWeightScale);
-      }
-    }
-    std::fill(std::begin(biases_diff_), std::end(biases_diff_),
-              static_cast<LearnFloatType>(0.0));
-    std::fill(std::begin(weights_diff_), std::end(weights_diff_),
-              static_cast<LearnFloatType>(0.0));
-  }
-
-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions = LayerType::kInputDimensions;
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
-
-  // If the output dimensionality is 1, the output layer
-  static constexpr bool kIsOutputLayer = kOutputDimensions == 1;
-
-  // Coefficient used for parameterization
-  static constexpr LearnFloatType kActivationScale =
-      std::numeric_limits<std::int8_t>::max();
-  static constexpr LearnFloatType kBiasScale = kIsOutputLayer ?
-      (kPonanzaConstant * FV_SCALE) :
-      ((1 << kWeightScaleBits) * kActivationScale);
-  static constexpr LearnFloatType kWeightScale = kBiasScale / kActivationScale;
-
-  // Upper limit of absolute value of weight used to prevent overflow when parameterizing integers
-  static constexpr LearnFloatType kMaxWeightMagnitude =
-      std::numeric_limits<typename LayerType::WeightType>::max() / kWeightScale;
-
-  // number of samples in mini-batch
-  IndexType batch_size_;
-
-  // Input mini batch
-  const LearnFloatType* batch_input_;
-
-  // Trainer of the previous layer
-  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
-
-  // layer to learn
-  LayerType* const target_layer_;
-
-  // parameter
-  LearnFloatType biases_[kOutputDimensions];
-  LearnFloatType weights_[kOutputDimensions * kInputDimensions];
-
-  // Buffer used for updating parameters
-  LearnFloatType biases_diff_[kOutputDimensions];
-  LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
-
-  // Forward propagation buffer
-  std::vector<LearnFloatType> output_;
-
-  // buffer for back propagation
-  std::vector<LearnFloatType> gradients_;
-
-  // hyper parameter
-  LearnFloatType momentum_;
-  LearnFloatType learning_rate_scale_;
-};
-
-}  // namespace NNUE
-
-}  // namespace Eval
-
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+            Blas::sgemm(
+                Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
+                kOutputDimensions, count, kInputDimensions,
+                1.0,
+                weights_, kInputDimensions,
+                combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
+                1.0,
+                &output_[offset * kOutputDimensions], kOutputDimensions
+            );
+
+#endif
+        }
+
+        // backpropagation: for samples [offset, offset + count), compute the gradients with respect to this layer's input and accumulate the calling thread's parameter gradients
+        void backpropagate(Thread& th,
+                           const LearnFloatType* gradients,
+                           uint64_t offset,
+                           uint64_t count) {
+
+            auto& thread_state = thread_states_[th.thread_idx()];  // accumulators selected by the calling thread's index
+            const auto momentum = th.thread_idx() == 0 ? momentum_ : 0.0f;  // passed as beta to the weights_diff_ sgemm below: thread 0 keeps momentum_ * its previous weights_diff_, other threads overwrite theirs
+#if defined(USE_BLAS)
+
+            cblas_sgemm(  // input gradients for this slice, written into gradients_ (beta = 0.0 discards the old contents)
+                CblasColMajor, CblasNoTrans, CblasNoTrans,
+                kInputDimensions, count, kOutputDimensions,
+                1.0,
+                weights_, kInputDimensions,
+                gradients + offset * kOutputDimensions, kOutputDimensions,
+                0.0,
+                &gradients_[offset * kInputDimensions], kInputDimensions
+            );
+
+            for (IndexType b = offset; b < offset + count; ++b) {  // add each sample's output gradient vector to this thread's bias accumulator
+                const IndexType batch_offset = kOutputDimensions * b;
+                cblas_saxpy(
+                    kOutputDimensions, 1.0,
+                    &gradients[batch_offset], 1, thread_state.biases_diff_, 1
+                );
+            }
+
+            cblas_sgemm(  // weights_diff_ = momentum * weights_diff_ + (product of this slice's gradients and inputs)
+                CblasRowMajor, CblasTrans, CblasNoTrans,
+                kOutputDimensions, kInputDimensions, count,
+                1.0,
+                gradients + offset * kOutputDimensions, kOutputDimensions,
+                combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
+                momentum,
+                thread_state.weights_diff_, kInputDimensions
+            );
+
+#else
+
+            // backpropagate: same three steps as the USE_BLAS branch, using the Blas:: functions instead of cblas
+            Blas::sgemm(
+                Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::NoTrans, Blas::MatrixTranspose::NoTrans,
+                kInputDimensions, count, kOutputDimensions,
+                1.0,
+                weights_, kInputDimensions,
+                gradients + offset * kOutputDimensions, kOutputDimensions,
+                0.0,
+                &gradients_[offset * kInputDimensions], kInputDimensions
+            );
+
+            for (IndexType b = offset; b < offset + count; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                Blas::saxpy(kOutputDimensions, 1.0,
+                          &gradients[batch_offset], 1, thread_state.biases_diff_, 1);
+            }
+
+            Blas::sgemm(
+                Blas::MatrixLayout::RowMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
+                kOutputDimensions, kInputDimensions, count,
+                1.0,
+                gradients + offset * kOutputDimensions, kOutputDimensions,
+                combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
+                momentum,
+                thread_state.weights_diff_, kInputDimensions
+            );
+
+#endif
+
+            previous_layer_trainer_->backpropagate(th, gradients_.data(), offset, count);  // the whole gradients_ buffer and the same slice bounds go to the previous layer; NOTE(review): beta = momentum assumes one call per thread per step -- confirm with the caller
+        }
+
+        // Sum the accumulators of every worker state into the first state
+        void reduce_thread_state()
+        {
+            const std::size_t num_states = thread_states_.size();
+
+            for (std::size_t worker = 1; worker < num_states; ++worker)
+                thread_states_[0] += thread_states_[worker];
+        }
+
+        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate)  // apply the accumulated gradients to biases_ and weights_, then call step_end on the previous layer's trainer
+        {
+            const LearnFloatType local_learning_rate =
+                learning_rate * learning_rate_scale_;  // rate used by this layer only
+
+            reduce_thread_state();  // sums every thread's accumulators into thread_states_[0]
+
+            auto& main_thread_state = thread_states_[0];
+
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                const double d = local_learning_rate * main_thread_state.biases_diff_[i];
+                biases_[i] -= d;  // parameter step
+                abs_biases_diff_sum_ += std::abs(d);  // statistic printed by check_health()
+            }
+            num_biases_diffs_ += kOutputDimensions;
+
+            for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
+                const double d = local_learning_rate * main_thread_state.weights_diff_[i];
+                weights_[i] -= d;  // parameter step
+                abs_weights_diff_sum_ += std::abs(d);  // statistic printed by check_health()
+            }
+            num_weights_diffs_ += kOutputDimensions * kInputDimensions;
+
+            previous_layer_trainer_->step_end(thread_pool, learning_rate);  // the unscaled learning_rate is passed on
+        }
+
+    private:
+        // constructor: also creates the trainer of the previous layer through Trainer<PreviousLayer>::create()
+        Trainer(LayerType* target_layer, FeatureTransformer* ft) :
+            combined_batch_size_(0),
+            combined_batch_input_(nullptr),
+            previous_layer_trainer_(Trainer<PreviousLayer>::create(
+                &target_layer->previous_layer_, ft)),
+            target_layer_(target_layer),
+            biases_(),
+            weights_(),
+            momentum_(0.2),  // initial value; replaced when a "momentum" message is received
+            learning_rate_scale_(1.0) {  // initial value; replaced when a "learning_rate_scale" message is received
+
+            dequantize_parameters();  // fills biases_ and weights_ from target_layer_ and resets the statistics
+        }
+
+        // Clear the running update statistics that check_health() reports
+        void reset_stats() {
+            abs_biases_diff_sum_ = abs_weights_diff_sum_ = 0.0;
+            num_biases_diffs_ = num_weights_diffs_ = 0;
+        }
+
+        // Print a summary of this layer's parameters and recent updates, then
+        // clear the update statistics.
+        //
+        // Reported values: mean |bias|, mean |weight|, and the mean absolute
+        // per-parameter update accumulated by step_end() since the last
+        // reset_stats(). When no update has been recorded in that window the
+        // two *_diff averages are reported as 0 instead of dividing by a zero
+        // count (which printed NaN).
+        void check_health() {
+
+            double abs_bias_sum = 0.0;
+            double abs_weight_sum = 0.0;
+
+            for(auto b : biases_)
+                abs_bias_sum += std::abs(b);
+
+            for(auto w : weights_)
+                abs_weight_sum += std::abs(w);
+
+            // The counters are zero right after reset_stats(); guard the divisions
+            const double avg_abs_bias_diff =
+                num_biases_diffs_ != 0 ? abs_biases_diff_sum_ / num_biases_diffs_ : 0.0;
+            const double avg_abs_weight_diff =
+                num_weights_diffs_ != 0 ? abs_weights_diff_sum_ / num_weights_diffs_ : 0.0;
+
+            auto out = sync_region_cout.new_region();
+
+            out << "INFO (check_health):"
+                << " layer " << LayerType::kLayerIndex
+                << " - " << LayerType::get_name()
+                << std::endl;
+
+            out << "  - avg_abs_bias        = " << abs_bias_sum / std::size(biases_) << std::endl;
+            out << "  - avg_abs_bias_diff   = " << avg_abs_bias_diff << std::endl;
+            out << "  - avg_abs_weight      = " << abs_weight_sum / std::size(weights_) << std::endl;
+            out << "  - avg_abs_weight_diff = " << avg_abs_weight_diff << std::endl;
+
+            out.unlock();
+
+            reset_stats();
+        }
+
+        // Weight saturation and parameterization: clamp the float weights in place, then store rounded, scaled copies of the biases and weights in the target layer
+        void quantize_parameters() {
+            for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
+                weights_[i] = std::max(-kMaxWeightMagnitude,  // clamps to [-kMaxWeightMagnitude, +kMaxWeightMagnitude]
+                                       std::min(+kMaxWeightMagnitude, weights_[i]));
+            }
+
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                target_layer_->biases_[i] =
+                    round<typename LayerType::BiasType>(biases_[i] * kBiasScale);  // biases are scaled and rounded, with no clamp applied here
+            }
+
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                const auto offset = kInputDimensions * i;  // row start in the float weights_
+                const auto padded_offset = LayerType::kPaddedInputDimensions * i;  // row start in the target layer, which uses the padded stride
+                for (IndexType j = 0; j < kInputDimensions; ++j) {
+                    target_layer_->weights_[padded_offset + j] =
+                        round<typename LayerType::WeightType>(
+                            weights_[offset + j] * kWeightScale);
+                }
+            }
+        }
+
+        // Rebuild the floating point parameters from the quantized values stored
+        // in the target layer, then clear every per-thread accumulator and the
+        // update statistics
+        void dequantize_parameters() {
+            for (IndexType row = 0; row < kOutputDimensions; ++row) {
+                biases_[row] = static_cast<LearnFloatType>(
+                    target_layer_->biases_[row] / kBiasScale);
+
+                // Target-layer rows use the padded stride, the float rows the dense one
+                const auto src_base = LayerType::kPaddedInputDimensions * row;
+                const auto dst_base = kInputDimensions * row;
+
+                for (IndexType col = 0; col < kInputDimensions; ++col) {
+                    const auto quantized = target_layer_->weights_[src_base + col];
+                    weights_[dst_base + col] =
+                        static_cast<LearnFloatType>(quantized / kWeightScale);
+                }
+            }
+
+            for (auto& state : thread_states_) {
+                state.reset_weights();
+                state.reset_biases();
+            }
+
+            reset_stats();
+        }
+
+        // number of input/output dimensions (taken from the layer type)
+        static constexpr IndexType kInputDimensions = LayerType::kInputDimensions;
+        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+
+        // If the output dimensionality is 1, the layer is treated as the output layer
+        static constexpr bool kIsOutputLayer = kOutputDimensions == 1;
+
+        // Coefficients used for parameterization (conversion between float and integer parameters)
+        static constexpr LearnFloatType kActivationScale =
+            std::numeric_limits<std::int8_t>::max();  // largest int8_t value
+
+        static constexpr LearnFloatType kBiasScale = kIsOutputLayer ?
+            (kPonanzaConstant * FV_SCALE) :  // output layer
+            ((1 << kWeightScaleBits) * kActivationScale);  // every other layer
+
+        static constexpr LearnFloatType kWeightScale = kBiasScale / kActivationScale;  // multiplier applied to float weights in quantize_parameters()
+
+        // Upper limit of absolute value of weight used to prevent overflow when parameterizing integers
+        static constexpr LearnFloatType kMaxWeightMagnitude =
+            std::numeric_limits<typename LayerType::WeightType>::max() / kWeightScale;  // largest float weight whose scaled value still fits WeightType
+
+        // number of samples in mini-batch (set by step_start())
+        IndexType combined_batch_size_;
+
+        double abs_biases_diff_sum_;  // sum of |bias update| accumulated by step_end(); cleared by reset_stats()
+        double abs_weights_diff_sum_;  // sum of |weight update| accumulated by step_end(); cleared by reset_stats()
+        uint64_t num_biases_diffs_;  // number of bias updates counted into abs_biases_diff_sum_
+        uint64_t num_weights_diffs_;  // number of weight updates counted into abs_weights_diff_sum_
+
+        // Input mini batch: pointer returned by the previous layer trainer's step_start()
+        const LearnFloatType* combined_batch_input_;
+
+        // Trainer of the previous layer
+        const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+
+        // layer to learn (receives the quantized parameters)
+        LayerType* const target_layer_;
+        // parameter
+        // Gradient accumulators kept separately for each thread
+        struct alignas(kCacheLineSize) ThreadState
+        {
+            // Buffer used for updating parameters
+            alignas(kCacheLineSize) LearnFloatType biases_diff_[kOutputDimensions];
+            alignas(kCacheLineSize) LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
+
+            // Both buffers start out zeroed
+            ThreadState() { reset_weights(); reset_biases(); }
+
+            // Element-wise accumulation of another state's buffers into this one
+            ThreadState& operator+=(const ThreadState& other)
+            {
+                const LearnFloatType* bias_src = other.biases_diff_;
+                for (LearnFloatType& bias_acc : biases_diff_)
+                    bias_acc += *bias_src++;
+
+                const LearnFloatType* weight_src = other.weights_diff_;
+                for (LearnFloatType& weight_acc : weights_diff_)
+                    weight_acc += *weight_src++;
+
+                return *this;
+            }
+
+            // Zero the weight accumulator
+            void reset_weights()
+            {
+                for (LearnFloatType& weight_acc : weights_diff_)
+                    weight_acc = 0.0f;
+            }
+
+            // Zero the bias accumulator
+            void reset_biases()
+            {
+                for (LearnFloatType& bias_acc : biases_diff_)
+                    bias_acc = 0.0f;
+            }
+        };
+
+        alignas(kCacheLineSize) LearnFloatType biases_[kOutputDimensions];  // float biases, one per output
+        alignas(kCacheLineSize) LearnFloatType weights_[kOutputDimensions * kInputDimensions];  // float weights, indexed as kInputDimensions * output + input
+
+        std::vector<ThreadState, CacheLineAlignedAllocator<ThreadState>> thread_states_;  // one accumulator set per thread; grown in step_start() to the thread pool size
+
+        // Forward propagation buffer (kOutputDimensions values per sample)
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
+
+        // buffer for back propagation (kInputDimensions values per sample)
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
+
+        // hyper parameters (settable through send_message())
+        LearnFloatType momentum_;
+        LearnFloatType learning_rate_scale_;
+    };
+
+}  // namespace Eval::NNUE

 #endif
@@ -1,142 +1,356 @@
-// Specialization of NNUE evaluation function learning class template for ClippedReLU
-
-#ifndef _NNUE_TRAINER_CLIPPED_RELU_H_
+#ifndef _NNUE_TRAINER_CLIPPED_RELU_H_
 #define _NNUE_TRAINER_CLIPPED_RELU_H_

-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
-
-#include "../../learn/learn.h"
-#include "../layers/clipped_relu.h"
 #include "trainer.h"

-namespace Eval {
+#include "learn/learn.h"

-namespace NNUE {
+#include "nnue/layers/clipped_relu.h"

-// Learning: Affine transformation layer
-template <typename PreviousLayer>
-class Trainer<Layers::ClippedReLU<PreviousLayer>> {
- private:
-  // Type of layer to learn
-  using LayerType = Layers::ClippedReLU<PreviousLayer>;
+#include "thread.h"

- public:
-  // factory function
-  static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* feature_transformer) {
-    return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, feature_transformer));
-  }
+// Specialization of NNUE evaluation function learning class template for ClippedReLU
+namespace Eval::NNUE {

-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    previous_layer_trainer_->SendMessage(message);
-    if (ReceiveMessage("check_health", message)) {
-      CheckHealth();
-    }
-  }
+    // Learning: Clipped ReLU activation layer
+    template <typename PreviousLayer>
+    class Trainer<Layers::ClippedReLU<PreviousLayer>> {
+    private:
+        // Type of layer to learn
+        using LayerType = Layers::ClippedReLU<PreviousLayer>;

-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    previous_layer_trainer_->Initialize(rng);
-  }
+    public:
+        // Factory function: the constructor is private, so this is how a Trainer for target_layer is built; it is returned in a std::shared_ptr.
+        static std::shared_ptr<Trainer> create(
+            LayerType* target_layer, FeatureTransformer* ft) {

-  // forward propagation
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    if (output_.size() < kOutputDimensions * batch.size()) {
-      output_.resize(kOutputDimensions * batch.size());
-      gradients_.resize(kInputDimensions * batch.size());
-    }
-    const auto input = previous_layer_trainer_->Propagate(batch);
-    batch_size_ = static_cast<IndexType>(batch.size());
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        const IndexType index = batch_offset + i;
-        output_[index] = std::max(+kZero, std::min(+kOne, input[index]));
-        min_activations_[i] = std::min(min_activations_[i], output_[index]);
-        max_activations_[i] = std::max(max_activations_[i], output_[index]);
-      }
-    }
-    return output_.data();
-  }
+            return std::shared_ptr<Trainer>(
+                new Trainer(target_layer, ft));  // plain new: the private constructor is only reachable from inside the class
+        }

-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        const IndexType index = batch_offset + i;
-        gradients_[index] = gradients[index] *
-            (output_[index] > kZero) * (output_[index] < kOne);
-      }
-    }
-    previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
-  }
+        // Relays the message to the previous layer's trainer first, then runs
+        // check_health() if receive_message() accepts "check_health" for it.
+        void send_message(Message* message)
+        {
+            previous_layer_trainer_->send_message(message);
+
+            if (!receive_message("check_health", message))
+                return;
+
+            check_health();
+        }

- private:
-  // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
-      batch_size_(0),
-      previous_layer_trainer_(Trainer<PreviousLayer>::Create(
-          &target_layer->previous_layer_, feature_transformer)),
-      target_layer_(target_layer) {
-    std::fill(std::begin(min_activations_), std::end(min_activations_),
-              std::numeric_limits<LearnFloatType>::max());
-    std::fill(std::begin(max_activations_), std::end(max_activations_),
-              std::numeric_limits<LearnFloatType>::lowest());
-  }
+        // Nothing is initialized in this trainer itself; the random number
+        // generator is handed on to the previous layer's trainer.
+        template <typename RNG>
+        void initialize(RNG& rng)
+        {
+            previous_layer_trainer_->initialize(rng);
+        }

-  // Check if there are any problems with learning
-  void CheckHealth() {
-    const auto largest_min_activation = *std::max_element(
-        std::begin(min_activations_), std::end(min_activations_));
-    const auto smallest_max_activation = *std::min_element(
-        std::begin(max_activations_), std::end(max_activations_));
-    std::cout << "INFO: largest min activation = " << largest_min_activation
-              << ", smallest max activation = " << smallest_max_activation
-              << std::endl;
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
+        {   // Per-batch setup: sizes the buffers and thread slots, starts the previous layer's step, and returns this layer's output buffer.
+            const auto size = batch_end - batch_begin;  // number of examples in [batch_begin, batch_end)

-    std::fill(std::begin(min_activations_), std::end(min_activations_),
-              std::numeric_limits<LearnFloatType>::max());
-    std::fill(std::begin(max_activations_), std::end(max_activations_),
-              std::numeric_limits<LearnFloatType>::lowest());
-  }
+            if ((long)output_.size() < (long)kOutputDimensions * size) {  // buffers are only ever grown here
+              output_.resize(kOutputDimensions * size);
+              gradients_.resize(kInputDimensions * size);  // resized under the same test; kInputDimensions and kOutputDimensions are the same constant
+            }

-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions = LayerType::kOutputDimensions;
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+            if (thread_states_.size() < thread_pool.size())  // propagate()/backpropagate() index this vector by th.thread_idx()
+            {
+                thread_states_.resize(thread_pool.size());
+            }

-  // LearnFloatType constant
-  static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
-  static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
+            input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);  // pointer returned by the previous layer; propagate() reads it as this layer's input

-  // number of samples in mini-batch
-  IndexType batch_size_;
+            batch_size_ = size;

-  // Trainer of the previous layer
-  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+            return output_.data();  // the buffer that propagate() writes into
+        }

-  // layer to learn
-  LayerType* const target_layer_;
+        // Forward propagation for examples [offset, offset + count): runs the previous layer, clamps its output to [kZero, kOne] into output_, and updates the calling thread's min/max activation statistics.
+        void propagate(Thread& th, const uint64_t offset, const uint64_t count) {

-  // Forward propagation buffer
-  std::vector<LearnFloatType> output_;
+            auto& thread_state = thread_states_[th.thread_idx()];  // statistics slot selected by the calling thread's index

-  // buffer for back propagation
-  std::vector<LearnFloatType> gradients_;
+            previous_layer_trainer_->propagate(th, offset, count);  // presumably fills the buffer input_ points to for this range - confirm in the previous layer's trainer

-  // Health check statistics
-  LearnFloatType min_activations_[kOutputDimensions];
-  LearnFloatType max_activations_[kOutputDimensions];
-};
+#if defined (USE_SSE2)

-}  // namespace NNUE
+            {
+                static_assert(kOutputDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");

-}  // namespace Eval
+                const __m128 kZero4 = _mm_set1_ps(+kZero);
+                const __m128 kOne4 = _mm_set1_ps(+kOne);

-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+                for (IndexType b = offset; b < offset + count; ++b)
+                {
+                    const IndexType batch_offset = kOutputDimensions * b;  // start of example b inside input_ / output_
+
+                    for (IndexType i = 0; i < kOutputDimensions; i += 16)  // 4 registers x 4 floats per iteration
+                    {
+                        __m128 out0 = _mm_loadu_ps(&input_[i + 0 + batch_offset]);
+                        __m128 out1 = _mm_loadu_ps(&input_[i + 4 + batch_offset]);
+                        __m128 out2 = _mm_loadu_ps(&input_[i + 8 + batch_offset]);
+                        __m128 out3 = _mm_loadu_ps(&input_[i + 12 + batch_offset]);
+
+                        out0 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out0));  // clamp to [kZero, kOne]
+                        out1 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out1));
+                        out2 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out2));
+                        out3 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out3));
+
+                        _mm_storeu_ps(&output_[i + 0 + batch_offset], out0);
+                        _mm_storeu_ps(&output_[i + 4 + batch_offset], out1);
+                        _mm_storeu_ps(&output_[i + 8 + batch_offset], out2);
+                        _mm_storeu_ps(&output_[i + 12 + batch_offset], out3);
+
+                        __m128 minact0 = _mm_loadu_ps(&thread_state.min_activations_[i + 0]);
+                        __m128 minact1 = _mm_loadu_ps(&thread_state.min_activations_[i + 4]);
+                        __m128 minact2 = _mm_loadu_ps(&thread_state.min_activations_[i + 8]);
+                        __m128 minact3 = _mm_loadu_ps(&thread_state.min_activations_[i + 12]);
+
+                        __m128 maxact0 = _mm_loadu_ps(&thread_state.max_activations_[i + 0]);
+                        __m128 maxact1 = _mm_loadu_ps(&thread_state.max_activations_[i + 4]);
+                        __m128 maxact2 = _mm_loadu_ps(&thread_state.max_activations_[i + 8]);
+                        __m128 maxact3 = _mm_loadu_ps(&thread_state.max_activations_[i + 12]);
+
+                        minact0 = _mm_min_ps(out0, minact0);  // running per-unit minimum of the clamped outputs
+                        minact1 = _mm_min_ps(out1, minact1);
+                        minact2 = _mm_min_ps(out2, minact2);
+                        minact3 = _mm_min_ps(out3, minact3);
+
+                        maxact0 = _mm_max_ps(out0, maxact0);  // running per-unit maximum of the clamped outputs
+                        maxact1 = _mm_max_ps(out1, maxact1);
+                        maxact2 = _mm_max_ps(out2, maxact2);
+                        maxact3 = _mm_max_ps(out3, maxact3);
+
+                        _mm_storeu_ps(&thread_state.min_activations_[i + 0], minact0);
+                        _mm_storeu_ps(&thread_state.min_activations_[i + 4], minact1);
+                        _mm_storeu_ps(&thread_state.min_activations_[i + 8], minact2);
+                        _mm_storeu_ps(&thread_state.min_activations_[i + 12], minact3);
+
+                        _mm_storeu_ps(&thread_state.max_activations_[i + 0], maxact0);
+                        _mm_storeu_ps(&thread_state.max_activations_[i + 4], maxact1);
+                        _mm_storeu_ps(&thread_state.max_activations_[i + 8], maxact2);
+                        _mm_storeu_ps(&thread_state.max_activations_[i + 12], maxact3);
+                    }
+                }
+            }
+
+#else
+
+            for (IndexType b = offset; b < offset + count; ++b) {  // scalar fallback: same clamp and statistics, one float at a time
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    const IndexType index = batch_offset + i;
+                    output_[index] = std::max(+kZero, std::min(+kOne, input_[index]));
+                    thread_state.min_activations_[i] = std::min(thread_state.min_activations_[i], output_[index]);
+                    thread_state.max_activations_[i] = std::max(thread_state.max_activations_[i], output_[index]);
+                }
+            }
+
+#endif
+        }
+
+        // Backpropagation for examples [offset, offset + count).
+        //
+        // Copies the incoming `gradients` into gradients_, writing zero for every
+        // unit whose stored output is <= kZero or >= kOne, counts those clipped
+        // units in the calling thread's statistics slot, and then passes
+        // gradients_ on to the previous layer's trainer.
+        //
+        //   th        - calling thread; th.thread_idx() selects the statistics slot
+        //   gradients - incoming gradients, indexed like output_
+        //   offset    - index of the first example to process
+        //   count     - number of examples to process
+        void backpropagate(Thread& th,
+                           const LearnFloatType* gradients,
+                           const uint64_t offset,
+                           const uint64_t count) {
+
+            auto& thread_state = thread_states_[th.thread_idx()];
+
+#if defined (USE_SSE2)
+
+            {
+                static_assert(kOutputDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
+
+                const __m128 kZero4 = _mm_set1_ps(+kZero);
+                const __m128 kOne4 = _mm_set1_ps(+kOne);
+
+                for (IndexType b = offset; b < offset + count; ++b)
+                {
+                    const IndexType batch_offset = kOutputDimensions * b;
+
+                    for (IndexType i = 0; i < kOutputDimensions; i += 16)
+                    {
+                        __m128 out0 = _mm_loadu_ps(&output_[batch_offset + i + 0]);
+                        __m128 out1 = _mm_loadu_ps(&output_[batch_offset + i + 4]);
+                        __m128 out2 = _mm_loadu_ps(&output_[batch_offset + i + 8]);
+                        __m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]);
+
+                        // All-ones lanes where the output sits on either clamp bound.
+                        __m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4));
+                        __m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4));
+                        __m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4));
+                        __m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), _mm_cmpge_ps(out3, kOne4));
+
+                        __m128 grad0 = _mm_loadu_ps(&gradients[batch_offset + i + 0]);
+                        __m128 grad1 = _mm_loadu_ps(&gradients[batch_offset + i + 4]);
+                        __m128 grad2 = _mm_loadu_ps(&gradients[batch_offset + i + 8]);
+                        __m128 grad3 = _mm_loadu_ps(&gradients[batch_offset + i + 12]);
+
+                        // (~clipped) & grad: clipped lanes become zero, the rest pass through.
+                        grad0 = _mm_andnot_ps(clipped0, grad0);
+                        grad1 = _mm_andnot_ps(clipped1, grad1);
+                        grad2 = _mm_andnot_ps(clipped2, grad2);
+                        grad3 = _mm_andnot_ps(clipped3, grad3);
+
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 0], grad0);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 4], grad1);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 8], grad2);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 12], grad3);
+
+                        // One bit per lane, 16 lanes in total; the set bits are the clipped units.
+                        const int clipped_mask =
+                            (_mm_movemask_ps(clipped0) << 0)
+                            | (_mm_movemask_ps(clipped1) << 4)
+                            | (_mm_movemask_ps(clipped2) << 8)
+                            | (_mm_movemask_ps(clipped3) << 12);
+
+                        thread_state.num_clipped_ += popcount(clipped_mask);
+                    }
+                }
+            }
+
+#else
+
+            for (IndexType b = offset; b < offset + count; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    const IndexType index = batch_offset + i;
+                    const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne);
+                    // Select zero instead of multiplying by 0: a non-finite incoming
+                    // gradient times 0 is NaN, whereas the SSE2 path masks the lane to
+                    // zero. Selecting keeps both paths in agreement.
+                    gradients_[index] = clipped ? +kZero : gradients[index];
+                    thread_state.num_clipped_ += clipped;
+                }
+            }
+
+#endif
+
+            thread_state.num_total_ += count * kOutputDimensions;
+
+            previous_layer_trainer_->backpropagate(th, gradients_.data(), offset, count);
+        }
+
+        // Merges the statistics of every slot after the first into slot 0,
+        // in index order. Does nothing when there are fewer than two slots.
+        void reduce_thread_state()
+        {
+            const auto num_states = thread_states_.size();
+
+            for (IndexType idx = 1; idx < num_states; ++idx)
+                thread_states_[0] += thread_states_[idx];
+        }
+
+        // Nothing is finalized in this trainer itself; the end-of-step call
+        // is handed on unchanged to the previous layer's trainer.
+        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
+            previous_layer_trainer_->step_end(thread_pool, learning_rate);
+        }
+
+    private:
+        // Constructor (private; instances are built through create()).
+        // Creates the trainer of the previous layer from
+        // target_layer->previous_layer_ and remembers target_layer.
+        // Every scalar member is given a defined value: input_ stays null
+        // until step_start() stores the previous layer's buffer pointer in it.
+        Trainer(LayerType* target_layer, FeatureTransformer* ft) :
+            batch_size_(0),
+            num_total_(0),
+            input_(nullptr),
+            previous_layer_trainer_(Trainer<PreviousLayer>::create(
+                &target_layer->previous_layer_, ft)),
+            target_layer_(target_layer) {
+
+            reset_stats();
+        }
+
+        // Puts every per-thread statistics slot back into its initial state.
+        void reset_stats()
+        {
+            for (IndexType idx = 0; idx < thread_states_.size(); ++idx)
+                thread_states_[idx].reset();
+        }
+
+        // Check if there are any problems with learning.
+        //
+        // Merges all per-thread statistics into slot 0, prints the largest
+        // per-unit minimum activation, the smallest per-unit maximum activation
+        // and the percentage of clipped outputs, then resets the statistics.
+        void check_health() {
+
+            // thread_states_ is only populated by step_start(); without any slot
+            // there is nothing to report and indexing slot 0 would be out of bounds.
+            if (thread_states_.empty())
+                return;
+
+            reduce_thread_state();
+
+            const auto& main_thread_state = thread_states_[0];
+
+            const auto largest_min_activation = *std::max_element(
+                std::begin(main_thread_state.min_activations_), std::end(main_thread_state.min_activations_));
+            const auto smallest_max_activation = *std::min_element(
+                std::begin(main_thread_state.max_activations_), std::end(main_thread_state.max_activations_));
+
+            // Report 0% instead of dividing by zero when nothing was backpropagated
+            // since the last reset.
+            const double clipped_percent = main_thread_state.num_total_ != 0
+                ? static_cast<double>(main_thread_state.num_clipped_) / main_thread_state.num_total_ * 100.0
+                : 0.0;
+
+            auto out = sync_region_cout.new_region();
+
+            out << "INFO (check_health):"
+                << " layer " << LayerType::kLayerIndex
+                << " - " << LayerType::get_name()
+                << std::endl;
+
+            out << "  - largest min activation = " << largest_min_activation
+                << " , smallest max activation = " << smallest_max_activation
+                << std::endl;
+
+            out << "  - clipped " << clipped_percent << "% of outputs"
+                << std::endl;
+
+            out.unlock();
+
+            reset_stats();
+        }
+
+        // Number of input/output dimensions; both are taken from LayerType::kOutputDimensions, so they are equal
+        static constexpr IndexType kInputDimensions = LayerType::kOutputDimensions;
+        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+
+        // Clamp bounds (0 and 1) as LearnFloatType constants
+        static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
+        static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
+
+        // Number of samples in the mini-batch (stored by step_start)
+        IndexType batch_size_;
+
+        IndexType num_total_;  // NOTE(review): never read by any method of this class; the statistics use ThreadState::num_total_ - confirm whether this member is still needed
+
+        const LearnFloatType* input_;  // pointer returned by the previous layer's step_start(); read by propagate()
+
+        // Trainer of the previous layer
+        const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+
+        // Layer to learn
+        LayerType* const target_layer_;
+
+        // Forward propagation buffer: clamped outputs, kOutputDimensions values per example
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
+
+        // Buffer for back propagation: masked gradients that are passed to the previous layer
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
+
+        // Health-check statistics kept as one cache-line-aligned object so that
+        // separate instances can be merged with operator+=.
+        struct alignas(kCacheLineSize) ThreadState
+        {
+            // Smallest / largest output recorded for each unit since the last reset()
+            LearnFloatType min_activations_[kOutputDimensions];
+            LearnFloatType max_activations_[kOutputDimensions];
+            // Number of clipped outputs and number of outputs counted in total
+            IndexType num_clipped_;
+            IndexType num_total_;
+
+            // A new instance starts in the reset state.
+            ThreadState() { reset(); }
+
+            // Merges `other` into this instance: per-unit minimum of the minima,
+            // per-unit maximum of the maxima, and the sums of both counters.
+            ThreadState& operator+=(const ThreadState& other)
+            {
+                for (IndexType i = 0; i < kOutputDimensions; ++i)
+                {
+                    min_activations_[i] = std::min(min_activations_[i], other.min_activations_[i]);
+                    max_activations_[i] = std::max(max_activations_[i], other.max_activations_[i]);
+                }
+
+                num_clipped_ += other.num_clipped_;
+                num_total_ += other.num_total_;
+
+                return *this;
+            }
+
+            // Restores the initial state. The sentinels use the arrays' own element
+            // type (LearnFloatType) so that any recorded value replaces them: the
+            // minima start at the largest value and the maxima at the lowest.
+            void reset()
+            {
+                std::fill(std::begin(min_activations_), std::end(min_activations_), std::numeric_limits<LearnFloatType>::max());
+                std::fill(std::begin(max_activations_), std::end(max_activations_), std::numeric_limits<LearnFloatType>::lowest());
+                num_clipped_ = 0;
+                num_total_ = 0;
+            }
+        };
+
+        std::vector<ThreadState, CacheLineAlignedAllocator<ThreadState>> thread_states_;
+    };
+
+}  // namespace Eval::NNUE

 #endif
--- a/Show More
+++ b/Show More