Merge branch 'master' into stockfish-nnue-2020-08-30-macos

2026-05-20 16:47:37 +00:00 · 2020-12-08 22:49:11 +08:00
parent bb26ce5aa1 3a1bd1185f
commit 055f907315
121 changed files with 23203 additions and 9127 deletions
@@ -28,43 +28,49 @@ else
 EXE = stockfish
 endif

-### Installation dir definitions
-PREFIX = /usr/local
-BINDIR = $(PREFIX)/bin
-
-### Built-in benchmark for pgo-builds
-PGOBENCH = ./$(EXE) bench
-PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 100000
-
-### Source and object files
-SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
-	material.cpp misc.cpp movegen.cpp movepick.cpp pawns.cpp position.cpp psqt.cpp \
-	search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
-	nnue/evaluate_nnue.cpp \
-	nnue/evaluate_nnue_learner.cpp \
-	nnue/features/half_kp.cpp \
-	nnue/features/half_relative_kp.cpp \
-	nnue/features/k.cpp \
-	nnue/features/p.cpp \
-	nnue/features/castling_right.cpp \
-	nnue/features/enpassant.cpp \
-	nnue/nnue_test_command.cpp \
-	extra/sfen_packer.cpp \
-	learn/gensfen2019.cpp \
-	learn/learner.cpp \
-	learn/learning_tools.cpp \
-	learn/multi_think.cpp
-
-OBJS = $(notdir $(SRCS:.cpp=.o))
-
-VPATH = syzygy:nnue:nnue/features:eval:extra:learn
-
 ### Establish the operating system name
 KERNEL = $(shell uname -s)
 ifeq ($(KERNEL),Linux)
 	OS = $(shell uname -o)
 endif

+### Installation dir definitions
+PREFIX = /usr/local
+BINDIR = $(PREFIX)/bin
+
+### Built-in benchmark and training-data generation commands for pgo-builds
+PGO_TRAINING_DATA_FILE = pgo_training_data.bin
+PGOBENCH = ./$(EXE) bench
+PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 sfen_format bin output_file_name $(PGO_TRAINING_DATA_FILE)
+
+### Source and object files
+SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
+	material.cpp misc.cpp movegen.cpp movepick.cpp pawns.cpp position.cpp psqt.cpp \
+	search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
+	extra/stockfish_blas.cpp \
+	nnue/evaluate_nnue.cpp \
+	nnue/evaluate_nnue_learner.cpp \
+	nnue/features/half_kp.cpp \
+	nnue/features/half_ka.cpp \
+	nnue/features/half_relative_kp.cpp \
+	nnue/features/half_relative_ka.cpp \
+	nnue/features/k.cpp \
+	nnue/features/p.cpp \
+	nnue/features/a.cpp \
+	nnue/features/castling_right.cpp \
+	nnue/features/enpassant.cpp \
+	nnue/nnue_test_command.cpp \
+	learn/sfen_packer.cpp \
+	learn/learn.cpp \
+	learn/gensfen.cpp \
+	learn/opening_book.cpp \
+	learn/convert.cpp \
+	learn/transform.cpp
+
+OBJS = $(notdir $(SRCS:.cpp=.o))
+
+VPATH = syzygy:nnue:nnue/features:eval:extra:learn
+
 ### ==========================================================================
 ### Section 2. High-level Configuration
 ### ==========================================================================
@@ -99,17 +105,23 @@ endif

 ### 2.1. General and architecture defaults

+ifeq ($(ARCH),)
+   ARCH = x86-64-modern
+   help_skip_sanity = yes
+endif
 # Explicitly check ARCH against the list of supported architectures (as listed by make help).
 # The user can override the result with `make ARCH=x86-32-vnni256 SUPPORTED_ARCH=true`.
-ifeq ($(ARCH),$(filter $(ARCH),x86-64-vnni512 x86-64-vnni256 x86-64-avx512 x86-64-bmi2 x86-64-avx2 \
-                               x86-64-sse41-popcnt x86-64-modern x86-64-ssse3 x86-64-sse3-popcnt \
-                               x86-64 x86-32-sse41-popcnt x86-32-sse2 x86-32 ppc-64 ppc-32 \
-                               armv7 armv7-neon armv8 apple-silicon general-64 general-32))
+ifeq ($(ARCH), $(filter $(ARCH), \
+                 x86-64-vnni512 x86-64-vnni256 x86-64-avx512 x86-64-bmi2 x86-64-avx2 \
+                 x86-64-sse41-popcnt x86-64-modern x86-64-ssse3 x86-64-sse3-popcnt \
+                 x86-64 x86-32-sse41-popcnt x86-32-sse2 x86-32 ppc-64 ppc-32 \
+                 armv7 armv7-neon armv8 apple-silicon general-64 general-32))
   SUPPORTED_ARCH=true
 else
   SUPPORTED_ARCH=false
 endif

+blas = no
 optimize = yes
 debug = no
 sanitize = no
@@ -127,7 +139,6 @@ avx512 = no
 vnni256 = no
 vnni512 = no
 neon = no
-ARCH = x86-64-modern
 STRIP = strip

 ### 2.2 Architecture specific
@@ -306,9 +317,9 @@ endif
 ### ==========================================================================

 ### 3.1 Selecting compiler (default = gcc)
-CXXFLAGS += -Wall -Wcast-qual -fno-exceptions -std=c++17 $(EXTRACXXFLAGS) $(LEARNCXXFLAGS)
-DEPENDFLAGS += -std=c++17
-LDFLAGS += $(EXTRALDFLAGS) $(LEARNLDFLAGS)
+CXXFLAGS += -g -Wall -Wcast-qual -fno-exceptions -std=c++17 -fopenmp -I. $(EXTRACXXFLAGS)
+LDFLAGS += -fopenmp $(EXTRALDFLAGS)
+DEPENDFLAGS += -std=c++17 -I.

 ifeq ($(COMP),)
 	COMP=gcc
@@ -391,19 +402,6 @@ ifeq ($(COMP),clang)
 	endif
 endif

-ifeq ($(comp),icc)
-	profile_make = icc-profile-make
-	profile_use = icc-profile-use
-else
-ifeq ($(comp),clang)
-	profile_make = clang-profile-make
-	profile_use = clang-profile-use
-else
-	profile_make = gcc-profile-make
-	profile_use = gcc-profile-use
-endif
-endif
-
 ifeq ($(KERNEL),Darwin)
 	CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.14
 	LDFLAGS += -arch $(arch) -mmacosx-version-min=10.14
@@ -415,20 +413,30 @@ endif
 # We do not yet know how to make PGO builds with the NDK.
 ifeq ($(COMP),ndk)
 	CXXFLAGS += -stdlib=libc++ -fPIE
+	comp=clang
 	ifeq ($(arch),armv7)
-		comp=armv7a-linux-androideabi16-clang
 		CXX=armv7a-linux-androideabi16-clang++
 		CXXFLAGS += -mthumb -march=armv7-a -mfloat-abi=softfp -mfpu=neon
 		STRIP=arm-linux-androideabi-strip
 	endif
 	ifeq ($(arch),armv8)
-		comp=aarch64-linux-android21-clang
 		CXX=aarch64-linux-android21-clang++
 		STRIP=aarch64-linux-android-strip
 	endif
 	LDFLAGS += -static-libstdc++ -pie -lm -latomic
 endif

+ifeq ($(comp),icc)
+	profile_make = icc-profile-make
+	profile_use = icc-profile-use
+else ifeq ($(comp),clang)
+	profile_make = clang-profile-make
+	profile_use = clang-profile-use
+else
+	profile_make = gcc-profile-make
+	profile_use = gcc-profile-use
+endif
+
 ### Travis CI script uses COMPILER to overwrite CXX
 ifdef COMPILER
 	COMPCXX=$(COMPILER)
@@ -463,14 +471,33 @@ ifneq ($(comp),mingw)
 endif
 endif

-### 3.2.1 Debugging
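+### 3.2.1. BLAS libraries
+# NOTE(review): every non-Linux kernel takes the else branch below, which hard-codes a
+# MinGW include path and static linking -- confirm this is intended for Darwin as well.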
+ifeq ($(blas), yes)
+	LDFLAGS += -lopenblas
+
+	ifeq ($(KERNEL),Linux)
+		LDFLAGS +=
+	else
+		CXXFLAGS += -I/mingw64/include/OpenBLAS
+
+		ifeq ($(debug),yes)
+			LDFLAGS += -Wl,-static
+		else
+			LDFLAGS += -Wl,-s -static
+		endif
+	endif
+
+	CXXFLAGS += -DUSE_BLAS
+endif
+
+### 3.2.2 Debugging
 ifeq ($(debug),no)
 	CXXFLAGS += -DNDEBUG
 else
 	CXXFLAGS += -g
 endif

-### 3.2.2 Debugging with undefined behavior sanitizers
+### 3.2.3 Debugging with undefined behavior sanitizers
 ifneq ($(sanitize),no)
        CXXFLAGS += -g3 -fsanitize=$(sanitize)
        LDFLAGS += -fsanitize=$(sanitize)
@@ -600,11 +627,13 @@ endif
 ### needs access to the optimization flags.
 ifeq ($(optimize),yes)
 ifeq ($(debug), no)
-	ifeq ($(COMP),ndk)
-		CXXFLAGS += -flto=thin
-		LDFLAGS += $(CXXFLAGS)
-	else ifeq ($(comp),clang)
+	ifeq ($(comp),clang)
 		CXXFLAGS += -flto=thin
+		ifneq ($(findstring MINGW,$(KERNEL)),)
+			CXXFLAGS += -fuse-ld=lld
+		else ifneq ($(findstring MSYS,$(KERNEL)),)
+			CXXFLAGS += -fuse-ld=lld
+		endif
 		LDFLAGS += $(CXXFLAGS)

 # GCC and CLANG use different methods for parallelizing LTO and CLANG pretends to be
@@ -628,10 +657,12 @@ ifeq ($(debug), no)
 # So, only enable it for a cross from Linux by default.
 	else ifeq ($(comp),mingw)
 	ifeq ($(KERNEL),Linux)
+	ifneq ($(arch),i386)
 		CXXFLAGS += -flto
 		LDFLAGS += $(CXXFLAGS) -flto=jobserver
 	endif
 	endif
+	endif
 endif
 endif

@@ -707,11 +738,12 @@ help:
 	@echo "make -j build ARCH=x86-64-ssse3 COMP=clang"
 	@echo ""
 	@echo "-------------------------------"
-ifeq ($(SUPPORTED_ARCH), true)
+ifeq ($(SUPPORTED_ARCH)$(help_skip_sanity), true)
 	@echo "The selected architecture $(ARCH) will enable the following configuration: "
 	@$(MAKE) ARCH=$(ARCH) COMP=$(COMP) config-sanity
 else
 	@echo "Specify a supported architecture with the ARCH option for more details"
+	@echo ""
 endif


@@ -719,7 +751,7 @@ endif
        config-sanity icc-profile-use icc-profile-make gcc-profile-use gcc-profile-make \
        clang-profile-use clang-profile-make

-build: config-sanity
+build: net config-sanity
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) all

 profile-build: net config-sanity objclean profileclean
@@ -729,6 +761,7 @@ profile-build: net config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 2/4. Running benchmark for pgo-build ..."
 	$(PGOBENCH) > /dev/null
+	$(PGOGENSFEN) > /dev/null
 	@echo ""
 	@echo "Step 3/4. Building optimized executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
@@ -745,12 +778,13 @@ install:
 	-cp $(EXE) $(BINDIR)
 	-strip $(BINDIR)/$(EXE)

-#clean all
+# clean all
 clean: objclean profileclean
 	@rm -f .depend *~ core

+# evaluation network (nnue)
 net:
-	$(eval nnuenet := $(shell grep EvalFile ucioption.cpp | grep Option | sed 's/.*\(nn-[a-z0-9]\{12\}.nnue\).*/\1/'))
+	$(eval nnuenet := $(shell grep EvalFileDefaultName evaluate.h | grep define | sed 's/.*\(nn-[a-z0-9]\{12\}.nnue\).*/\1/'))
 	@echo "Default net: $(nnuenet)"
 	$(eval nnuedownloadurl := https://tests.stockfishchess.org/api/nn/$(nnuenet))
 	$(eval curl_or_wget := $(shell if hash curl 2>/dev/null; then echo "curl -skL"; elif hash wget 2>/dev/null; then echo "wget -qO-"; fi))
@@ -772,7 +806,6 @@ net:
            echo "shasum / sha256sum not found, skipping net validation"; \
        fi

-
 # clean binaries and objects
 objclean:
 	@rm -f $(EXE) *.o ./syzygy/*.o ./nnue/*.o ./nnue/features/*.o ./learn/*.o ./extra/*.o ./eval/*.o
@@ -782,6 +815,7 @@ profileclean:
 	@rm -rf profdir
 	@rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda *.s ./learn/*.gcda ./extra/*.gcda ./eval/*.gcda
 	@rm -f stockfish.profdata *.profraw
+	@rm -f $(PGO_TRAINING_DATA_FILE)

 default:
 	help
@@ -792,7 +826,7 @@ default:

 all: $(EXE) .depend

-config-sanity:
+config-sanity: net
 	@echo ""
 	@echo "Config:"
 	@echo "debug: '$(debug)'"
@@ -913,6 +947,6 @@ profile-learn: config-sanity objclean profileclean
 	rm generated_kifu.bin

 .depend:
-	-@$(CXX) $(DEPENDFLAGS) -MM $(SRCS) > $@ 2> /dev/null
+	-@$(CXX) $(DEPENDFLAGS) -MM $(SRCS) > $@

 -include .depend
@@ -164,5 +164,7 @@ vector<string> setup_bench(const Position& current, istream& is) {
          ++posCounter;
      }

+  list.emplace_back("setoption name Use NNUE value true");
+
  return list;
 }
@@ -1,82 +0,0 @@
-#ifndef _EVALUATE_COMMON_H_
-#define _EVALUATE_COMMON_H_
-
-// A common header-like function for modern evaluation functions (EVAL_KPPT and EVAL_KPP_KKPT).
-
-#if defined(EVAL_NNUE) || defined(EVAL_LEARN)
-#include <functional>
-
-// KK file name
-#define KK_BIN "KK_synthesized.bin"
-
-// KKP file name
-#define KKP_BIN "KKP_synthesized.bin"
-
-// KPP file name
-#define KPP_BIN "KPP_synthesized.bin"
-
-namespace Eval
-{
-
-#if defined(USE_EVAL_HASH)
-	// prefetch function
-	void prefetch_evalhash(const Key key);
-#endif
-
-	// An operator that applies the function f to each parameter of the evaluation function.
-	// Used for parameter analysis etc.
-	// type indicates the survey target.
-	// type = -1 :KK,KKP,KPP all
-	// type = 0: KK only
-	// type = 1: KKP only
-	// type = 2: KPP only
-	void foreach_eval_param(std::function<void(int32_t, int32_t)>f, int type = -1);
-
-	// --------------------------
-	// for learning
-	// --------------------------
-
-#if defined(EVAL_LEARN)
-	// Initialize the gradient array during learning
-	// Pass the learning rate as an argument. If 0.0, the default value is used.
-	// The epoch of update_weights() gradually changes from eta to eta2 until eta_epoch.
-	// After eta2_epoch, gradually change from eta2 to eta3.
-	void init_grad(double eta1, uint64_t eta_epoch, double eta2, uint64_t eta2_epoch, double eta3);
-
-	// Add the gradient difference value to the gradient array for all features that appear in the current phase.
-	// freeze[0]: Flag that kk does not learn
-	// freeze[1]: Flag that kkp does not learn
-	// freeze[2]: Flag that kpp does not learn
-	// freeze[3]: Flag that kppp does not learn
-	void add_grad(Position& pos, Color rootColor, double delt_grad, const std::array<bool, 4>& freeze);
-
-	// Do SGD or AdaGrad or something based on the current gradient.
-	// epoch: Generation counter (starting from 0)
-	// freeze[0]: Flag that kk does not learn
-	// freeze[1]: Flag that kkp does not learn
-	// freeze[2]: Flag that kpp does not learn
-	// freeze[3]: Flag that kppp does not learn
-	void update_weights(uint64_t epoch, const std::array<bool, 4>& freeze);
-
-	// Save the evaluation function parameters to a file.
-	// You can specify the extension added to the end of the file.
-	void save_eval(std::string suffix);
-
-	// Get the current eta.
-	double get_eta();
-
-	// --learning related commands
-
-	// A function that normalizes KK. Note that it is not completely equivalent to the original evaluation function.
-	// By making the values of kkp and kpp as close to zero as possible, the value of the feature factor (which is zero) that did not appear during learning
-	// The idea of ensuring it is valid.
-	void regularize_kk();
-
-#endif
-
-
-}
-
-#endif  // defined(EVAL_NNUE) || defined(EVAL_LEARN)
-
-#endif // _EVALUATE_KPPT_COMMON_H_
@@ -20,61 +20,25 @@
 #include <cassert>
 #include <cstdlib>
 #include <cstring>   // For std::memset
+#include <fstream>
 #include <iomanip>
 #include <sstream>
 #include <iostream>
-#include <set>
+#include <streambuf>
+#include <vector>
+
+#include "nnue/evaluate_nnue.h"

 #include "bitboard.h"
 #include "evaluate.h"
 #include "material.h"
+#include "misc.h"
 #include "pawns.h"
 #include "thread.h"
 #include "uci.h"
+#include "incbin/incbin.h"

-#ifdef EVAL_LEARN
-namespace Learner
-{
-    extern bool use_raw_nnue_eval;
-}
-#endif
-
-namespace Eval {
-
-  bool useNNUE;
-  std::string eval_file_loaded="None";
-
-  void init_NNUE() {
-
-    useNNUE = Options["Use NNUE"];
-    std::string eval_file = std::string(Options["EvalFile"]);
-    if (useNNUE && eval_file_loaded != eval_file)
-        if (Eval::NNUE::load_eval_file(eval_file))
-            eval_file_loaded = eval_file;
-  }
-
-  void verify_NNUE() {
-
-    std::string eval_file = std::string(Options["EvalFile"]);
-    if (useNNUE && eval_file_loaded != eval_file)
-    {
-        UCI::OptionsMap defaults;
-        UCI::init(defaults);
-
-        sync_cout << "info string ERROR: NNUE evaluation used, but the network file " << eval_file << " was not loaded successfully." << sync_endl;
-        sync_cout << "info string ERROR: The UCI option EvalFile might need to specify the full path, including the directory/folder name, to the file." << sync_endl;
-        sync_cout << "info string ERROR: The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/"+std::string(defaults["EvalFile"]) << sync_endl;
-        sync_cout << "info string ERROR: If the UCI option Use NNUE is set to true, network evaluation parameters compatible with the program must be available." << sync_endl;
-        sync_cout << "info string ERROR: The engine will be terminated now." << sync_endl;
-        std::exit(EXIT_FAILURE);
-    }
-
-    if (useNNUE)
-        sync_cout << "info string NNUE evaluation using " << eval_file << " enabled." << sync_endl;
-    else
-        sync_cout << "info string classical evaluation enabled." << sync_endl;
-  }
-}
+using namespace std;

 namespace Trace {

@@ -120,11 +84,11 @@ using namespace Trace;
 namespace {

  // Thresholds for lazy, space and NNUE evaluation
-  constexpr Value LazyThreshold1 =  Value(1400);
-  constexpr Value LazyThreshold2 =  Value(1300);
-  constexpr Value SpaceThreshold = Value(12222);
-  constexpr Value NNUEThreshold1 =   Value(550);
-  constexpr Value NNUEThreshold2 =   Value(150);
+  constexpr Value LazyThreshold1 =  Value(1565);
+  constexpr Value LazyThreshold2 =  Value(1102);
+  constexpr Value SpaceThreshold = Value(11551);
+  constexpr Value NNUEThreshold1 =   Value(682);
+  constexpr Value NNUEThreshold2 =   Value(176);

  // KingAttackWeights[PieceType] contains king attack weights by piece type
  constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 };
@@ -132,7 +96,7 @@ namespace {
  // SafeCheck[PieceType][single/multiple] contains safe check bonus by piece type,
  // higher if multiple safe checks are possible for that piece type.
  constexpr int SafeCheck[][2] = {
-      {}, {}, {792, 1283}, {645, 967}, {1084, 1897}, {772, 1119}
+      {}, {}, {803, 1292}, {639, 974}, {1087, 1878}, {759, 1132}
  };

 #define S(mg, eg) make_score(mg, eg)
@@ -140,19 +104,25 @@ namespace {
  // MobilityBonus[PieceType-2][attacked] contains bonuses for middle and end game,
  // indexed by piece type and number of attacked squares in the mobility area.
  constexpr Score MobilityBonus[][32] = {
-    { S(-62,-81), S(-53,-56), S(-12,-31), S( -4,-16), S(  3,  5), S( 13, 11), // Knight
-      S( 22, 17), S( 28, 20), S( 33, 25) },
-    { S(-48,-59), S(-20,-23), S( 16, -3), S( 26, 13), S( 38, 24), S( 51, 42), // Bishop
-      S( 55, 54), S( 63, 57), S( 63, 65), S( 68, 73), S( 81, 78), S( 81, 86),
-      S( 91, 88), S( 98, 97) },
-    { S(-60,-78), S(-20,-17), S(  2, 23), S(  3, 39), S(  3, 70), S( 11, 99), // Rook
-      S( 22,103), S( 31,121), S( 40,134), S( 40,139), S( 41,158), S( 48,164),
-      S( 57,168), S( 57,169), S( 62,172) },
-    { S(-30,-48), S(-12,-30), S( -8, -7), S( -9, 19), S( 20, 40), S( 23, 55), // Queen
-      S( 23, 59), S( 35, 75), S( 38, 78), S( 53, 96), S( 64, 96), S( 65,100),
-      S( 65,121), S( 66,127), S( 67,131), S( 67,133), S( 72,136), S( 72,141),
-      S( 77,147), S( 79,150), S( 93,151), S(108,168), S(108,168), S(108,171),
-      S(110,182), S(114,182), S(114,192), S(116,219) }
+    { S(-62,-79), S(-53,-57), S(-12,-31), S( -3,-17), S(  3,  7), S( 12, 13), // Knight
+      S( 21, 16), S( 28, 21), S( 37, 26) },
+    { S(-47,-59), S(-20,-25), S( 14, -8), S( 29, 12), S( 39, 21), S( 53, 40), // Bishop
+      S( 53, 56), S( 60, 58), S( 62, 65), S( 69, 72), S( 78, 78), S( 83, 87),
+      S( 91, 88), S( 96, 98) },
+    { S(-60,-82), S(-24,-15), S(  0, 17) ,S(  3, 43), S(  4, 72), S( 14,100), // Rook
+      S( 20,102), S( 30,122), S( 41,133), S(41 ,139), S( 41,153), S( 45,160),
+      S( 57,165), S( 58,170), S( 67,175) },
+    { S(-29,-49), S(-16,-29), S( -8, -8), S( -8, 17), S( 18, 39), S( 25, 54), // Queen
+      S( 23, 59), S( 37, 73), S( 41, 76), S( 54, 95), S( 65, 95) ,S( 68,101),
+      S( 69,124), S( 70,128), S( 70,132), S( 70,133) ,S( 71,136), S( 72,140),
+      S( 74,147), S( 76,149), S( 90,153), S(104,169), S(105,171), S(106,171),
+      S(112,178), S(114,185), S(114,187), S(119,221) }
+  };
+
+  // BishopPawns[distance from edge] contains a file-dependent penalty for pawns on
+  // squares of the same color as our bishop.
+  constexpr Score BishopPawns[int(FILE_NB) / 2] = {
+    S(3, 8), S(3, 9), S(1, 8), S(3, 7)
  };

  // KingProtector[knight/bishop] contains penalty for each distance unit to own king
@@ -160,32 +130,31 @@ namespace {

  // Outpost[knight/bishop] contains bonuses for each knight or bishop occupying a
  // pawn protected square on rank 4 to 6 which is also safe from a pawn attack.
-  constexpr Score Outpost[] = { S(56, 36), S(30, 23) };
+  constexpr Score Outpost[] = { S(56, 34), S(31, 23) };

  // PassedRank[Rank] contains a bonus according to the rank of a passed pawn
  constexpr Score PassedRank[RANK_NB] = {
-    S(0, 0), S(10, 28), S(17, 33), S(15, 41), S(62, 72), S(168, 177), S(276, 260)
+    S(0, 0), S(9, 28), S(15, 31), S(17, 39), S(64, 70), S(171, 177), S(277, 260)
  };

  // RookOnFile[semiopen/open] contains bonuses for each rook when there is
  // no (friendly) pawn on the rook file.
-  constexpr Score RookOnFile[] = { S(19, 7), S(48, 29) };
+  constexpr Score RookOnFile[] = { S(19, 7), S(48, 27) };

  // ThreatByMinor/ByRook[attacked PieceType] contains bonuses according to
  // which piece type attacks which one. Attacks on lesser pieces which are
  // pawn-defended are not considered.
  constexpr Score ThreatByMinor[PIECE_TYPE_NB] = {
-    S(0, 0), S(5, 32), S(57, 41), S(77, 56), S(88, 119), S(79, 161)
+    S(0, 0), S(5, 32), S(55, 41), S(77, 56), S(89, 119), S(79, 162)
  };

  constexpr Score ThreatByRook[PIECE_TYPE_NB] = {
-    S(0, 0), S(3, 46), S(37, 68), S(42, 60), S(0, 38), S(58, 41)
+    S(0, 0), S(3, 44), S(37, 68), S(42, 60), S(0, 39), S(58, 43)
  };

  // Assorted bonuses and penalties
  constexpr Score BadOutpost          = S( -7, 36);
  constexpr Score BishopOnKingRing    = S( 24,  0);
-  constexpr Score BishopPawns         = S(  3,  7);
  constexpr Score BishopXRayPawns     = S(  4,  5);
  constexpr Score CorneredBishop      = S( 50, 50);
  constexpr Score FlankAttacks        = S(  8,  0);
@@ -198,7 +167,6 @@ namespace {
  constexpr Score ReachableOutpost    = S( 31, 22);
  constexpr Score RestrictedPiece     = S(  7,  7);
  constexpr Score RookOnKingRing      = S( 16,  0);
-  constexpr Score RookOnQueenFile     = S(  6, 11);
  constexpr Score SliderOnQueen       = S( 60, 18);
  constexpr Score ThreatByKing        = S( 24, 89);
  constexpr Score ThreatByPawnPush    = S( 48, 39);
@@ -387,7 +355,7 @@ namespace {
                // when the bishop is outside the pawn chain.
                Bitboard blocked = pos.pieces(Us, PAWN) & shift<Down>(pos.pieces());

-                score -= BishopPawns * pos.pawns_on_same_color_squares(Us, s)
+                score -= BishopPawns[edge_distance(file_of(s))] * pos.pawns_on_same_color_squares(Us, s)
                                     * (!(attackedBy[Us][PAWN] & s) + popcount(blocked & CenterFiles));

                // Penalty for all enemy pawns x-rayed
@@ -414,10 +382,6 @@ namespace {

        if (Pt == ROOK)
        {
-            // Bonus for rook on the same file as a queen
-            if (file_bb(s) & pos.pieces(QUEEN))
-                score += RookOnQueenFile;
-
            // Bonus for rook on an open or semi-open file
            if (pos.is_on_semiopen_file(Us, s))
                score += RookOnFile[pos.is_on_semiopen_file(Them, s)];
@@ -515,18 +479,18 @@ namespace {
    int kingFlankAttack  = popcount(b1) + popcount(b2);
    int kingFlankDefense = popcount(b3);

-    kingDanger +=        kingAttackersCount[Them] * kingAttackersWeight[Them]
-                 + 185 * popcount(kingRing[Us] & weak)
-                 + 148 * popcount(unsafeChecks)
-                 +  98 * popcount(pos.blockers_for_king(Us))
-                 +  69 * kingAttacksCount[Them]
-                 +   3 * kingFlankAttack * kingFlankAttack / 8
-                 +       mg_value(mobility[Them] - mobility[Us])
-                 - 873 * !pos.count<QUEEN>(Them)
-                 - 100 * bool(attackedBy[Us][KNIGHT] & attackedBy[Us][KING])
-                 -   6 * mg_value(score) / 8
-                 -   4 * kingFlankDefense
-                 +  37;
+    kingDanger +=        kingAttackersCount[Them] * kingAttackersWeight[Them] // (~10 Elo)
+                 + 185 * popcount(kingRing[Us] & weak)                        // (~15 Elo)
+                 + 148 * popcount(unsafeChecks)                               // (~4 Elo)
+                 +  98 * popcount(pos.blockers_for_king(Us))                  // (~2 Elo)
+                 +  69 * kingAttacksCount[Them]                               // (~0.5 Elo)
+                 +   3 * kingFlankAttack * kingFlankAttack / 8                // (~0.5 Elo)
+                 +       mg_value(mobility[Them] - mobility[Us])              // (~0.5 Elo)
+                 - 873 * !pos.count<QUEEN>(Them)                              // (~24 Elo)
+                 - 100 * bool(attackedBy[Us][KNIGHT] & attackedBy[Us][KING])  // (~5 Elo)
+                 -   6 * mg_value(score) / 8                                  // (~8 Elo)
+                 -   4 * kingFlankDefense                                     // (~5 Elo)
+                 +  37;                                                       // (~0.5 Elo)

    // Transform the kingDanger units into a Score, and subtract it from the evaluation
    if (kingDanger > 100)
@@ -843,7 +807,9 @@ namespace {
            sf = 37 + 3 * (pos.count<QUEEN>(WHITE) == 1 ? pos.count<BISHOP>(BLACK) + pos.count<KNIGHT>(BLACK)
                                                        : pos.count<BISHOP>(WHITE) + pos.count<KNIGHT>(WHITE));
        else
-            sf = std::min(sf, 36 + 7 * pos.count<PAWN>(strongSide));
+            sf = std::min(sf, 36 + 7 * pos.count<PAWN>(strongSide)) - 4 * !pawnsOnBothFlanks;
+
+        sf -= 4 * !pawnsOnBothFlanks;
    }

    // Interpolate between the middlegame and (scaled by 'sf') endgame score
@@ -947,19 +913,47 @@ make_v:
 /// evaluation of the position from the point of view of the side to move.

 Value Eval::evaluate(const Position& pos) {
-#ifdef EVAL_LEARN
-  if (Learner::use_raw_nnue_eval) {
-      return NNUE::evaluate(pos);
+
+  Value v;
+
+  if (NNUE::useNNUE == NNUE::UseNNUEMode::Pure) {
+      v = NNUE::evaluate(pos);
+
+      // Guarantee evaluation does not hit the tablebase range
+      v = std::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1);
+
+      return v;
  }
-#endif
+  else if (NNUE::useNNUE == NNUE::UseNNUEMode::False)
+      v = Evaluation<NO_TRACE>(pos).value();
+  else
+  {
+      // Scale and shift NNUE for compatibility with search and classical evaluation
+      auto  adjusted_NNUE = [&](){
+         int mat = pos.non_pawn_material() + PawnValueMg * pos.count<PAWN>();
+         return NNUE::evaluate(pos) * (720 + mat / 32) / 1024 + Tempo;
+      };

-  bool classical = !Eval::useNNUE
-                ||  abs(eg_value(pos.psq_score())) * 16 > NNUEThreshold1 * (16 + pos.rule50_count());
-  Value v = classical ? Evaluation<NO_TRACE>(pos).value()
-                      : NNUE::evaluate(pos) * 5 / 4 + Tempo;
+      // Use classical eval if the PSQ imbalance is large or, with small probability,
+      // if it is smaller but still above a quarter of a pawn (PawnValueMg / 4)
+      Value psq = Value(abs(eg_value(pos.psq_score())));
+      int   r50 = 16 + pos.rule50_count();
+      bool  largePsq = psq * 16 > (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50;
+      bool  classical = largePsq || (psq > PawnValueMg / 4 && !(pos.this_thread()->nodes & 0xB));

-  if (classical && Eval::useNNUE && abs(v) * 16 < NNUEThreshold2 * (16 + pos.rule50_count()))
-      v = NNUE::evaluate(pos) * 5 / 4 + Tempo;
+      bool strongClassical = pos.non_pawn_material() < 2 * RookValueMg && pos.count<PAWN>() < 2;
+
+      v = classical || strongClassical ? Evaluation<NO_TRACE>(pos).value() : adjusted_NNUE();
+
+      // If the classical eval is small and imbalance large, use NNUE nevertheless.
+      // For the case of opposite colored bishops, switch to NNUE eval with
+      // small probability if the classical eval is less than the threshold.
+      if (   largePsq && !strongClassical
+          && (   abs(v) * 16 < NNUEThreshold2 * r50
+              || (   pos.opposite_bishops()
+                  && abs(v) * 16 < (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50
+                  && !(pos.this_thread()->nodes & 0xB))))
+          v = adjusted_NNUE();
+  }

  // Damp down the evaluation linearly when shuffling
  v = v * (100 - pos.rule50_count()) / 100;
@@ -1015,7 +1009,7 @@ std::string Eval::trace(const Position& pos) {

  ss << "\nClassical evaluation: " << to_cp(v) << " (white side)\n";

-  if (Eval::useNNUE)
+  if (NNUE::useNNUE != NNUE::UseNNUEMode::False)
  {
      v = NNUE::evaluate(pos);
      v = pos.side_to_move() == WHITE ? v : -v;
@@ -26,23 +26,13 @@
 class Position;

 namespace Eval {
-
  std::string trace(const Position& pos);
  Value evaluate(const Position& pos);

-  extern bool useNNUE;
-  extern std::string eval_file_loaded;
-  void init_NNUE();
-  void verify_NNUE();
-
-  namespace NNUE {
-
-    Value evaluate(const Position& pos);
-    Value compute_eval(const Position& pos);
-    void  update_eval(const Position& pos);
-    bool  load_eval_file(const std::string& evalFile);
-
-  } // namespace NNUE
+  // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
+  // for the build process (profile-build and fishtest) to work. Do not change the
+  // name of the macro, as it is used in the Makefile.
+  #define EvalFileDefaultName   "nn-c3ca321c51c9.nnue"

 } // namespace Eval

@@ -1,429 +0,0 @@
-#if defined (EVAL_LEARN)
-
-#include "../misc.h"
-#include "../position.h"
-
-#include <sstream>
-#include <fstream>
-#include <cstring> // std::memset()
-
-using namespace std;
-
-// -----------------------------------
-// stage compression/decompression
-// -----------------------------------
-
-// Class that handles bitstream
-// useful when doing aspect encoding
-struct BitStream
-{
-  // Set the memory to store the data in advance.
-  // Assume that memory is cleared to 0.
-  void  set_data(uint8_t* data_) { data = data_; reset(); }
-
-  // Get the pointer passed in set_data().
-  uint8_t* get_data() const { return data; }
-
-  // Get the cursor.
-  int get_cursor() const { return bit_cursor; }
-
-  // reset the cursor
-  void reset() { bit_cursor = 0; }
-
-  // Write 1bit to the stream.
-  // If b is non-zero, write out 1. If 0, write 0.
-  void write_one_bit(int b)
-  {
-    if (b)
-      data[bit_cursor / 8] |= 1 << (bit_cursor & 7);
-
-    ++bit_cursor;
-  }
-
-  // Get 1 bit from the stream.
-  int read_one_bit()
-  {
-    int b = (data[bit_cursor / 8] >> (bit_cursor & 7)) & 1;
-    ++bit_cursor;
-
-    return b;
-  }
-
-  // write n bits of data
-  // Data shall be written out from the lower order of d.
-  void write_n_bit(int d, int n)
-  {
-    for (int i = 0; i <n; ++i)
-      write_one_bit(d & (1 << i));
-  }
-
-  // read n bits of data
-  // Reverse conversion of write_n_bit().
-  int read_n_bit(int n)
-  {
-    int result = 0;
-    for (int i = 0; i < n; ++i)
-      result |= read_one_bit() ? (1 << i) : 0;
-
-    return result;
-  }
-
-private:
-  // Next bit position to read/write.
-  int bit_cursor;
-
-  // data entity
-  uint8_t* data;
-};
-
-
-// Huffman coding
-// * is simplified from mini encoding to make conversion easier.
-//
-// 1 box on the board (other than NO_PIECE) = 2 to 6 bits (+ 1-bit flag + 1-bit forward and backward)
-// 1 piece of hand piece = 1-5bit (+ 1-bit flag + 1bit ahead and behind)
-//
-// empty xxxxx0 + 0 (none)
-// step xxxx01 + 2 xxxx0 + 2
-// incense xx0011 + 2 xx001 + 2
-// Katsura xx1011 + 2 xx101 + 2
-// silver xx0111 + 2 xx011 + 2
-// Gold x01111 + 1 x0111 + 1 // Gold is valid and has no flags.
-// corner 011111 + 2 01111 + 2
-// Fly 111111 + 2 11111 + 2
-//
-// Assuming all pieces are on the board,
-// Sky 81-40 pieces = 41 boxes = 41bit
-// Walk 4bit*18 pieces = 72bit
-// Incense 6bit*4 pieces = 24bit
-// Katsura 6bit*4 pieces = 24bit
-// Silver 6bit*4 pieces = 24bit
-// Gold 6bit* 4 pieces = 24bit
-// corner 8bit* 2 pieces = 16bit
-// Fly 8bit* 2 pieces = 16bit
-// -------
-// 241bit + 1bit (turn) + 7bit × 2 (King's position after) = 256bit
-//
-// When the piece on the board moves to the hand piece, the piece on the board becomes empty, so the box on the board can be expressed with 1 bit,
-// Since the hand piece can be expressed by 1 bit less than the piece on the board, the total number of bits does not change in the end.
-// Therefore, in this expression, any aspect can be expressed by this bit number.
-// It is a hand piece and no flag is required, but if you include this, the bit number of the piece on the board will be -1
-// Since the total number of bits can be fixed, we will include this as well.
-
-// Huffman Encoding
-//
-// Empty  xxxxxxx0
-// Pawn   xxxxx001 + 1 bit (Side to move)
-// Knight xxxxx011 + 1 bit (Side to move)
-// Bishop xxxxx101 + 1 bit (Side to move)
-// Rook   xxxxx111 + 1 bit (Side to move)
-
-struct HuffmanedPiece
-{
-  int code; // how it will be coded
-  int bits; // How many bits do you have
-};
-
-HuffmanedPiece huffman_table[] =
-{
-  {0b0000,1}, // NO_PIECE
-  {0b0001,4}, // PAWN
-  {0b0011,4}, // KNIGHT
-  {0b0101,4}, // BISHOP
-  {0b0111,4}, // ROOK
-  {0b1001,4}, // QUEEN
-};
-
-// Class for compressing/decompressing sfen
-// sfen can be packed to 256bit (32bytes) by Huffman coding.
-// This is proven by mini. The above is Huffman coding.
-//
-// Internal format = 1-bit turn + 7-bit king position *2 + piece on board (Huffman coding) + hand piece (Huffman coding)
-// Side to move (White = 0, Black = 1) (1bit)
-// White King Position (6 bits)
-// Black King Position (6 bits)
-// Huffman Encoding of the board
-// Castling availability (1 bit x 4)
-// En passant square (1 or 1 + 6 bits)
-// Rule 50 (6 bits)
-// Game play (8 bits)
-//
-// TODO(someone): Rename SFEN to FEN.
-//
-struct SfenPacker
-{
-  // Pack sfen and store in data[32].
-  void pack(const Position& pos)
-  {
-// cout << pos;
-
-    memset(data, 0, 32 /* 256bit */);
-    stream.set_data(data);
-
-    // turn
-    // Side to move.
-    stream.write_one_bit((int)(pos.side_to_move()));
-
-    // 7-bit positions for leading and trailing balls
-    // White king and black king, 6 bits for each.
-    for(auto c: Colors)
-      stream.write_n_bit(pos.king_square(c), 6);
-
-    // Write the pieces on the board other than the kings.
-    for (Rank r = RANK_8; r >= RANK_1; --r)
-    {
-      for (File f = FILE_A; f <= FILE_H; ++f)
-      {
-        Piece pc = pos.piece_on(make_square(f, r));
-        if (type_of(pc) == KING)
-          continue;
-        write_board_piece_to_stream(pc);
-      }
-    }
-
-    // TODO(someone): Support chess960.
-    stream.write_one_bit(pos.can_castle(WHITE_OO));
-    stream.write_one_bit(pos.can_castle(WHITE_OOO));
-    stream.write_one_bit(pos.can_castle(BLACK_OO));
-    stream.write_one_bit(pos.can_castle(BLACK_OOO));
-
-    if (pos.ep_square() == SQ_NONE) {
-      stream.write_one_bit(0);
-    }
-    else {
-      stream.write_one_bit(1);
-      stream.write_n_bit(static_cast<int>(pos.ep_square()), 6);
-    }
-
-    stream.write_n_bit(pos.state()->rule50, 6);
-
-    stream.write_n_bit(1 + (pos.game_ply()-(pos.side_to_move() == BLACK)) / 2, 8);
-
-    assert(stream.get_cursor() <= 256);
-  }
-
-  // sfen packed by pack() (256bit = 32bytes)
-  // Or sfen to decode with unpack()
-  uint8_t *data; // uint8_t[32];
-
-//private:
-  // Position::set_from_packed_sfen(uint8_t data[32]) I want to use these functions, so the line is bad, but I want to keep it public.
-
-  BitStream stream;
-
-  // Output the board pieces to stream.
-  void write_board_piece_to_stream(Piece pc)
-  {
-    // piece type
-    PieceType pr = type_of(pc);
-    auto c = huffman_table[pr];
-    stream.write_n_bit(c.code, c.bits);
- 
-    if (pc == NO_PIECE)
-      return;
-
-    // first and second flag
-    stream.write_one_bit(color_of(pc));
-  }
-
-  // Read one board piece from stream
-  Piece read_board_piece_from_stream()
-  {
-    PieceType pr = NO_PIECE_TYPE;
-    int code = 0, bits = 0;
-    while (true)
-    {
-      code |= stream.read_one_bit() << bits;
-      ++bits;
-
-      assert(bits <= 6);
-
-      for (pr = NO_PIECE_TYPE; pr <KING; ++pr)
-        if (huffman_table[pr].code == code
-          && huffman_table[pr].bits == bits)
-          goto Found;
-    }
-  Found:;
-    if (pr == NO_PIECE_TYPE)
-      return NO_PIECE;
-
-    // first and second flag
-    Color c = (Color)stream.read_one_bit();
-    
-    return make_piece(c, pr);
-  }
-};
-
-
-// -----------------------------------
-// Add to Position class
-// -----------------------------------
-
-// Add a function that directly unpacks for speed. It's pretty tough.
-// Write it by combining packer::unpack() and Position::set().
-// If there is a problem with the passed phase and there is an error, non-zero is returned.
-int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thread* th, bool mirror)
-{
-	SfenPacker packer;
-	auto& stream = packer.stream;
-	stream.set_data((uint8_t*)&sfen);
-
-	std::memset(this, 0, sizeof(Position));
-	std::memset(si, 0, sizeof(StateInfo));
-  std::fill_n(&pieceList[0][0], sizeof(pieceList) / sizeof(Square), SQ_NONE);
-  st = si;
-
-	// Active color
-	sideToMove = (Color)stream.read_one_bit();
-
-  pieceList[W_KING][0] = SQUARE_NB;
-  pieceList[B_KING][0] = SQUARE_NB;
-
-	// First the position of the ball
-	if (mirror)
-	{
-		for (auto c : Colors)
-			board[flip_file((Square)stream.read_n_bit(6))] = make_piece(c, KING);
-	}
-	else
-	{
-		for (auto c : Colors)
-			board[stream.read_n_bit(6)] = make_piece(c, KING);
-	}
-
-  // Piece placement
-  for (Rank r = RANK_8; r >= RANK_1; --r)
-  {
-    for (File f = FILE_A; f <= FILE_H; ++f)
-    {
-      auto sq = make_square(f, r);
-      if (mirror) {
-        sq = flip_file(sq);
-      }
-
-      // it seems there are already balls
-      Piece pc;
-      if (type_of(board[sq]) != KING)
-      {
-        assert(board[sq] == NO_PIECE);
-        pc = packer.read_board_piece_from_stream();
-      }
-      else
-      {
-        pc = board[sq];
-        board[sq] = NO_PIECE; // put_piece() will catch ASSERT unless you remove it all.
-      }
-
-      // There may be no pieces, so skip in that case.
-      if (pc == NO_PIECE)
-        continue;
-
-      put_piece(Piece(pc), sq);
-
-      //cout << sq << ' ' << board[sq] << ' ' << stream.get_cursor() << endl;
-
-      if (stream.get_cursor()> 256)
-        return 1;
-      //assert(stream.get_cursor() <= 256);
-
-    }
-  }
-
-  // Castling availability.
-  // TODO(someone): Support chess960.
-  st->castlingRights = 0;
-  if (stream.read_one_bit()) {
-    Square rsq;
-    for (rsq = relative_square(WHITE, SQ_H1); piece_on(rsq) != W_ROOK; --rsq) {}
-    set_castling_right(WHITE, rsq);
-  }
-  if (stream.read_one_bit()) {
-    Square rsq;
-    for (rsq = relative_square(WHITE, SQ_A1); piece_on(rsq) != W_ROOK; ++rsq) {}
-    set_castling_right(WHITE, rsq);
-  }
-  if (stream.read_one_bit()) {
-    Square rsq;
-    for (rsq = relative_square(BLACK, SQ_H1); piece_on(rsq) != B_ROOK; --rsq) {}
-    set_castling_right(BLACK, rsq);
-  }
-  if (stream.read_one_bit()) {
-    Square rsq;
-    for (rsq = relative_square(BLACK, SQ_A1); piece_on(rsq) != B_ROOK; ++rsq) {}
-    set_castling_right(BLACK, rsq);
-  }
-
-  // En passant square. Ignore if no pawn capture is possible
-  if (stream.read_one_bit()) {
-    Square ep_square = static_cast<Square>(stream.read_n_bit(6));
-    if (mirror) {
-      ep_square = flip_file(ep_square);
-    }
-    st->epSquare = ep_square;
-
-    if (!(attackers_to(st->epSquare) & pieces(sideToMove, PAWN))
-      || !(pieces(~sideToMove, PAWN) & (st->epSquare + pawn_push(~sideToMove))))
-      st->epSquare = SQ_NONE;
-  }
-  else {
-    st->epSquare = SQ_NONE;
-  }
-
-  // Halfmove clock
-  st->rule50 = static_cast<Square>(stream.read_n_bit(6));
-
-  // Fullmove number
-  gamePly = static_cast<Square>(stream.read_n_bit(8));
-  // Convert from fullmove starting from 1 to gamePly starting from 0,
-  // handle also common incorrect FEN with fullmove = 0.
-  gamePly = std::max(2 * (gamePly - 1), 0) + (sideToMove == BLACK);
-
-  assert(stream.get_cursor() <= 256);
-
-  chess960 = false;
-  thisThread = th;
-set_state(st);
-
-  //std::cout << *this << std::endl;
-
-  assert(pos_is_ok());
-
-	return 0;
-}
-
-// Give the board, hand piece, and turn, and return the sfen.
-//std::string Position::sfen_from_rawdata(Piece board[81], Hand hands[2], Color turn, int gamePly_)
-//{
-// // Copy it to an internal structure and call sfen() if the conversion process depends only on it
-// // Maybe it will be converted normally...
-//  Position pos;
-//
-//  memcpy(pos.board, board, sizeof(Piece) * 81);
-//  memcpy(pos.hand, hands, sizeof(Hand) * 2);
-//  pos.sideToMove = turn;
-//  pos.gamePly = gamePly_;
-//
-//  return pos.sfen();
-//
-// // Implementation of ↑ is beautiful, but slow.
-// // This is a bottleneck when learning a large amount of game records, so write a function to unpack directly.
-//}
-
-// Get the packed sfen. Returns to the buffer specified in the argument.
-void Position::sfen_pack(PackedSfen& sfen)
-{
-  SfenPacker sp;
-  sp.data = (uint8_t*)&sfen;
-  sp.pack(*this);
-}
-
-//// Unpack the packed sfen. Returns an sfen string.
-//std::string Position::sfen_unpack(const PackedSfen& sfen)
-//{
-// SfenPacker sp;
-// sp.data = (uint8_t*)&sfen;
-// return sp.unpack();
-//}
-
-
-#endif // USE_SFEN_PACKER
@@ -0,0 +1,140 @@
+#ifndef _STOCKFISH_BLAS_H_
+#define _STOCKFISH_BLAS_H_
+
+struct ThreadPool;
+
+// SF_BLAS_RESTRICT expands to the compiler-specific spelling of the
+// no-alias pointer qualifier used on the pointer parameters declared below.
+// NOTE(review): the Intel branch uses the bare `restrict` keyword, which may
+// need a compiler option to be accepted in C++ mode - confirm.
+#if defined (_MSC_VER)
+#define SF_BLAS_RESTRICT __restrict
+#elif defined (__INTEL_COMPILER)
+#define SF_BLAS_RESTRICT restrict
+#elif defined (__clang__)
+#define SF_BLAS_RESTRICT __restrict__
+#elif defined (__GNUC__)
+#define SF_BLAS_RESTRICT __restrict__
+#else
+// Unrecognised compiler: expand to nothing so the declarations below still
+// parse. The qualifier is only an optimisation hint, so dropping it is safe.
+#define SF_BLAS_RESTRICT
+#endif
+
+// Declarations of a small single-precision BLAS-style interface.
+// Only prototypes live here; the definitions are not part of this header.
+// Most routines come in up to four overloads: with or without per-vector
+// strides (incX / incY), and with or without a ThreadPool& first argument.
+// Every pointer parameter is qualified with SF_BLAS_RESTRICT.
+namespace Blas {
+
+    // Storage order of a matrix argument. The enumerator values are the same
+    // numbers CBLAS uses for CblasRowMajor (101) and CblasColMajor (102).
+    enum struct MatrixLayout {
+        RowMajor = 101,
+        ColMajor = 102
+    };
+
+    // Whether a matrix operand is used as-is or transposed. The enumerator
+    // values are the same numbers CBLAS uses for CblasNoTrans (111) and
+    // CblasTrans (112).
+    enum struct MatrixTranspose {
+        NoTrans = 111,
+        Trans = 112
+    };
+
+    // scopy: X is a read-only input vector, Y a writable output vector.
+    // Presumably the BLAS SCOPY operation (Y := X over N elements) - the
+    // definition is not visible here; confirm against the implementation.
+    void scopy(
+        const int N,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    );
+
+    // scopy with explicit increments for X and Y (presumably element strides,
+    // as in BLAS - confirm).
+    void scopy(
+        const int N,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    );
+
+    // scopy overload that additionally receives a thread pool.
+    void scopy(
+        ThreadPool& thread_pool,
+        const int N,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    );
+
+    // scopy overload with a thread pool and explicit increments.
+    void scopy(
+        ThreadPool& thread_pool,
+        const int N,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    );
+
+    // sscal: takes a scalar alpha and a writable vector X.
+    // Presumably the BLAS SSCAL operation (X := alpha * X over N elements) -
+    // confirm against the implementation.
+    void sscal(
+        const int N,
+        const float alpha,
+        float * SF_BLAS_RESTRICT X
+    );
+
+    // sscal with an explicit increment for X.
+    void sscal(
+        const int N,
+        const float alpha,
+        float * SF_BLAS_RESTRICT X, const int incX
+    );
+
+    // sscal overload that additionally receives a thread pool.
+    void sscal(
+        ThreadPool& thread_pool,
+        const int N,
+        const float alpha,
+        float * SF_BLAS_RESTRICT X
+    );
+
+    // sscal overload with a thread pool and an explicit increment.
+    void sscal(
+        ThreadPool& thread_pool,
+        const int N,
+        const float alpha,
+        float * SF_BLAS_RESTRICT X, const int incX
+    );
+
+    // saxpy: takes a scalar alpha, a read-only vector X and a writable
+    // vector Y. Presumably the BLAS SAXPY operation (Y := alpha * X + Y over
+    // N elements) - confirm against the implementation.
+    void saxpy(
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    );
+
+    // saxpy with explicit increments for X and Y.
+    void saxpy(
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    );
+
+    // saxpy overload that additionally receives a thread pool.
+    void saxpy(
+        ThreadPool& thread_pool,
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    );
+
+    // saxpy overload with a thread pool and explicit increments.
+    void saxpy(
+        ThreadPool& thread_pool,
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    );
+
+    // sgemm with a thread pool: A and B are read-only matrices, C is the
+    // writable result. Presumably the BLAS SGEMM operation
+    // (C := alpha * op(A) * op(B) + beta * C, with M/N/K the matrix
+    // dimensions and lda/ldb/ldc the leading dimensions) - confirm against
+    // the implementation.
+    void sgemm(
+        ThreadPool& thread_pool,
+        MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    );
+
+    // sgemm overload with the same parameters but no thread pool.
+    void sgemm(
+        MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    );
+
+    // Presumably a self-test of the routines above that runs on the given
+    // thread pool - behaviour is not visible here; confirm.
+    void test(
+        ThreadPool& thread_pool
+    );
+
+    // Presumably a benchmark of the routines above that runs on the given
+    // thread pool - behaviour is not visible here; confirm.
+    void bench(
+        ThreadPool& thread_pool
+    );
+}
+
+#endif
@@ -0,0 +1,26 @@
+The file "incbin.h" is free and unencumbered software released into
+the public domain by Dale Weiler, see:
+   <https://github.com/graphitemaster/incbin>
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org/>
@@ -0,0 +1,368 @@
+/**
+ * @file incbin.h
+ * @author Dale Weiler
+ * @brief Utility for including binary files
+ *
+ * Facilities for including binary files into the current translation unit and
+ * making use of them externally in other translation units.
+ */
+#ifndef INCBIN_HDR
+#define INCBIN_HDR
+#include <limits.h>
+/*
+ * Select INCBIN_ALIGNMENT_INDEX, the base-2 logarithm of the alignment in
+ * bytes: 6 when any of the listed AVX-512 feature macros is defined, 5 for
+ * AVX/AVX2, 4 for the SSE-family macros or __neon__, otherwise 3 when
+ * ULONG_MAX is not 0xffffffff and 2 when it is.
+ */
+#if   defined(__AVX512BW__) || \
+      defined(__AVX512CD__) || \
+      defined(__AVX512DQ__) || \
+      defined(__AVX512ER__) || \
+      defined(__AVX512PF__) || \
+      defined(__AVX512VL__) || \
+      defined(__AVX512F__)
+# define INCBIN_ALIGNMENT_INDEX 6
+#elif defined(__AVX__)      || \
+      defined(__AVX2__)
+# define INCBIN_ALIGNMENT_INDEX 5
+#elif defined(__SSE__)      || \
+      defined(__SSE2__)     || \
+      defined(__SSE3__)     || \
+      defined(__SSSE3__)    || \
+      defined(__SSE4_1__)   || \
+      defined(__SSE4_2__)   || \
+      defined(__neon__)
+# define INCBIN_ALIGNMENT_INDEX 4
+#elif ULONG_MAX != 0xffffffffu
+# define INCBIN_ALIGNMENT_INDEX 3
+# else
+# define INCBIN_ALIGNMENT_INDEX 2
+#endif
+
+/* Lookup table of (1 << n) where `n' is `INCBIN_ALIGNMENT_INDEX' */
+#define INCBIN_ALIGN_SHIFT_0 1
+#define INCBIN_ALIGN_SHIFT_1 2
+#define INCBIN_ALIGN_SHIFT_2 4
+#define INCBIN_ALIGN_SHIFT_3 8
+#define INCBIN_ALIGN_SHIFT_4 16
+#define INCBIN_ALIGN_SHIFT_5 32
+#define INCBIN_ALIGN_SHIFT_6 64
+
+/*
+ * Actual alignment value in bytes: pastes `INCBIN_ALIGN_SHIFT', `_' and the
+ * index together to name an entry of the table above (index 4 yields
+ * INCBIN_ALIGN_SHIFT_4, i.e. 16).
+ */
+#define INCBIN_ALIGNMENT \
+    INCBIN_CONCATENATE( \
+        INCBIN_CONCATENATE(INCBIN_ALIGN_SHIFT, _), \
+        INCBIN_ALIGNMENT_INDEX)
+
+/*
+ * Stringize: INCBIN_STRINGIZE forwards to INCBIN_STR so that a macro passed
+ * as X is expanded before `#' turns it into a string literal.
+ */
+#define INCBIN_STR(X) \
+    #X
+#define INCBIN_STRINGIZE(X) \
+    INCBIN_STR(X)
+/*
+ * Concatenate: INCBIN_CONCATENATE forwards to INCBIN_CAT so that macro
+ * arguments are expanded before `##' pastes the two tokens together.
+ */
+#define INCBIN_CAT(X, Y) \
+    X ## Y
+#define INCBIN_CONCATENATE(X, Y) \
+    INCBIN_CAT(X, Y)
+/*
+ * Deferred macro expansion: INCBIN_INVOKE(N, args...) expands to N(args...)
+ * wrapped in INCBIN_EVAL, so the arguments are expanded before N is invoked.
+ */
+#define INCBIN_EVAL(X) \
+    X
+#define INCBIN_INVOKE(N, ...) \
+    INCBIN_EVAL(N(__VA_ARGS__))
+
+/* Green Hills uses a different directive for including binary data */
+#if defined(__ghs__)
+#  if (__ghs_asm == 2)
+#    define INCBIN_MACRO ".file"
+/* Or consider the ".myrawdata" entry in the ld file */
+#  else
+#    define INCBIN_MACRO "\tINCBIN"
+#  endif
+#else
+#  define INCBIN_MACRO ".incbin"
+#endif
+
+/*
+ * INCBIN_ALIGN is the alignment specifier placed on the declarations made by
+ * INCBIN_EXTERN: __declspec(align(...)) under MSVC, __attribute__((aligned(...)))
+ * everywhere else.
+ */
+#ifndef _MSC_VER
+#  define INCBIN_ALIGN \
+    __attribute__((aligned(INCBIN_ALIGNMENT)))
+#else
+#  define INCBIN_ALIGN __declspec(align(INCBIN_ALIGNMENT))
+#endif
+
+/* INCBIN_ARM is defined when any of the ARM compiler macros below is set */
+#if defined(__arm__) || /* GNU C and RealView */ \
+    defined(__arm) || /* Diab */ \
+    defined(_ARM) /* ImageCraft */
+#  define INCBIN_ARM
+#endif
+
+/*
+ * Assembler alignment directives used inside the INCBIN assembly block:
+ * INCBIN_ALIGN_HOST aligns to INCBIN_ALIGNMENT bytes, INCBIN_ALIGN_BYTE
+ * requests byte alignment.
+ */
+#ifdef __GNUC__
+/* Utilize .balign where supported */
+#  define INCBIN_ALIGN_HOST ".balign " INCBIN_STRINGIZE(INCBIN_ALIGNMENT) "\n"
+#  define INCBIN_ALIGN_BYTE ".balign 1\n"
+#elif defined(INCBIN_ARM)
+/*
+ * On arm assemblers, the alignment value is calculated as (1 << n) where `n' is
+ * the shift count. This is the value passed to `.align'
+ */
+#  define INCBIN_ALIGN_HOST ".align " INCBIN_STRINGIZE(INCBIN_ALIGNMENT_INDEX) "\n"
+#  define INCBIN_ALIGN_BYTE ".align 0\n"
+#else
+/* We assume other inline assemblers treat `.align' as `.balign' */
+#  define INCBIN_ALIGN_HOST ".align " INCBIN_STRINGIZE(INCBIN_ALIGNMENT) "\n"
+#  define INCBIN_ALIGN_BYTE ".align 1\n"
+#endif
+
+/*
+ * INCBIN_EXTERNAL is the linkage specifier used by the INCBIN_EXTERN
+ * declarations. In C++ it is `extern "C"', which gives the declared names C
+ * linkage (no C++ name mangling).
+ * INCBIN_CONST is used by incbin.c generated files.
+ */
+#if defined(__cplusplus)
+#  define INCBIN_EXTERNAL extern "C"
+#  define INCBIN_CONST    extern const
+#else
+#  define INCBIN_EXTERNAL extern
+#  define INCBIN_CONST    const
+#endif
+
+/**
+ * @brief Optionally override the linker section into which data is emitted.
+ *
+ * @warning If you use this facility, you'll have to deal with platform-specific linker output
+ * section naming on your own
+ *
+ * Overriding the default linker output section, e.g for esp8266/Arduino:
+ * @code
+ * #define INCBIN_OUTPUT_SECTION ".irom.text"
+ * #include "incbin.h"
+ * INCBIN(Foo, "foo.txt");
+ * // Data is emitted into program memory that never gets copied to RAM
+ * @endcode
+ */
+#if !defined(INCBIN_OUTPUT_SECTION)
+#  if defined(__APPLE__)
+#    define INCBIN_OUTPUT_SECTION         ".const_data"
+#  else
+#    define INCBIN_OUTPUT_SECTION         ".rodata"
+#  endif
+#endif
+
+/*
+ * Assembler text fragments used to build the INCBIN assembly block:
+ *   INCBIN_SECTION      - switches to the output section
+ *   INCBIN_GLOBAL(NAME) - exports the prefixed symbol NAME
+ *   INCBIN_INT          - directive that emits the size value
+ *   INCBIN_MANGLE       - prefix prepended to symbol labels
+ *   INCBIN_BYTE         - directive that emits a single byte
+ *   INCBIN_TYPE(NAME)   - marks the symbol as an object; expands to nothing
+ *                         on Apple and MinGW targets
+ */
+#if defined(__APPLE__)
+/* The directives are different for Apple branded compilers */
+#  define INCBIN_SECTION         INCBIN_OUTPUT_SECTION "\n"
+#  define INCBIN_GLOBAL(NAME)    ".globl " INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME "\n"
+#  define INCBIN_INT             ".long "
+#  define INCBIN_MANGLE          "_"
+#  define INCBIN_BYTE            ".byte "
+#  define INCBIN_TYPE(...)
+#else
+#  define INCBIN_SECTION         ".section " INCBIN_OUTPUT_SECTION "\n"
+#  define INCBIN_GLOBAL(NAME)    ".global " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME "\n"
+#  if defined(__ghs__)
+#    define INCBIN_INT           ".word "
+#  else
+#    define INCBIN_INT           ".int "
+#  endif
+/* Use the compiler-provided label prefix when it is available */
+#  if defined(__USER_LABEL_PREFIX__)
+#    define INCBIN_MANGLE        INCBIN_STRINGIZE(__USER_LABEL_PREFIX__)
+#  else
+#    define INCBIN_MANGLE        ""
+#  endif
+#  if defined(INCBIN_ARM)
+/* On arm assemblers, `@' is used as a line comment token */
+#    define INCBIN_TYPE(NAME)    ".type " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME ", %object\n"
+#  elif defined(__MINGW32__) || defined(__MINGW64__)
+/* Mingw doesn't support this directive either */
+#    define INCBIN_TYPE(NAME)
+#  else
+/* It's safe to use `@' on other architectures */
+#    define INCBIN_TYPE(NAME)    ".type " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME ", @object\n"
+#  endif
+#  define INCBIN_BYTE            ".byte "
+#endif
+
+/* List of style types used for symbol names */
+#define INCBIN_STYLE_CAMEL 0
+#define INCBIN_STYLE_SNAKE 1
+
+/**
+ * @brief Specify the prefix to use for symbol names.
+ *
+ * By default this is `g', producing symbols of the form:
+ * @code
+ * #include "incbin.h"
+ * INCBIN(Foo, "foo.txt");
+ *
+ * // Now you have the following symbols:
+ * // const unsigned char gFooData[];
+ * // const unsigned char *const gFooEnd;
+ * // const unsigned int gFooSize;
+ * @endcode
+ *
+ * If however you specify a prefix before including: e.g:
+ * @code
+ * #define INCBIN_PREFIX incbin
+ * #include "incbin.h"
+ * INCBIN(Foo, "foo.txt");
+ *
+ * // Now you have the following symbols instead:
+ * // const unsigned char incbinFooData[];
+ * // const unsigned char *const incbinFooEnd;
+ * // const unsigned int incbinFooSize;
+ * @endcode
+ */
+#if !defined(INCBIN_PREFIX)
+#  define INCBIN_PREFIX g
+#endif
+
+/**
+ * @brief Specify the style used for symbol names.
+ *
+ * Possible options are
+ * - INCBIN_STYLE_CAMEL "CamelCase"
+ * - INCBIN_STYLE_SNAKE "snake_case"
+ *
+ * Default option is *INCBIN_STYLE_CAMEL* producing symbols of the form:
+ * @code
+ * #include "incbin.h"
+ * INCBIN(Foo, "foo.txt");
+ *
+ * // Now you have the following symbols:
+ * // const unsigned char <prefix>FooData[];
+ * // const unsigned char *const <prefix>FooEnd;
+ * // const unsigned int <prefix>FooSize;
+ * @endcode
+ *
+ * If however you specify a style before including: e.g:
+ * @code
+ * #define INCBIN_STYLE INCBIN_STYLE_SNAKE
+ * #include "incbin.h"
+ * INCBIN(foo, "foo.txt");
+ *
+ * // Now you have the following symbols:
+ * // const unsigned char <prefix>foo_data[];
+ * // const unsigned char *const <prefix>foo_end;
+ * // const unsigned int <prefix>foo_size;
+ * @endcode
+ */
+#if !defined(INCBIN_STYLE)
+#  define INCBIN_STYLE INCBIN_STYLE_CAMEL
+#endif
+
+/*
+ * Style lookup tables: the suffix appended to a symbol name, keyed by the
+ * numeric INCBIN_STYLE (0 = INCBIN_STYLE_CAMEL, 1 = INCBIN_STYLE_SNAKE) and
+ * by the symbol kind (DATA, END or SIZE).
+ */
+#define INCBIN_STYLE_0_DATA Data
+#define INCBIN_STYLE_0_END End
+#define INCBIN_STYLE_0_SIZE Size
+#define INCBIN_STYLE_1_DATA _data
+#define INCBIN_STYLE_1_END _end
+#define INCBIN_STYLE_1_SIZE _size
+
+/*
+ * Style lookup: returning identifier. Builds INCBIN_STYLE_<style>_<TYPE> by
+ * token pasting, which then expands to the table entry above (for example
+ * INCBIN_STYLE_IDENT(DATA) is `Data' under the camel style).
+ */
+#define INCBIN_STYLE_IDENT(TYPE) \
+    INCBIN_CONCATENATE( \
+        INCBIN_STYLE_, \
+        INCBIN_CONCATENATE( \
+            INCBIN_EVAL(INCBIN_STYLE), \
+            INCBIN_CONCATENATE(_, TYPE)))
+
+/* Style lookup: returning string literal (the identifier above, stringized) */
+#define INCBIN_STYLE_STRING(TYPE) \
+    INCBIN_STRINGIZE( \
+        INCBIN_STYLE_IDENT(TYPE)) \
+
+/* Generate the global labels by indirectly invoking the macro with our style
+ * type and concatenating the name against them. The result is the
+ * INCBIN_GLOBAL text followed by the INCBIN_TYPE text for NAME with the
+ * style suffix of TYPE appended. */
+#define INCBIN_GLOBAL_LABELS(NAME, TYPE) \
+    INCBIN_INVOKE( \
+        INCBIN_GLOBAL, \
+        INCBIN_CONCATENATE( \
+            NAME, \
+            INCBIN_INVOKE( \
+                INCBIN_STYLE_IDENT, \
+                TYPE))) \
+    INCBIN_INVOKE( \
+        INCBIN_TYPE, \
+        INCBIN_CONCATENATE( \
+            NAME, \
+            INCBIN_INVOKE( \
+                INCBIN_STYLE_IDENT, \
+                TYPE)))
+
+/**
+ * @brief Externally reference binary data included in another translation unit.
+ *
+ * Produces three external symbols that reference the binary data included in
+ * another translation unit.
+ *
+ * The symbol names are a concatenation of `INCBIN_PREFIX' before *NAME*; with
+ * "Data", as well as "End" and "Size" after. An example is provided below.
+ *
+ * @param NAME The name given for the binary data
+ *
+ * @code
+ * INCBIN_EXTERN(Foo);
+ *
+ * // Now you have the following symbols:
+ * // extern const unsigned char <prefix>FooData[];
+ * // extern const unsigned char *const <prefix>FooEnd;
+ * // extern const unsigned int <prefix>FooSize;
+ * @endcode
+ */
+#define INCBIN_EXTERN(NAME) \
+    INCBIN_EXTERNAL const INCBIN_ALIGN unsigned char \
+        INCBIN_CONCATENATE( \
+            INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
+            INCBIN_STYLE_IDENT(DATA))[]; \
+    INCBIN_EXTERNAL const INCBIN_ALIGN unsigned char *const \
+    INCBIN_CONCATENATE( \
+        INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
+        INCBIN_STYLE_IDENT(END)); \
+    INCBIN_EXTERNAL const unsigned int \
+        INCBIN_CONCATENATE( \
+            INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
+            INCBIN_STYLE_IDENT(SIZE))
+
+/**
+ * @brief Include a binary file into the current translation unit.
+ *
+ * Includes a binary file into the current translation unit, producing three symbols
+ * for objects that encode the data and size respectively.
+ *
+ * The symbol names are a concatenation of `INCBIN_PREFIX' before *NAME*; with
+ * "Data", as well as "End" and "Size" after. An example is provided below.
+ *
+ * @param NAME The name to associate with this binary data (as an identifier.)
+ * @param FILENAME The file to include (as a string literal.)
+ *
+ * @code
+ * INCBIN(Icon, "icon.png");
+ *
+ * // Now you have the following symbols:
+ * // const unsigned char <prefix>IconData[];
+ * // const unsigned char *const <prefix>IconEnd;
+ * // const unsigned int <prefix>IconSize;
+ * @endcode
+ *
+ * @warning This must be used in global scope
+ * @warning The identifiers may be different if INCBIN_STYLE is not default
+ *
+ * To externally reference the data included by this in another translation unit
+ * please @see INCBIN_EXTERN.
+ */
+#ifdef _MSC_VER
+/* MSVC branch: no assembly is emitted and FILENAME is unused; the macro only
+ * declares the three symbols through INCBIN_EXTERN. */
+#define INCBIN(NAME, FILENAME) \
+    INCBIN_EXTERN(NAME)
+#else
+/* Other compilers: a file-scope __asm__ block switches to INCBIN_SECTION,
+ * emits the aligned data label followed by the contents of FILENAME, then the
+ * end label followed by a single byte, then the aligned size label holding
+ * (end label - data label), and finally switches back to .text. The symbols
+ * are then declared through INCBIN_EXTERN. */
+#define INCBIN(NAME, FILENAME) \
+    __asm__(INCBIN_SECTION \
+            INCBIN_GLOBAL_LABELS(NAME, DATA) \
+            INCBIN_ALIGN_HOST \
+            INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(DATA) ":\n" \
+            INCBIN_MACRO " \"" FILENAME "\"\n" \
+            INCBIN_GLOBAL_LABELS(NAME, END) \
+            INCBIN_ALIGN_BYTE \
+            INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(END) ":\n" \
+                INCBIN_BYTE "1\n" \
+            INCBIN_GLOBAL_LABELS(NAME, SIZE) \
+            INCBIN_ALIGN_HOST \
+            INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(SIZE) ":\n" \
+                INCBIN_INT INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(END) " - " \
+                           INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(DATA) "\n" \
+            INCBIN_ALIGN_HOST \
+            ".text\n" \
+    ); \
+    INCBIN_EXTERN(NAME)
+
+#endif
+#endif
@@ -0,0 +1,667 @@
+#ifndef LEARNER_AUTOGRAD_H
+#define LEARNER_AUTOGRAD_H
+
+#include <cmath>
+#include <utility>
+#include <type_traits>
+#include <memory>
+#include <tuple>
+#include <optional>
+#include <algorithm>
+#include <cstdint>
+
+namespace Learner
+{
+    // A plain aggregate holding a value and a gradient of the same type T.
+    // All arithmetic below is applied to both members independently.
+    template <typename T>
+    struct ValueWithGrad
+    {
+        T value;
+        T grad;
+
+        // Adds rhs member by member.
+        ValueWithGrad& operator+=(const ValueWithGrad& rhs)
+        {
+            this->value += rhs.value;
+            this->grad += rhs.grad;
+            return *this;
+        }
+
+        // Subtracts rhs member by member.
+        ValueWithGrad& operator-=(const ValueWithGrad& rhs)
+        {
+            this->value -= rhs.value;
+            this->grad -= rhs.grad;
+            return *this;
+        }
+
+        // Multiplies both members by the scalar rhs.
+        ValueWithGrad& operator*=(T rhs)
+        {
+            this->value *= rhs;
+            this->grad *= rhs;
+            return *this;
+        }
+
+        // Divides both members by the scalar rhs.
+        ValueWithGrad& operator/=(T rhs)
+        {
+            this->value /= rhs;
+            this->grad /= rhs;
+            return *this;
+        }
+
+        // Returns a copy in which both members are replaced by their
+        // absolute values.
+        [[nodiscard]] ValueWithGrad abs() const
+        {
+            ValueWithGrad magnitudes{ std::abs(value), std::abs(grad) };
+            return magnitudes;
+        }
+
+        // Returns a copy whose gradient is limited to [-max, max];
+        // the value is left untouched.
+        [[nodiscard]] ValueWithGrad clamp_grad(T max) const
+        {
+            ValueWithGrad limited{ value, std::clamp(grad, -max, max) };
+            return limited;
+        }
+    };
+}
+
+namespace Learner::Autograd::UnivariateStatic
+{
+
+    // Exposes T unchanged as the nested name `type`.
+    template <typename T>
+    struct Identity
+    {
+        typedef T type;
+    };
+
+    // Shorthand for Identity<T>::type, i.e. T itself. Because T only appears
+    // inside a nested name here, a function parameter declared as Id<T>
+    // does not take part in template argument deduction.
+    template <typename T>
+    using Id = typename Identity<T>::type;
+
+    // Chooses how an operand of type T is held:
+    //   - T is an rvalue reference -> the referred-to type, stored by value;
+    //   - anything else            -> a const lvalue reference to it.
+    template <typename T>
+    using StoreValueOrRef = typename std::conditional<
+            std::is_rvalue_reference<T>::value,
+            typename std::remove_reference<T>::type,
+            const typename std::remove_reference<T>::type&
+        >::type;
+
+    namespace Detail
+    {
+        using CallIdType = std::uint32_t;
+
+        // Distinct type wrapping an integer id, so that it can be looked up
+        // by type inside an argument tuple.
+        struct CallId
+        {
+            CallIdType call_id{};
+
+            constexpr CallId() :
+                CallId(CallIdType{ 0 })
+            {
+            }
+
+            constexpr CallId(CallIdType id) :
+                call_id(id)
+            {
+            }
+
+            [[nodiscard]] bool operator==(CallId rhs) const noexcept
+            {
+                return call_id == rhs.call_id;
+            }
+
+            [[nodiscard]] bool operator!=(CallId rhs) const noexcept
+            {
+                return !(call_id == rhs.call_id);
+            }
+        };
+
+        // Hands out consecutive ids, starting at 0, from a per-thread counter.
+        [[nodiscard]] inline CallId next_call_id()
+        {
+            thread_local CallIdType next_free_id = 0;
+            const CallIdType issued = next_free_id;
+            ++next_free_id;
+            return CallId{ issued };
+        }
+
+        // TupleContains<T, std::tuple<Us...>>::value is true when T is
+        // exactly one of Us...
+        template <typename T, typename Tuple>
+        struct TupleContains;
+
+        template <typename T, typename... Us>
+        struct TupleContains<T, std::tuple<Us...>> :
+            std::bool_constant<(std::is_same_v<T, Us> || ...)> {};
+
+        template <typename T, typename Tuple>
+        constexpr bool TupleContainsV = TupleContains<T, Tuple>::value;
+
+        // True when every type in Ts (references stripped) reports
+        // is_constant; true for an empty pack.
+        template <typename... Ts>
+        constexpr bool AreAllConstantV = (... && std::remove_reference_t<Ts>::is_constant);
+    }
+
+    // Base class shared by all expression nodes. ChildT is the deriving node
+    // type; `this` is cast to it, and it has to provide
+    //   - calculate_value(args) and calculate_grad(args),
+    //   - a static `is_constant` flag.
+    // Results are cached per node in mutable members, tagged with the id of
+    // the evaluation that produced them.
+    // NOTE(review): the caches are written from const member functions
+    // without any synchronization visible here.
+    template <typename T, typename ChildT>
+    struct Evaluable
+    {
+        constexpr Evaluable() = default;
+
+        // We append a unique call id so that we can invalidate the cache when
+        // the next computation starts. A single evaluation should see
+        // the same call_id at every node.
+        template <typename... ArgsTs>
+        [[nodiscard]] auto eval(const std::tuple<ArgsTs...>& args) const
+        {
+            const auto call_id = Detail::next_call_id();
+            const auto new_args = std::tuple_cat(args, std::tuple(call_id));
+            return ValueWithGrad<T>{ value(new_args), grad(new_args) };
+        }
+
+        // Overload chosen when args already carries a Detail::CallId.
+        // Recomputes through ChildT::calculate_value only when nothing is
+        // cached yet or the cached entry belongs to a different call id.
+        template <typename... ArgsTs,
+            typename SFINAE = std::enable_if_t<Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
+        [[nodiscard]] auto value(const std::tuple<ArgsTs...>& args) const
+        {
+            const ChildT* this_ = static_cast<const ChildT*>(this);
+
+            const auto call_id = std::get<Detail::CallId>(args);
+            if (!value_cache.has_value() || value_cache_call_id != call_id)
+            {
+                value_cache_call_id = call_id;
+                value_cache = this_->calculate_value(args);
+            }
+
+            return *value_cache;
+        }
+
+        // Overload chosen when args has no Detail::CallId: a fresh id is
+        // appended and the call is forwarded to the overload above.
+        // The trailing `...` presumably exists only to give this overload a
+        // signature distinct from the one above.
+        template <typename... ArgsTs,
+            typename SFINAE = std::enable_if_t<!Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
+        [[nodiscard]] auto value(const std::tuple<ArgsTs...>& args, ...) const
+        {
+            const auto call_id = Detail::next_call_id();
+            const auto new_args = std::tuple_cat(args, std::tuple(call_id));
+            return value(new_args);
+        }
+
+        // Overload chosen when args already carries a Detail::CallId.
+        // Nodes flagged is_constant return T(0.0) directly, without calling
+        // calculate_grad or touching the cache; other nodes use the same
+        // caching scheme as value().
+        template <typename... ArgsTs,
+            typename SFINAE = std::enable_if_t<Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
+        [[nodiscard]] auto grad(const std::tuple<ArgsTs...>& args) const
+        {
+            if constexpr (ChildT::is_constant)
+            {
+                return T(0.0);
+            }
+            else
+            {
+                const ChildT* this_ = static_cast<const ChildT*>(this);
+
+                const auto call_id = std::get<Detail::CallId>(args);
+                if (!grad_cache.has_value() || grad_cache_call_id != call_id)
+                {
+                    grad_cache_call_id = call_id;
+                    grad_cache = this_->calculate_grad(args);
+                }
+
+                return *grad_cache;
+            }
+        }
+
+        // Overload chosen when args has no Detail::CallId: appends a fresh id
+        // and forwards to the overload above.
+        template <typename... ArgsTs,
+            typename SFINAE = std::enable_if_t<!Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
+        [[nodiscard]] auto grad(const std::tuple<ArgsTs...>& args, ...) const
+        {
+            const auto call_id = Detail::next_call_id();
+            const auto new_args = std::tuple_cat(args, std::tuple(call_id));
+            return grad(new_args);
+        }
+
+    private:
+        // Last computed value / gradient and the call id each was computed for.
+        mutable std::optional<T> value_cache;
+        mutable std::optional<T> grad_cache;
+        mutable Detail::CallId value_cache_call_id{};
+        mutable Detail::CallId grad_cache_call_id{};
+    };
+
+    // Leaf node that reads element I of the argument tuple and reports a
+    // gradient of 1 for it.
+    template <typename T, int I>
+    struct VariableParameter : Evaluable<T, VariableParameter<T, I>>
+    {
+        using ValueType = T;
+
+        static constexpr bool is_constant = false;
+
+        constexpr VariableParameter() {}
+
+        // Element I of args, converted to T.
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
+        {
+            const T selected = std::get<I>(args);
+            return selected;
+        }
+
+        // Always 1.
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
+        {
+            return static_cast<T>(1.0);
+        }
+    };
+
+    // Leaf node that reads element I of the argument tuple but is flagged
+    // constant, so its gradient is 0.
+    template <typename T, int I>
+    struct ConstantParameter : Evaluable<T, ConstantParameter<T, I>>
+    {
+        using ValueType = T;
+
+        static constexpr bool is_constant = true;
+
+        constexpr ConstantParameter() {}
+
+        // Element I of args, converted to T.
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
+        {
+            const T selected = std::get<I>(args);
+            return selected;
+        }
+
+        // Always 0.
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
+        {
+            return static_cast<T>(0.0);
+        }
+    };
+
+    // Leaf node owning a fixed value; its gradient is 0.
+    template <typename T>
+    struct Constant : Evaluable<T, Constant<T>>
+    {
+        using ValueType = T;
+
+        static constexpr bool is_constant = true;
+
+        constexpr Constant(T x) :
+            m_x(std::move(x))
+        {
+        }
+
+        // The stored value; the argument tuple is ignored.
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>&) const
+        {
+            const T& stored = m_x;
+            return stored;
+        }
+
+        // Always 0.
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
+        {
+            return static_cast<T>(0.0);
+        }
+
+    private:
+        T m_x;
+    };
+
+    // Leaf node that refers to a value owned elsewhere instead of copying it.
+    // The referenced value is allowed to change from one execution to the
+    // next, but it is assumed to stay fixed while a single evaluation runs.
+    // Only a reference is kept, so the referenced object has to outlive
+    // this node.
+    template <typename T>
+    struct ConstantRef : Evaluable<T, ConstantRef<T>>
+    {
+        using ValueType = T;
+
+        static constexpr bool is_constant = true;
+
+        constexpr ConstantRef(const T& x) :
+            m_x(x)
+        {
+        }
+
+        // Current value of the referenced object; the argument tuple is ignored.
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>&) const
+        {
+            const T& referenced = m_x;
+            return referenced;
+        }
+
+        // Always 0.
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
+        {
+            return static_cast<T>(0.0);
+        }
+
+    private:
+        const T& m_x;
+    };
+
+    // Expression node for lhs + rhs.
+    // Operands handed over as rvalues are stored by value; otherwise a const
+    // reference is kept (see StoreValueOrRef), so such operands have to
+    // outlive this node.
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    struct Sum : Evaluable<T, Sum<LhsT, RhsT, T>>
+    {
+        using ValueType = T;
+
+        static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
+
+        constexpr Sum(LhsT&& lhs, RhsT&& rhs) :
+            m_lhs(std::forward<LhsT>(lhs)),
+            m_rhs(std::forward<RhsT>(rhs))
+        {
+        }
+
+        // f + g
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
+        {
+            const auto lhs_value = m_lhs.value(args);
+            const auto rhs_value = m_rhs.value(args);
+            return lhs_value + rhs_value;
+        }
+
+        // f' + g'
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        {
+            const auto lhs_grad = m_lhs.grad(args);
+            const auto rhs_grad = m_rhs.grad(args);
+            return lhs_grad + rhs_grad;
+        }
+
+    private:
+        StoreValueOrRef<LhsT> m_lhs;
+        StoreValueOrRef<RhsT> m_rhs;
+    };
+
+    // node + node: builds a Sum over both operands. The operand types are
+    // passed on as LhsT&& / RhsT&& so the node can tell rvalues from lvalues.
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator+(LhsT&& lhs, RhsT&& rhs)
+    {
+        using Node = Sum<LhsT&&, RhsT&&>;
+        return Node(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
+    }
+
+    // node + scalar: the scalar is wrapped in a Constant owned by the result.
+    template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator+(LhsT&& lhs, Id<T> rhs)
+    {
+        using Node = Sum<LhsT&&, Constant<T>&&>;
+        return Node(std::forward<LhsT>(lhs), Constant<T>(rhs));
+    }
+
+    // scalar + node: the scalar is wrapped in a Constant owned by the result.
+    template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator+(Id<T> lhs, RhsT&& rhs)
+    {
+        using Node = Sum<Constant<T>&&, RhsT&&>;
+        return Node(Constant<T>(lhs), std::forward<RhsT>(rhs));
+    }
+
+    // Expression node for lhs - rhs.
+    // Operands handed over as rvalues are stored by value; otherwise a const
+    // reference is kept (see StoreValueOrRef), so such operands have to
+    // outlive this node.
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    struct Difference : Evaluable<T, Difference<LhsT, RhsT, T>>
+    {
+        using ValueType = T;
+
+        static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
+
+        constexpr Difference(LhsT&& lhs, RhsT&& rhs) :
+            m_lhs(std::forward<LhsT>(lhs)),
+            m_rhs(std::forward<RhsT>(rhs))
+        {
+        }
+
+        // f - g
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
+        {
+            const auto lhs_value = m_lhs.value(args);
+            const auto rhs_value = m_rhs.value(args);
+            return lhs_value - rhs_value;
+        }
+
+        // f' - g'
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        {
+            const auto lhs_grad = m_lhs.grad(args);
+            const auto rhs_grad = m_rhs.grad(args);
+            return lhs_grad - rhs_grad;
+        }
+
+    private:
+        StoreValueOrRef<LhsT> m_lhs;
+        StoreValueOrRef<RhsT> m_rhs;
+    };
+
+    // node - node: builds a Difference over both operands. The operand types
+    // are passed on as LhsT&& / RhsT&& so the node can tell rvalues from lvalues.
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator-(LhsT&& lhs, RhsT&& rhs)
+    {
+        using Node = Difference<LhsT&&, RhsT&&>;
+        return Node(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
+    }
+
+    // node - scalar: the scalar is wrapped in a Constant owned by the result.
+    template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator-(LhsT&& lhs, Id<T> rhs)
+    {
+        using Node = Difference<LhsT&&, Constant<T>&&>;
+        return Node(std::forward<LhsT>(lhs), Constant<T>(rhs));
+    }
+
+    // scalar - node: the scalar is wrapped in a Constant owned by the result.
+    template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator-(Id<T> lhs, RhsT&& rhs)
+    {
+        using Node = Difference<Constant<T>&&, RhsT&&>;
+        return Node(Constant<T>(lhs), std::forward<RhsT>(rhs));
+    }
+
+    // Expression node for lhs * rhs; the gradient follows the product rule.
+    // Operands handed over as rvalues are stored by value; otherwise a const
+    // reference is kept (see StoreValueOrRef), so such operands have to
+    // outlive this node.
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    struct Product : Evaluable<T, Product<LhsT, RhsT, T>>
+    {
+        using ValueType = T;
+
+        static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
+
+        constexpr Product(LhsT&& lhs, RhsT&& rhs) :
+            m_lhs(std::forward<LhsT>(lhs)),
+            m_rhs(std::forward<RhsT>(rhs))
+        {
+        }
+
+        // f * g
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
+        {
+            const auto lhs_value = m_lhs.value(args);
+            const auto rhs_value = m_rhs.value(args);
+            return lhs_value * rhs_value;
+        }
+
+        // f' * g + f * g'
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        {
+            const auto lhs_grad = m_lhs.grad(args);
+            const auto rhs_value = m_rhs.value(args);
+            const auto lhs_value = m_lhs.value(args);
+            const auto rhs_grad = m_rhs.grad(args);
+            return lhs_grad * rhs_value + lhs_value * rhs_grad;
+        }
+
+    private:
+        StoreValueOrRef<LhsT> m_lhs;
+        StoreValueOrRef<RhsT> m_rhs;
+    };
+
+    // node * node: builds a Product over both operands. The operand types
+    // are passed on as LhsT&& / RhsT&& so the node can tell rvalues from lvalues.
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator*(LhsT&& lhs, RhsT&& rhs)
+    {
+        using Node = Product<LhsT&&, RhsT&&>;
+        return Node(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
+    }
+
+    // node * scalar: the scalar is wrapped in a Constant owned by the result.
+    template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator*(LhsT&& lhs, Id<T> rhs)
+    {
+        using Node = Product<LhsT&&, Constant<T>&&>;
+        return Node(std::forward<LhsT>(lhs), Constant<T>(rhs));
+    }
+
+    // scalar * node: the scalar is wrapped in a Constant owned by the result.
+    template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator*(Id<T> lhs, RhsT&& rhs)
+    {
+        using Node = Product<Constant<T>&&, RhsT&&>;
+        return Node(Constant<T>(lhs), std::forward<RhsT>(rhs));
+    }
+
+    // Expression node for lhs / rhs; the gradient follows the quotient rule.
+    // Operands handed over as rvalues are stored by value; otherwise a const
+    // reference is kept (see StoreValueOrRef), so such operands have to
+    // outlive this node.
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    struct Quotient : Evaluable<T, Quotient<LhsT, RhsT, T>>
+    {
+        using ValueType = T;
+
+        static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
+
+        constexpr Quotient(LhsT&& lhs, RhsT&& rhs) :
+            m_lhs(std::forward<LhsT>(lhs)),
+            m_rhs(std::forward<RhsT>(rhs))
+        {
+        }
+
+        // f / g
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
+        {
+            const auto dividend = m_lhs.value(args);
+            const auto divisor = m_rhs.value(args);
+            return dividend / divisor;
+        }
+
+        // (f' * g - f * g') / g^2
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        {
+            const auto divisor = m_rhs.value(args);
+            const auto numerator = m_lhs.grad(args) * divisor - m_lhs.value(args) * m_rhs.grad(args);
+            return numerator / (divisor * divisor);
+        }
+
+    private:
+        StoreValueOrRef<LhsT> m_lhs;
+        StoreValueOrRef<RhsT> m_rhs;
+    };
+
+    // node / node: builds a Quotient over both operands. The operand types
+    // are passed on as LhsT&& / RhsT&& so the node can tell rvalues from lvalues.
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator/(LhsT&& lhs, RhsT&& rhs)
+    {
+        using Node = Quotient<LhsT&&, RhsT&&>;
+        return Node(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
+    }
+
+    // node / scalar: the scalar is wrapped in a Constant owned by the result.
+    template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator/(LhsT&& lhs, Id<T> rhs)
+    {
+        using Node = Quotient<LhsT&&, Constant<T>&&>;
+        return Node(std::forward<LhsT>(lhs), Constant<T>(rhs));
+    }
+
+    // scalar / node: the scalar is wrapped in a Constant owned by the result.
+    template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator/(Id<T> lhs, RhsT&& rhs)
+    {
+        using Node = Quotient<Constant<T>&&, RhsT&&>;
+        return Node(Constant<T>(lhs), std::forward<RhsT>(rhs));
+    }
+
+    // Expression node for -x.
+    // An operand handed over as an rvalue is stored by value; otherwise a
+    // const reference is kept (see StoreValueOrRef), so such an operand has
+    // to outlive this node.
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    struct Negation : Evaluable<T, Negation<ArgT, T>>
+    {
+        using ValueType = T;
+
+        static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
+
+        constexpr explicit Negation(ArgT&& x) :
+            m_x(std::forward<ArgT>(x))
+        {
+        }
+
+        // -f
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
+        {
+            const auto operand_value = m_x.value(args);
+            return -operand_value;
+        }
+
+        // -f'
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        {
+            const auto operand_grad = m_x.grad(args);
+            return -operand_grad;
+        }
+
+    private:
+        StoreValueOrRef<ArgT> m_x;
+    };
+
+    // Unary minus: wraps the operand in a Negation node.
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    [[nodiscard]] constexpr auto operator-(ArgT&& x)
+    {
+        using Node = Negation<ArgT&&>;
+        return Node(std::forward<ArgT>(x));
+    }
+
+    // Expression node for the logistic function s(x) = 1 / (1 + exp(-x)).
+    // The gradient is x' * s(x) * (1 - s(x)).
+    // An operand handed over as an rvalue is stored by value; otherwise a
+    // const reference is kept (see StoreValueOrRef), so such an operand has
+    // to outlive this node.
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    struct Sigmoid : Evaluable<T, Sigmoid<ArgT, T>>
+    {
+        using ValueType = T;
+
+        static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
+
+        constexpr explicit Sigmoid(ArgT&& x) :
+            m_x(std::forward<ArgT>(x))
+        {
+        }
+
+        // s(f)
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
+        {
+            return value_(m_x.value(args));
+        }
+
+        // f' * s(f) * (1 - s(f))
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_x.grad(args) * grad_(m_x.value(args));
+        }
+
+    private:
+        StoreValueOrRef<ArgT> m_x;
+
+        [[nodiscard]] T value_(T x) const
+        {
+            return 1.0 / (1.0 + std::exp(-x));
+        }
+
+        [[nodiscard]] T grad_(T x) const
+        {
+            // Evaluate the sigmoid (and its std::exp) once instead of twice;
+            // the arithmetic performed on the result is unchanged.
+            const T s = value_(x);
+            return s * (1.0 - s);
+        }
+    };
+
+    // Wraps the operand in a Sigmoid node.
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    [[nodiscard]] constexpr auto sigmoid(ArgT&& x)
+    {
+        using Node = Sigmoid<ArgT&&>;
+        return Node(std::forward<ArgT>(x));
+    }
+
+    // Expression node for x raised to a fixed exponent; the gradient follows
+    // the power rule.
+    // An operand handed over as an rvalue is stored by value; otherwise a
+    // const reference is kept (see StoreValueOrRef), so such an operand has
+    // to outlive this node. The exponent is always stored by value.
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    struct Pow : Evaluable<T, Pow<ArgT, T>>
+    {
+        using ValueType = T;
+
+        static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
+
+        constexpr explicit Pow(ArgT&& x, Id<T> exponent) :
+            m_x(std::forward<ArgT>(x)),
+            m_exponent(std::move(exponent))
+        {
+        }
+
+        // f ^ e
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
+        {
+            const auto base = m_x.value(args);
+            return std::pow(base, m_exponent);
+        }
+
+        // e * f ^ (e - 1) * f'
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        {
+            const auto base = m_x.value(args);
+            const auto outer_derivative = m_exponent * std::pow(base, m_exponent - T(1.0));
+            return outer_derivative * m_x.grad(args);
+        }
+
+    private:
+        StoreValueOrRef<ArgT> m_x;
+        T m_exponent;
+    };
+
+    // Wraps the operand in a Pow node with the given fixed exponent.
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    [[nodiscard]] constexpr auto pow(ArgT&& x, Id<T> exp)
+    {
+        using Node = Pow<ArgT&&>;
+        return Node(std::forward<ArgT>(x), std::move(exp));
+    }
+
+    // Expression node for the natural logarithm std::log(x); the gradient
+    // is x' / x.
+    // An operand handed over as an rvalue is stored by value; otherwise a
+    // const reference is kept (see StoreValueOrRef), so such an operand has
+    // to outlive this node.
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    struct Log : Evaluable<T, Log<ArgT, T>>
+    {
+        using ValueType = T;
+
+        static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
+
+        constexpr explicit Log(ArgT&& x) :
+            m_x(std::forward<ArgT>(x))
+        {
+        }
+
+        // log(f)
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
+        {
+            const T operand = m_x.value(args);
+            return std::log(operand);
+        }
+
+        // f' * (1 / f)
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        {
+            const T operand = m_x.value(args);
+            const T reciprocal = 1.0 / operand;
+            return m_x.grad(args) * reciprocal;
+        }
+
+    private:
+        StoreValueOrRef<ArgT> m_x;
+    };
+
+    // Wraps the operand in a Log node.
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    [[nodiscard]] constexpr auto log(ArgT&& x)
+    {
+        using Node = Log<ArgT&&>;
+        return Node(std::forward<ArgT>(x));
+    }
+
+}
+
+#endif
@@ -0,0 +1,815 @@
+#include "convert.h"
+
+#include "uci.h"
+#include "misc.h"
+#include "thread.h"
+#include "position.h"
+#include "tt.h"
+
+#include "extra/nnue_data_binpack_format.h"
+
+#include "nnue/evaluate_nnue.h"
+
+#include "syzygy/tbprobe.h"
+
+#include <sstream>
+#include <fstream>
+#include <unordered_set>
+#include <iomanip>
+#include <list>
+#include <cmath>    // std::exp(),std::pow(),std::log()
+#include <cstring>  // memcpy()
+#include <memory>
+#include <limits>
+#include <optional>
+#include <chrono>
+#include <random>
+#include <regex>
+#include <filesystem>
+
+using namespace std;
+
+namespace Learner
+{
+    // Returns true when the first whitespace-delimited field (the piece
+    // placement) of input_fen is identical to that of pos.fen().
+    // All other FEN fields are ignored.
+    bool fen_is_ok(Position& pos, std::string input_fen) {
+        // example : "2r4r/4kpp1/nb1np3/p2p3p/B2P1BP1/PP6/4NPKP/2R1R3 w - h6 0 24"
+        //       --> "2r4r/4kpp1/nb1np3/p2p3p/B2P1BP1/PP6/4NPKP/2R1R3"
+        const auto first_field = [](const std::string& fen) {
+            std::istringstream stream(fen);
+            std::string field;
+            stream >> field;
+            return field;
+        };
+
+        const std::string placement_of_pos = first_field(pos.fen());
+        const std::string placement_of_input = first_field(input_fen);
+
+        return placement_of_input == placement_of_pos;
+    }
+
+    // Converts plain-text training records into packed binary PackedSfenValue
+    // records and appends them to output_file_name.
+    //
+    // Every input file is read line by line. The first token of a line selects
+    // what is updated ("fen", "move", "score", "ply", "result"); a line whose
+    // first token is "e" ends the current record, which is written out unless
+    // one of the checks below rejected it.
+    //
+    // filenames           : input text files, processed in order.
+    // output_file_name    : binary output file, opened in append mode.
+    // ply_minimum/maximum : records whose "ply" lies outside this range are dropped.
+    // interpolate_eval    : when non-zero, a "ply" line replaces the score by
+    //                       min(3000, interpolate_eval * ply) and a "result"
+    //                       line multiplies the score by the game result.
+    // src_score_*         : range the input score is normalized from.
+    // dest_score_*        : range the normalized score is scaled to.
+    // check_invalid_fen   : drop records whose FEN fails fen_is_ok().
+    // check_illegal_move  : drop records whose move parses to MOVE_NONE.
+    void convert_bin(
+        const vector<string>& filenames,
+        const string& output_file_name,
+        const int ply_minimum,
+        const int ply_maximum,
+        const int interpolate_eval,
+        const int src_score_min_value,
+        const int src_score_max_value,
+        const int dest_score_min_value,
+        const int dest_score_max_value,
+        const bool check_invalid_fen,
+        const bool check_illegal_move)
+    {
+        std::cout << "check_invalid_fen=" << check_invalid_fen << std::endl;
+        std::cout << "check_illegal_move=" << check_illegal_move << std::endl;
+
+        auto th = Threads.main();
+        auto& tpos = th->rootPos;
+
+        // Convert plain-text records to the packed binary format.
+        std::fstream fs;
+        fs.open(output_file_name, ios::app | ios::binary);
+        if (!fs.is_open()) {
+            // Without this check every write below would fail silently.
+            std::cout << "Error: cannot open " << output_file_name << " for writing" << std::endl;
+            return;
+        }
+
+        StateListPtr states;
+        for (const auto& filename : filenames) {
+            std::cout << "convert " << filename << " ... ";
+            std::string line;
+            ifstream ifs;
+            ifs.open(filename);
+
+            // The record is written to disk as raw bytes, so clear it first:
+            // otherwise the padding and any field the input never sets would
+            // be written with indeterminate contents.
+            PackedSfenValue p;
+            std::memset(static_cast<void*>(&p), 0, sizeof(PackedSfenValue));
+            p.gamePly = 1; // Not included in apery format. Should be initialized
+
+            // Per-file statistics.
+            uint64_t data_size = 0;
+            uint64_t filtered_size = 0;
+            uint64_t filtered_size_fen = 0;
+            uint64_t filtered_size_move = 0;
+            uint64_t filtered_size_ply = 0;
+
+            // Reasons for dropping the record currently being assembled.
+            bool ignore_flag_fen = false;
+            bool ignore_flag_move = false;
+            bool ignore_flag_ply = false;
+
+            while (std::getline(ifs, line)) {
+                std::stringstream ss(line);
+                std::string token;
+                ss >> token;
+                if (token == "fen") {
+                    states = StateListPtr(new std::deque<StateInfo>(1)); // Drop old and create a new one
+                    std::string input_fen = line.substr(4);
+                    tpos.set(input_fen, false, &states->back(), Threads.main());
+                    if (check_invalid_fen && !fen_is_ok(tpos, input_fen)) {
+                        ignore_flag_fen = true;
+                        filtered_size_fen++;
+                    }
+                    else {
+                        tpos.sfen_pack(p.sfen);
+                    }
+                }
+                else if (token == "move") {
+                    std::string value;
+                    ss >> value;
+                    Move move = UCI::to_move(tpos, value);
+                    if (check_illegal_move && move == MOVE_NONE) {
+                        ignore_flag_move = true;
+                        filtered_size_move++;
+                    }
+                    else {
+                        p.move = move;
+                    }
+                }
+                else if (token == "score") {
+                    double score;
+                    ss >> score;
+                    // Training Formula - Issue #71 - nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
+                    // Normalize to [0.0, 1.0].
+                    score = (score - src_score_min_value) / (src_score_max_value - src_score_min_value);
+                    // Scale to [dest_score_min_value, dest_score_max_value].
+                    score = score * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
+                    p.score = Math::clamp((int32_t)std::round(score), -(int32_t)VALUE_MATE, (int32_t)VALUE_MATE);
+                }
+                else if (token == "ply") {
+                    int temp;
+                    ss >> temp;
+                    if (temp < ply_minimum || temp > ply_maximum) {
+                        ignore_flag_ply = true;
+                        filtered_size_ply++;
+                    }
+                    p.gamePly = uint16_t(temp); // Narrowed to the 16-bit field of the record
+                    if (interpolate_eval != 0) {
+                        p.score = min(3000, interpolate_eval * temp);
+                    }
+                }
+                else if (token == "result") {
+                    int temp;
+                    ss >> temp;
+                    p.game_result = int8_t(temp); // Narrowed to the 8-bit field of the record
+                    if (interpolate_eval) {
+                        p.score = p.score * p.game_result;
+                    }
+                }
+                else if (token == "e") {
+                    if (!(ignore_flag_fen || ignore_flag_move || ignore_flag_ply)) {
+                        fs.write(reinterpret_cast<const char*>(&p), sizeof(PackedSfenValue));
+                        data_size += 1;
+                        // debug
+                        // std::cout<<tpos<<std::endl;
+                        // std::cout<<p.score<<","<<int(p.gamePly)<<","<<int(p.game_result)<<std::endl;
+                    }
+                    else {
+                        filtered_size++;
+                    }
+                    // The next record starts with a clean slate of flags.
+                    ignore_flag_fen = false;
+                    ignore_flag_move = false;
+                    ignore_flag_ply = false;
+                }
+            }
+            std::cout << "done " << data_size << " parsed " << filtered_size << " is filtered"
+                << " (invalid fen:" << filtered_size_fen << ", illegal move:" << filtered_size_move << ", invalid ply:" << filtered_size_ply << ")" << std::endl;
+            ifs.close();
+        }
+        std::cout << "all done" << std::endl;
+        fs.close();
+    }
+
+    // Removes leading whitespace from s in place.
+    static inline void ltrim(std::string& s) {
+        // std::isspace is only defined for EOF and values representable as
+        // unsigned char, so each character is taken as unsigned char; a plain
+        // char holding a non-ASCII byte may otherwise be passed as a negative int.
+        s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) {
+            return !std::isspace(ch);
+            }));
+    }
+
+    // Removes trailing whitespace from s in place.
+    static inline void rtrim(std::string& s) {
+        // Same unsigned char conversion as in ltrim, for the same reason.
+        s.erase(std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) {
+            return !std::isspace(ch);
+            }).base(), s.end());
+    }
+
+    // Removes whitespace from both ends of s in place.
+    static inline void trim(std::string& s) {
+        ltrim(s);
+        rtrim(s);
+    }
+
+    // Maps the quoted value of a PGN Result tag to a number:
+    //   "1-0" (white win) ->  1
+    //   "0-1" (black win) -> -1
+    //   anything else     ->  0 (treated as a draw)
+    // The surrounding double quotes are part of the expected input.
+    int parse_game_result_from_pgn_extract(std::string result) {
+        const bool white_won = (result == "\"1-0\"");
+        if (white_won) {
+            return 1;
+        }
+
+        const bool black_won = (result == "\"0-1\"");
+        return black_won ? -1 : 0;
+    }
+
+    // Parses an evaluation string into a Value.
+    //
+    // 0.25 -->  0.25 * PawnValueEg
+    // #-4  --> -mate_in(4)
+    // #3   -->  mate_in(3)
+    // -M4  --> -mate_in(4)
+    // +M3  -->  mate_in(3)
+    //
+    // success is set to true when eval could be parsed and to false otherwise;
+    // on failure VALUE_ZERO is returned. Malformed mate strings (for example
+    // "#" or "-M" with no number) and an empty string are reported as
+    // failures instead of throwing or being read as a score of 0.
+    Value parse_score_from_pgn_extract(std::string eval, bool& success) {
+        success = true;
+
+        // Reads the leading integer of text into distance. Like stoi it skips
+        // leading whitespace, accepts a sign and ignores trailing characters,
+        // but it returns false instead of throwing when no number is found
+        // or the number does not fit into an int.
+        const auto read_mate_distance = [](const std::string& text, int& distance) {
+            std::istringstream stream(text);
+            return static_cast<bool>(stream >> distance);
+        };
+
+        int distance = 0;
+
+        if (eval.substr(0, 1) == "#") {
+            const bool negative = (eval.substr(1, 1) == "-");
+            if (!read_mate_distance(eval.substr(negative ? 2 : 1), distance)) {
+                success = false;
+                return VALUE_ZERO;
+            }
+            return negative ? -mate_in(distance) : mate_in(distance);
+        }
+        else if (eval.substr(0, 2) == "-M" || eval.substr(0, 2) == "+M") {
+            const bool negative = (eval[0] == '-');
+            if (!read_mate_distance(eval.substr(2), distance)) {
+                success = false;
+                return VALUE_ZERO;
+            }
+            return negative ? -mate_in(distance) : mate_in(distance);
+        }
+        else {
+            char* endptr = nullptr;
+            const double value = strtod(eval.c_str(), &endptr);
+
+            // Reject input where nothing was converted (endptr still at the
+            // start) as well as input with trailing characters.
+            if (endptr == eval.c_str() || *endptr != '\0') {
+                success = false;
+                return VALUE_ZERO;
+            }
+            else {
+                return Value(value * static_cast<double>(PawnValueEg));
+            }
+        }
+    }
+
+    // for Debug
+    //#define DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT
+
+    // Cheap plausibility test: a string is taken to be a FEN when it contains
+    // exactly 5 spaces and exactly 7 slashes. Nothing else is validated.
+    bool is_like_fen(std::string fen) {
+        const auto occurrences_of = [&fen](char symbol) {
+            return static_cast<int>(std::count(fen.cbegin(), fen.cend(), symbol));
+        };
+
+        const int count_space = occurrences_of(' ');
+        const int count_slash = occurrences_of('/');
+
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+        //std::cout << "count_space=" << count_space << std::endl;
+        //std::cout << "count_slash=" << count_slash << std::endl;
+#endif
+
+        if (count_space != 5) {
+            return false;
+        }
+        return count_slash == 7;
+    }
+
+    // Converts pgn-extract style text files into packed binary training records.
+    //
+    // Every input line is classified as:
+    //   - empty              : skipped
+    //   - tag line ("[...]") : a [Result ...] tag sets the result of the game
+    //   - anything else      : move text, scanned repeatedly for the sequence
+    //                          { fen } move { eval }
+    //
+    // filenames                          : input files to read
+    // output_file_name                   : binary file receiving the PackedSfenValue records
+    // pgn_eval_side_to_move              : when false, the score of a position with black
+    //                                      to move is negated before it is written
+    // convert_no_eval_fens_as_score_zero : when true, positions without a parsable eval
+    //                                      are written with score 0 instead of being dropped
+    void convert_bin_from_pgn_extract(
+        const vector<string>& filenames,
+        const string& output_file_name,
+        const bool pgn_eval_side_to_move,
+        const bool convert_no_eval_fens_as_score_zero)
+    {
+        std::cout << "pgn_eval_side_to_move=" << pgn_eval_side_to_move << std::endl;
+        std::cout << "convert_no_eval_fens_as_score_zero=" << convert_no_eval_fens_as_score_zero << std::endl;
+
+        auto th = Threads.main();
+        auto& pos = th->rootPos;
+
+        // pos.set() is handed a pointer to this object and pos is still used after
+        // the call (UCI::to_move, side_to_move), so it must outlive every such use.
+        // NOTE(review): presumably Position keeps the pointer, as in upstream
+        // Stockfish - confirm against Position::set.
+        StateInfo si;
+
+        std::fstream ofs;
+        ofs.open(output_file_name, ios::out | ios::binary);
+        if (!ofs) {
+            std::cerr << "Failed to open output file: " << output_file_name << std::endl;
+            return;
+        }
+
+        // Compile the patterns once instead of once per scanned comment.
+        const std::regex pattern_result(R"(\[Result (.+?)\])");
+        const std::regex pattern_bracket(R"(\{(.+?)\})");
+        const std::regex pattern_move(R"(\}(.+?)\{)");
+        const std::regex pattern_eval1(R"(\[\%eval (.+?)\])");
+        const std::regex pattern_eval2(R"((.+?)\/)");
+
+        int game_count = 0;
+        int fen_count = 0;
+
+        for (const auto& filename : filenames) {
+            std::cout << now_string() << " convert " << filename << std::endl;
+            ifstream ifs;
+            ifs.open(filename);
+            if (!ifs) {
+                std::cerr << "Failed to open input file: " << filename << std::endl;
+                continue;
+            }
+
+            int game_result = 0;
+
+            std::string line;
+            while (std::getline(ifs, line)) {
+
+                if (line.empty()) {
+                    continue;
+                }
+
+                else if (line.front() == '[') {
+                    std::smatch match;
+
+                    // example: [Result "1-0"]
+                    if (std::regex_search(line, match, pattern_result)) {
+                        game_result = parse_game_result_from_pgn_extract(match.str(1));
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+                        std::cout << "game_result=" << game_result << std::endl;
+#endif
+                        game_count++;
+                        if (game_count % 10000 == 0) {
+                            std::cout << now_string() << " game_count=" << game_count << ", fen_count=" << fen_count << std::endl;
+                        }
+                    }
+
+                    continue;
+                }
+
+                else {
+                    int gamePly = 1;
+                    auto itr = line.cbegin();
+
+                    while (true) {
+                        gamePly++;
+
+                        PackedSfenValue psv;
+                        memset((char*)&psv, 0, sizeof(PackedSfenValue));
+
+                        // fen
+                        {
+                            bool fen_found = false;
+
+                            // Skip brace comments until one looks like a FEN.
+                            while (!fen_found) {
+                                std::smatch match;
+                                if (!std::regex_search(itr, line.cend(), match, pattern_bracket)) {
+                                    break;
+                                }
+
+                                // Stop on the closing '}' so the move pattern can anchor on it.
+                                itr += match.position(0) + match.length(0) - 1;
+                                std::string str_fen = match.str(1);
+                                trim(str_fen);
+
+                                if (is_like_fen(str_fen)) {
+                                    fen_found = true;
+
+                                    pos.set(str_fen, false, &si, th);
+                                    pos.sfen_pack(psv.sfen);
+                                }
+
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+                                std::cout << "str_fen=" << str_fen << std::endl;
+                                std::cout << "fen_found=" << fen_found << std::endl;
+#endif
+                            }
+
+                            if (!fen_found) {
+                                break;
+                            }
+                        }
+
+                        // move
+                        {
+                            std::smatch match;
+                            if (!std::regex_search(itr, line.cend(), match, pattern_move)) {
+                                break;
+                            }
+
+                            // Stop on the opening '{' of the following comment.
+                            itr += match.position(0) + match.length(0) - 1;
+                            std::string str_move = match.str(1);
+                            trim(str_move);
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+                            std::cout << "str_move=" << str_move << std::endl;
+#endif
+                            psv.move = UCI::to_move(pos, str_move);
+                        }
+
+                        // eval
+                        bool eval_found = false;
+                        {
+                            std::smatch match;
+                            if (!std::regex_search(itr, line.cend(), match, pattern_bracket)) {
+                                break;
+                            }
+
+                            std::string str_eval_clk = match.str(1);
+                            trim(str_eval_clk);
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+                            std::cout << "str_eval_clk=" << str_eval_clk << std::endl;
+#endif
+
+                            // example: { [%eval 0.25] [%clk 0:10:00] }
+                            // example: { [%eval #-4] [%clk 0:10:00] }
+                            // example: { [%eval #3] [%clk 0:10:00] }
+                            // example: { +0.71/22 1.2s }
+                            // example: { -M4/7 0.003s }
+                            // example: { M3/245 0.017s }
+                            // example: { +M1/245 0.010s, White mates }
+                            // example: { 0.60 }
+                            // example: { book }
+                            // example: { rnbqkb1r/pp3ppp/2p1pn2/3p4/2PP4/2N2N2/PP2PPPP/R1BQKB1R w KQkq - 0 5 }
+
+                            // The comment may already be the next FEN (no eval present).
+                            // In that case it is left unconsumed for the next iteration.
+                            if (!is_like_fen(str_eval_clk)) {
+                                // Must happen before `match` is reused below.
+                                itr += match.position(0) + match.length(0) - 1;
+
+                                if (str_eval_clk != "book") {
+                                    std::string str_eval;
+                                    if (std::regex_search(str_eval_clk, match, pattern_eval1) ||
+                                        std::regex_search(str_eval_clk, match, pattern_eval2)) {
+                                        str_eval = match.str(1);
+                                        trim(str_eval);
+                                    }
+                                    else {
+                                        str_eval = str_eval_clk;
+                                    }
+
+                                    bool success = false;
+                                    Value value = parse_score_from_pgn_extract(str_eval, success);
+                                    if (success) {
+                                        eval_found = true;
+                                        psv.score = Math::clamp(value, -VALUE_MATE, VALUE_MATE);
+                                    }
+
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+                                    std::cout << "str_eval=" << str_eval << std::endl;
+                                    std::cout << "success=" << success << ", psv.score=" << psv.score << std::endl;
+#endif
+                                }
+                            }
+                        }
+
+                        // write
+                        if (eval_found || convert_no_eval_fens_as_score_zero) {
+                            if (!eval_found && convert_no_eval_fens_as_score_zero) {
+                                psv.score = 0;
+                            }
+
+                            psv.gamePly = gamePly;
+                            psv.game_result = game_result;
+
+                            // Flip sign for black to move; the score is only flipped
+                            // when pgn_eval_side_to_move is false.
+                            if (pos.side_to_move() == BLACK) {
+                                if (!pgn_eval_side_to_move) {
+                                    psv.score *= -1;
+                                }
+                                psv.game_result *= -1;
+                            }
+
+                            ofs.write((char*)&psv, sizeof(PackedSfenValue));
+
+                            fen_count++;
+                        }
+                    }
+
+                    // The result only applies to the move text that followed its tag.
+                    game_result = 0;
+                }
+            }
+        }
+
+        std::cout << now_string() << " game_count=" << game_count << ", fen_count=" << fen_count << std::endl;
+        std::cout << now_string() << " all done" << std::endl;
+        ofs.close();
+    }
+
+    // Dumps every PackedSfenValue record found in the given binary files as text.
+    //
+    // Each record becomes six lines: "fen", "move", "score", "ply", "result"
+    // and a terminating "e". The output file is opened in append mode, so
+    // existing content is kept.
+    //
+    // filenames        : binary input files, read in order
+    // output_file_name : text file the records are appended to
+    void convert_plain(
+        const vector<string>& filenames,
+        const string& output_file_name)
+    {
+        Position tpos;
+        std::ofstream ofs;
+        ofs.open(output_file_name, ios::app);
+        if (!ofs) {
+            std::cerr << "Failed to open output file: " << output_file_name << std::endl;
+            return;
+        }
+
+        auto th = Threads.main();
+        for (const auto& filename : filenames) {
+            std::cout << "convert " << filename << " ... ";
+
+            // Just convert packedsfenvalue to text
+            std::fstream fs;
+            fs.open(filename, ios::in | ios::binary);
+            if (!fs) {
+                // Report the failure instead of claiming the file was converted.
+                std::cout << "failed to open" << std::endl;
+                continue;
+            }
+
+            PackedSfenValue p;
+            // A short or failed read ends the file.
+            while (fs.read((char*)&p, sizeof(PackedSfenValue))) {
+                StateInfo si;
+                tpos.set_from_packed_sfen(p.sfen, &si, th);
+
+                // write as plain text; '\n' avoids flushing on every line
+                ofs << "fen " << tpos.fen() << '\n';
+                ofs << "move " << UCI::move(Move(p.move), false) << '\n';
+                ofs << "score " << p.score << '\n';
+                ofs << "ply " << int(p.gamePly) << '\n';
+                ofs << "result " << int(p.game_result) << '\n';
+                ofs << "e" << '\n';
+            }
+            fs.close();
+            std::cout << "done" << std::endl;
+        }
+        ofs.close();
+        std::cout << "all done" << std::endl;
+    }
+
+    // File name suffixes used to pick a conversion routine.
+    static inline const std::string plain_extension{".plain"};
+    static inline const std::string bin_extension{".bin"};
+    static inline const std::string binpack_extension{".binpack"};
+
+    // Returns true when `name` can be opened for reading.
+    static bool file_exists(const std::string& name)
+    {
+        return std::ifstream(name).good();
+    }
+
+    // Returns true when `lhs` finishes with the characters of `end`.
+    // An empty `end` matches every string.
+    static bool ends_with(const std::string& lhs, const std::string& end)
+    {
+        const auto suffix_length = end.size();
+
+        return suffix_length <= lhs.size()
+            && lhs.compare(lhs.size() - suffix_length, suffix_length, end) == 0;
+    }
+
+    // Returns true when the input path ends with the expected input extension
+    // and the output path ends with the expected output extension.
+    static bool is_convert_of_type(
+        const std::string& input_path,
+        const std::string& output_path,
+        const std::string& expected_input_extension,
+        const std::string& expected_output_extension)
+    {
+        if (!ends_with(input_path, expected_input_extension))
+            return false;
+
+        return ends_with(output_path, expected_output_extension);
+    }
+
+    // Signature shared by all file-to-file conversion routines.
+    using ConvertFunctionType = void(std::string inputPath, std::string outputPath, std::ios_base::openmode om, bool validate);
+
+    // Selects the conversion routine matching the extensions of both paths.
+    // Returns nullptr when the extension pair is not supported.
+    static ConvertFunctionType* get_convert_function(const std::string& input_path, const std::string& output_path)
+    {
+        struct Conversion
+        {
+            const std::string& from_extension;
+            const std::string& to_extension;
+            ConvertFunctionType* function;
+        };
+
+        const Conversion conversions[] = {
+            { plain_extension,   bin_extension,     binpack::convertPlainToBin     },
+            { plain_extension,   binpack_extension, binpack::convertPlainToBinpack },
+            { bin_extension,     plain_extension,   binpack::convertBinToPlain     },
+            { bin_extension,     binpack_extension, binpack::convertBinToBinpack   },
+            { binpack_extension, plain_extension,   binpack::convertBinpackToPlain },
+            { binpack_extension, bin_extension,     binpack::convertBinpackToBin   },
+        };
+
+        for (const auto& conversion : conversions)
+        {
+            if (is_convert_of_type(input_path, output_path, conversion.from_extension, conversion.to_extension))
+                return conversion.function;
+        }
+
+        return nullptr;
+    }
+
+    // Converts input_path into output_path using the routine selected from the
+    // two file extensions. Prints a message to stderr and does nothing when the
+    // input cannot be opened or the extension pair is unsupported.
+    static void convert(const std::string& input_path, const std::string& output_path, std::ios_base::openmode om, bool validate)
+    {
+        if (!file_exists(input_path))
+        {
+            std::cerr << "Input file does not exist.\n";
+            return;
+        }
+
+        ConvertFunctionType* const converter = get_convert_function(input_path, output_path);
+        if (converter == nullptr)
+        {
+            std::cerr << "Conversion between files of these types is not supported.\n";
+            return;
+        }
+
+        converter(input_path, output_path, om, validate);
+    }
+
+    // Interprets `args` as: from_path to_path [append] [validate].
+    // "append" selects std::ios_base::app (otherwise trunc) and "validate" is
+    // forwarded as a flag. Both are looked up among the arguments after the paths.
+    static void convert(const std::vector<std::string>& args)
+    {
+        const auto arg_count = args.size();
+        if (arg_count < 2 || arg_count > 4)
+        {
+            std::cerr << "Invalid arguments.\n";
+            std::cerr << "Usage: convert from_path to_path [append] [validate]\n";
+            return;
+        }
+
+        const auto flags_begin = args.begin() + 2;
+        const auto flags_end = args.end();
+        const auto has_flag = [&](const char* flag) {
+            return std::find(flags_begin, flags_end, flag) != flags_end;
+        };
+
+        const bool append = has_flag("append");
+        const bool validate = has_flag("validate");
+
+        std::ios_base::openmode openmode = std::ios_base::trunc;
+        if (append)
+            openmode = std::ios_base::app;
+
+        convert(args[0], args[1], openmode, validate);
+    }
+
+    // Splits the remaining stream content into whitespace-separated tokens and
+    // hands them to the argument-vector overload.
+    void convert(istringstream& is)
+    {
+        std::vector<std::string> args;
+
+        // Extraction fails once no further token is available.
+        for (std::string token; is >> token; )
+            args.push_back(token);
+
+        convert(args);
+    }
+
+    // Appends every regular file found directly inside base_dir/target_dir to
+    // `filenames`. Entries are stored relative to base_dir, i.e. as
+    // target_dir/<file name>.
+    static void append_files_from_dir(
+        std::vector<std::string>& filenames,
+        const std::string& base_dir,
+        const std::string& target_dir)
+    {
+        namespace sys = std::filesystem;
+
+        const string kif_base_dir = Path::combine(base_dir, target_dir);
+        const sys::path root(kif_base_dir); // Origin of enumeration
+
+        for (const sys::directory_entry& entry : sys::directory_iterator(root))
+        {
+            const sys::path& path = entry.path();
+            if (sys::is_regular_file(path))
+                filenames.push_back(Path::combine(target_dir, path.filename().generic_string()));
+        }
+    }
+
+    // Replaces every entry of `filenames` with Path::combine(base_dir, entry).
+    static void rebase_files(
+        std::vector<std::string>& filenames,
+        const std::string& base_dir)
+    {
+        for (std::size_t i = 0; i < filenames.size(); ++i)
+            filenames[i] = Path::combine(base_dir, filenames[i]);
+    }
+
+    // Command front end for the pgn-extract converter.
+    //
+    // Reads "option value" pairs from the stream:
+    //   targetdir, targetfile (repeatable), basedir, pgn_eval_side_to_move,
+    //   convert_no_eval_fens_as_score_zero, output_file_name
+    // Unknown options are reported and skipped. The collected file list is then
+    // passed to the converter overload taking a list of file names.
+    void convert_bin_from_pgn_extract(std::istringstream& is)
+    {
+        std::vector<std::string> filenames;
+
+        string base_dir;
+        string target_dir;
+        string output_file_name = "shuffled_sfen.bin";
+
+        bool pgn_eval_side_to_move = false;
+        bool convert_no_eval_fens_as_score_zero = false;
+
+        // Extraction fails once the stream has no more tokens, ending the loop.
+        for (string option; is >> option; )
+        {
+            if (option == "targetfile")
+            {
+                std::string filename;
+                is >> filename;
+                filenames.push_back(filename);
+            }
+            else if (option == "targetdir")
+                is >> target_dir;
+            else if (option == "basedir")
+                is >> base_dir;
+            else if (option == "output_file_name")
+                is >> output_file_name;
+            else if (option == "pgn_eval_side_to_move")
+                is >> pgn_eval_side_to_move;
+            else if (option == "convert_no_eval_fens_as_score_zero")
+                is >> convert_no_eval_fens_as_score_zero;
+            else
+                cout << "Unknown option: " << option << ". Ignoring.\n";
+        }
+
+        // Files found in target_dir are added first, then every entry is
+        // prefixed with base_dir.
+        if (!target_dir.empty())
+            append_files_from_dir(filenames, base_dir, target_dir);
+
+        rebase_files(filenames, base_dir);
+
+        Eval::NNUE::init();
+
+        cout << "convert_bin_from_pgn-extract.." << endl;
+        convert_bin_from_pgn_extract(
+            filenames,
+            output_file_name,
+            pgn_eval_side_to_move,
+            convert_no_eval_fens_as_score_zero);
+    }
+
+    // Command front end for the text-to-binary converter.
+    //
+    // Reads "option value" pairs from the stream: targetdir, targetfile
+    // (repeatable), basedir, ply_minimum, ply_maximum, interpolate_eval,
+    // check_invalid_fen, check_illegal_move, src/dest_score_min/max_value and
+    // output_file_name. Unknown options are reported and skipped. The collected
+    // settings are then passed to the convert_bin overload taking a file list.
+    void convert_bin(std::istringstream& is)
+    {
+        std::vector<std::string> filenames;
+
+        string base_dir;
+        string target_dir;
+        string output_file_name = "shuffled_sfen.bin";
+
+        int ply_minimum = 0;
+        int ply_maximum = 114514;
+        bool interpolate_eval = false;
+        bool check_invalid_fen = false;
+        bool check_illegal_move = false;
+
+        // These two are accepted (and their value consumed) but are not
+        // forwarded to the converter below.
+        bool pgn_eval_side_to_move = false;
+        bool convert_no_eval_fens_as_score_zero = false;
+
+        double src_score_min_value = 0.0;
+        double src_score_max_value = 1.0;
+        double dest_score_min_value = 0.0;
+        double dest_score_max_value = 1.0;
+
+        // Extraction fails once the stream has no more tokens, ending the loop.
+        for (string option; is >> option; )
+        {
+            if (option == "targetfile")
+            {
+                std::string filename;
+                is >> filename;
+                filenames.push_back(filename);
+            }
+            else if (option == "targetdir")
+                is >> target_dir;
+            else if (option == "basedir")
+                is >> base_dir;
+            else if (option == "output_file_name")
+                is >> output_file_name;
+            else if (option == "ply_minimum")
+                is >> ply_minimum;
+            else if (option == "ply_maximum")
+                is >> ply_maximum;
+            else if (option == "interpolate_eval")
+                is >> interpolate_eval;
+            else if (option == "check_invalid_fen")
+                is >> check_invalid_fen;
+            else if (option == "check_illegal_move")
+                is >> check_illegal_move;
+            else if (option == "pgn_eval_side_to_move")
+                is >> pgn_eval_side_to_move;
+            else if (option == "convert_no_eval_fens_as_score_zero")
+                is >> convert_no_eval_fens_as_score_zero;
+            else if (option == "src_score_min_value")
+                is >> src_score_min_value;
+            else if (option == "src_score_max_value")
+                is >> src_score_max_value;
+            else if (option == "dest_score_min_value")
+                is >> dest_score_min_value;
+            else if (option == "dest_score_max_value")
+                is >> dest_score_max_value;
+            else
+                cout << "Unknown option: " << option << ". Ignoring.\n";
+        }
+
+        // Files found in target_dir are added first, then every entry is
+        // prefixed with base_dir.
+        if (!target_dir.empty())
+            append_files_from_dir(filenames, base_dir, target_dir);
+
+        rebase_files(filenames, base_dir);
+
+        Eval::NNUE::init();
+
+        cout << "convert_bin.." << endl;
+        convert_bin(
+            filenames,
+            output_file_name,
+            ply_minimum,
+            ply_maximum,
+            interpolate_eval,
+            src_score_min_value,
+            src_score_max_value,
+            dest_score_min_value,
+            dest_score_max_value,
+            check_invalid_fen,
+            check_illegal_move);
+    }
+
+    // Command front end for the binary-to-text converter.
+    //
+    // Reads "option value" pairs from the stream: targetdir, targetfile
+    // (repeatable), basedir and output_file_name. Unknown options are reported
+    // and skipped. The collected file list is then passed to the convert_plain
+    // overload taking a list of file names.
+    void convert_plain(std::istringstream& is)
+    {
+        std::vector<std::string> filenames;
+
+        string base_dir;
+        string target_dir;
+        string output_file_name = "shuffled_sfen.bin";
+
+        // Extraction fails once the stream has no more tokens, ending the loop.
+        for (string option; is >> option; )
+        {
+            if (option == "targetfile")
+            {
+                std::string filename;
+                is >> filename;
+                filenames.push_back(filename);
+            }
+            else if (option == "targetdir")
+                is >> target_dir;
+            else if (option == "basedir")
+                is >> base_dir;
+            else if (option == "output_file_name")
+                is >> output_file_name;
+            else
+                cout << "Unknown option: " << option << ". Ignoring.\n";
+        }
+
+        // Files found in target_dir are added first, then every entry is
+        // prefixed with base_dir.
+        if (!target_dir.empty())
+            append_files_from_dir(filenames, base_dir, target_dir);
+
+        rebase_files(filenames, base_dir);
+
+        Eval::NNUE::init();
+
+        cout << "convert_plain.." << endl;
+        convert_plain(filenames, output_file_name);
+    }
+}
@@ -0,0 +1,18 @@
+#ifndef _CONVERT_H_
+#define _CONVERT_H_
+
+#include <vector>
+#include <string>
+#include <sstream>
+
+// Entry points of the training data conversion commands. Each one parses its
+// own arguments from the remaining content of the command stream.
+namespace Learner {
+    // Arguments: from_path to_path [append] [validate].
+    // The conversion is chosen from the two file extensions
+    // (.plain, .bin, .binpack).
+    void convert(std::istringstream& is);
+
+    // Options: targetdir, targetfile, basedir, pgn_eval_side_to_move,
+    // convert_no_eval_fens_as_score_zero, output_file_name.
+    void convert_bin_from_pgn_extract(std::istringstream& is);
+
+    // Options: targetdir, targetfile, basedir, ply_minimum, ply_maximum,
+    // interpolate_eval, check_invalid_fen, check_illegal_move,
+    // src_score_min_value, src_score_max_value, dest_score_min_value,
+    // dest_score_max_value, output_file_name.
+    void convert_bin(std::istringstream& is);
+
+    // Options: targetdir, targetfile, basedir, output_file_name.
+    void convert_plain(std::istringstream& is);
+}
+
+#endif
@@ -0,0 +1,962 @@
+#include "gensfen.h"
+
+#include "sfen_writer.h"
+#include "packed_sfen.h"
+#include "opening_book.h"
+
+#include "misc.h"
+#include "position.h"
+#include "thread.h"
+#include "tt.h"
+#include "uci.h"
+
+#include "extra/nnue_data_binpack_format.h"
+
+#include "nnue/evaluate_nnue.h"
+#include "nnue/evaluate_nnue_learner.h"
+
+#include "syzygy/tbprobe.h"
+
+#include <atomic>
+#include <chrono>
+#include <climits>
+#include <cmath>
+#include <cstring>
+#include <filesystem>
+#include <fstream>
+#include <iomanip>
+#include <limits>
+#include <list>
+#include <memory>
+#include <optional>
+#include <random>
+#include <shared_mutex>
+#include <sstream>
+#include <unordered_set>
+
+using namespace std;
+
+namespace Learner
+{
+    // Class to generate sfen with multiple threads
+    // Generates training positions (sfens) using all worker threads and
+    // writes them through an SfenWriter.
+    struct Gensfen
+    {
+        // User-configurable settings. Call enforce_constraints() after filling
+        // them in and before constructing a Gensfen from them.
+        struct Params
+        {
+            // Min and max depths for search during gensfen
+            int search_depth_min = 3;
+            int search_depth_max = -1;
+
+            // Number of the nodes to be searched.
+            // 0 represents no limits.
+            uint64_t nodes = 0;
+
+            // Upper limit of evaluation value of generated situation
+            int eval_limit = 3000;
+
+            // minimum ply with random move
+            // maximum ply with random move
+            // Number of random moves in one game
+            int random_move_minply = 1;
+            int random_move_maxply = 24;
+            int random_move_count = 5;
+
+            // Move kings with a probability of 1/N when randomly moving like Apery software.
+            // When you move the king again, there is a 1/N chance that it will be randomly moved
+            // once in the opponent's turn.
+            // Apery has N=2. Specifying 0 here disables this function.
+            int random_move_like_apery = 0;
+
+            // For when using multi pv instead of random move.
+            // random_multi_pv is the number of candidates for MultiPV.
+            // When adopting a candidate move, the difference between the
+            // evaluation value of the 1st place move and the evaluation value
+            // of the Nth place move must be within random_multi_pv_diff.
+            // random_multi_pv_depth is the search depth for MultiPV.
+            int random_multi_pv = 0;
+            int random_multi_pv_diff = 32000;
+            int random_multi_pv_depth = -1;
+
+            // The minimum and maximum ply (number of steps from
+            // the initial phase) of the sfens to write out.
+            int write_minply = 16;
+            int write_maxply = 400;
+
+            uint64_t save_every = std::numeric_limits<uint64_t>::max();
+
+            std::string output_file_name = "generated_kifu";
+
+            SfenOutputType sfen_format = SfenOutputType::Binpack;
+
+            std::string seed;
+
+            bool write_out_draw_game_in_training_data_generation = true;
+            bool detect_draw_by_consecutive_low_score = true;
+            bool detect_draw_by_insufficient_mating_material = true;
+
+            bool ensure_quiet = false;
+
+            // Placeholder so the value is never indeterminate;
+            // enforce_constraints() replaces it with Options["Threads"].
+            uint64_t num_threads = 1;
+
+            std::string book;
+
+            // Normalizes the settings so they are mutually consistent and
+            // picks up the thread count from the UCI options.
+            void enforce_constraints()
+            {
+                // A max depth / multi pv depth below the min depth falls back to the min depth.
+                search_depth_max = std::max(search_depth_min, search_depth_max);
+                random_multi_pv_depth = std::max(search_depth_min, random_multi_pv_depth);
+
+                // Limit the maximum to a one-stop score. (Otherwise you might not end the loop)
+                eval_limit = std::min(eval_limit, (int)mate_in(2));
+
+                save_every = std::max(save_every, REPORT_STATS_EVERY);
+
+                num_threads = Options["Threads"];
+            }
+        };
+
+        // Hash to limit the export of identical sfens
+        static constexpr uint64_t GENSFEN_HASH_SIZE = 64 * 1024 * 1024;
+        // It must be 2**N because it will be used as the mask to calculate hash_index.
+        static_assert((GENSFEN_HASH_SIZE& (GENSFEN_HASH_SIZE - 1)) == 0);
+
+        static constexpr uint64_t REPORT_DOT_EVERY = 5000;
+        static constexpr uint64_t REPORT_STATS_EVERY = 200000;
+        static_assert(REPORT_STATS_EVERY % REPORT_DOT_EVERY == 0);
+
+        // Copies the parameters, seeds the PRNG, sets up the writer and the
+        // duplicate-detection hash, and opens the opening book if one is given.
+        Gensfen(
+            const Params& prm
+        ) :
+            params(prm),
+            prng(prm.seed),
+            sfen_writer(prm.output_file_name, prm.num_threads, prm.save_every, prm.sfen_format)
+        {
+            hash.resize(GENSFEN_HASH_SIZE);
+
+            if (!prm.book.empty())
+            {
+                opening_book = open_opening_book(prm.book, prng);
+                if (opening_book == nullptr)
+                {
+                    std::cout << "WARNING: Failed to open opening book " << prm.book << ". Falling back to startpos.\n";
+                }
+            }
+
+            // Output the seed so the user can verify it is not identical by chance.
+            std::cout << prng << std::endl;
+        }
+
+        // Runs generation on all worker threads; `limit` is passed to every worker.
+        void generate(uint64_t limit);
+
+    private:
+        Params params;
+
+        PRNG prng;
+
+        std::mutex stats_mutex;
+        // Reset to 0 at the start of generate(); initialized here so it is
+        // never read in an indeterminate state.
+        TimePoint last_stats_report_time = 0;
+
+        // sfen exporter
+        SfenWriter sfen_writer;
+
+        SynchronizedRegionLogger::Region out;
+
+        vector<Key> hash; // 64MB*sizeof(HASH_KEY) = 512MB
+
+        // Stays null when no book was requested or it could not be opened.
+        std::unique_ptr<OpeningBook> opening_book;
+
+        static void set_gensfen_search_limits();
+
+        void generate_worker(
+            Thread& th,
+            std::atomic<uint64_t>& counter,
+            uint64_t limit);
+
+        bool was_seen_before(const Position& pos);
+
+        optional<int8_t> get_current_game_result(
+            Position& pos,
+            const vector<int>& move_hist_scores) const;
+
+        vector<uint8_t> generate_random_move_flags();
+
+        optional<Move> choose_random_move(
+            Position& pos,
+            std::vector<uint8_t>& random_move_flag,
+            int ply,
+            int& random_move_c);
+
+        bool commit_psv(
+            Thread& th,
+            PSVector& sfens,
+            int8_t lastTurnIsWin,
+            std::atomic<uint64_t>& counter,
+            uint64_t limit,
+            Color result_color);
+
+        void report(uint64_t done, uint64_t new_done);
+
+        void maybe_report(uint64_t done);
+    };
+
+    // Configures the global Search::Limits for training data generation.
+    // Search::Limits is a global shared by all threads, so these settings
+    // affect every search that runs afterwards.
+    void Gensfen::set_gensfen_search_limits()
+    {
+        // Behave like "go infinite" so that no time management takes place.
+        Search::Limits.infinite = true;
+
+        // Suppress PV output; it would only get in the way here.
+        Search::Limits.silent = true;
+
+        // A global node limit would be compared against the nodes accumulated
+        // by each thread, so leave it disabled.
+        Search::Limits.nodes = 0;
+
+        // The depth is supplied as an argument of Learner::search() instead.
+        Search::Limits.depth = 0;
+    }
+
+    // Runs generate_worker on all worker threads with a shared counter, waits
+    // for them to finish, flushes the writer and prints a final stats report
+    // when `limit` is not a multiple of REPORT_STATS_EVERY.
+    void Gensfen::generate(uint64_t limit)
+    {
+        last_stats_report_time = 0;
+        set_gensfen_search_limits();
+
+        // Shared between all workers.
+        std::atomic<uint64_t> counter{0};
+
+        Threads.execute_with_workers(
+            [this, &counter, limit](Thread& worker_thread) {
+                generate_worker(worker_thread, counter, limit);
+            });
+        Threads.wait_for_workers_finished();
+
+        sfen_writer.flush();
+
+        // Report the part that did not fill a whole reporting interval.
+        const uint64_t unreported = limit % REPORT_STATS_EVERY;
+        if (unreported != 0)
+        {
+            report(limit, unreported);
+        }
+
+        std::cout << std::endl;
+    }
+
+    void Gensfen::generate_worker(
+        Thread& th,
+        std::atomic<uint64_t>& counter,
+        uint64_t limit)
+    {
+        // For the time being, it will be treated as a draw
+        // at the maximum number of steps to write.
+        // Maximum StateInfo + Search PV to advance to leaf buffer
+        std::vector<StateInfo, AlignedAllocator<StateInfo>> states(
+            params.write_maxply + MAX_PLY /* == search_depth_min + α */);
+
+        StateInfo si;
+
+        // end flag
+        bool quit = false;
+
+        // repeat until the specified number of times
+        while (!quit)
+        {
+            // It is necessary to set a dependent thread for Position.
+            // When parallelizing, Threads (since this is a vector<Thread*>,
+            // Do the same for up to Threads[0]...Threads[thread_num-1].
+            auto& pos = th.rootPos;
+            if (opening_book != nullptr)
+            {
+                auto& fen = opening_book->next_fen();
+                pos.set(fen, false, &si, &th);
+            }
+            else
+            {
+                pos.set(StartFEN, false, &si, &th);
+            }
+
+            int resign_counter = 0;
+            bool should_resign = prng.rand(10) > 1;
+            // Vector for holding the sfens in the current simulated game.
+            PSVector packed_sfens;
+            packed_sfens.reserve(params.write_maxply + MAX_PLY);
+
+            // Precomputed flags. Used internally by choose_random_move.
+            vector<uint8_t> random_move_flag = generate_random_move_flags();
+
+            // A counter that keeps track of the number of random moves
+            // When random_move_minply == -1, random moves are
+            // performed continuously, so use it at this time.
+            // Used internally by choose_random_move.
+            int actual_random_move_count = 0;
+
+            // Save history of move scores for adjudication
+            vector<int> move_hist_scores;
+
+            auto flush_psv = [&](int8_t result) {
+                quit = commit_psv(th, packed_sfens, result, counter, limit, pos.side_to_move());
+            };
+
+            for (int ply = 0; ; ++ply)
+            {
+                // Current search depth
+                const int depth = params.search_depth_min + (int)prng.rand(params.search_depth_max - params.search_depth_min + 1);
+
+                // Starting search calls init_for_search
+                auto [search_value, search_pv] = Search::search(pos, depth, 1, params.nodes);
+
+                // This has to be performed after search because it needs to know
+                // rootMoves which are filled in init_for_search.
+                const auto result = get_current_game_result(pos, move_hist_scores);
+                if (result.has_value())
+                {
+                    flush_psv(result.value());
+                    break;
+                }
+
+                // Always adjudivate by eval limit.
+                // Also because of this we don't have to check for TB/MATE scores
+                if (abs(search_value) >= params.eval_limit)
+                {
+                    resign_counter++;
+                    if ((should_resign && resign_counter >= 4) || abs(search_value) >= VALUE_KNOWN_WIN) {
+                        flush_psv((search_value >= params.eval_limit) ? 1 : -1);
+                        break;
+                    }
+                }
+                else
+                {
+                    resign_counter = 0;
+                }
+
+                // In case there is no PV and the game was not ended here
+                // there is nothing we can do, we can't continue the game,
+                // we don't know the result, so discard this game.
+                if (search_pv.empty())
+                {
+                    break;
+                }
+
+                // Save the move score for adjudication.
+                move_hist_scores.push_back(search_value);
+
+                // Discard stuff before write_minply is reached
+                // because it can harm training due to overfitting.
+                // Initial positions would be too common.
+                if (ply >= params.write_minply)
+                {
+                    packed_sfens.emplace_back(PackedSfenValue());
+
+                    auto& psv = packed_sfens.back();
+
+                    if (params.ensure_quiet)
+                    {
+                        auto [qsearch_value, qsearch_pv] = Search::qsearch(pos);
+                        if (qsearch_pv.empty())
+                        {
+                            // Here we only write the position data.
+                            // Result is added after the whole game is done.
+                            pos.sfen_pack(psv.sfen);
+
+                            // Already a quiet position
+                            psv.score = search_value;
+                            psv.move = search_pv[0];
+                            psv.gamePly = ply;
+                        }
+                        else
+                        {
+                            // Navigate to a quiet
+                            int old_ply = ply;
+                            for (auto m : qsearch_pv)
+                            {
+                                pos.do_move(m, states[ply++]);
+                            }
+
+                            if (was_seen_before(pos))
+                            {
+                                // Just skip the move.
+                                packed_sfens.pop_back();
+                            }
+                            else
+                            {
+                                // Reevaluate
+                                auto [quiet_search_value, quiet_search_pv] = Search::search(pos, depth, 1, params.nodes);
+                                if (quiet_search_pv.empty())
+                                {
+                                    // Just skip the move.
+                                    packed_sfens.pop_back();
+                                }
+                                else
+                                {
+                                    // Here we only write the position data.
+                                    // Result is added after the whole game is done.
+                                    pos.sfen_pack(psv.sfen);
+
+                                    psv.score = quiet_search_value;
+                                    psv.move = quiet_search_pv[0];
+                                    psv.gamePly = ply;
+                                }
+                            }
+
+                            // Get back to the game
+                            for (auto it = qsearch_pv.rbegin(); it != qsearch_pv.rend(); ++it)
+                            {
+                                pos.undo_move(*it);
+                            }
+                            ply = old_ply;
+                        }
+                    }
+                    else
+                    {
+                        if (was_seen_before(pos))
+                        {
+                            packed_sfens.pop_back();
+                        }
+                        else
+                        {
+                            // Here we only write the position data.
+                            // Result is added after the whole game is done.
+                            pos.sfen_pack(psv.sfen);
+
+                            psv.score = search_value;
+                            psv.move = search_pv[0];
+                            psv.gamePly = ply;
+                        }
+                    }
+                }
+
+                // Update the next move according to best search result or random move.
+                auto random_move = choose_random_move(pos, random_move_flag, ply, actual_random_move_count);
+                const Move next_move = random_move.has_value() ? *random_move : search_pv[0];
+
+                // We don't have the whole game yet, but it ended,
+                // so the writing process ends and the next game starts.
+                // This shouldn't really happen.
+                if (!is_ok(next_move))
+                {
+                    break;
+                }
+
+                // Do move.
+                pos.do_move(next_move, states[ply]);
+            }
+        }
+    }
+
+    bool Gensfen::was_seen_before(const Position& pos)
+    {
+        // Probe the position hashtable with the key of this position.
+        // Finding the very same key in its slot means the position was
+        // seen before. This is a good heuristic to exclude already seen
+        // positions without many false positives.
+
+        const auto pos_key = pos.key();
+        const auto slot = (size_t)(pos_key & (GENSFEN_HASH_SIZE - 1));
+
+        const bool seen = (hash[slot] == pos_key);
+        if (!seen)
+        {
+            // Not seen yet: remember the current key in place
+            // of whatever the slot held so far.
+            hash[slot] = pos_key;
+        }
+
+        return seen;
+    }
+
+    optional<int8_t> Gensfen::get_current_game_result(
+        Position& pos,
+        const vector<int>& move_hist_scores) const
+    {
+        // Checks whether the game can be given a result right now,
+        // judging from the position and the history of move scores.
+        // Returns 0 for a draw, -1 when there is no legal move and
+        // the side to move is in check (mate), and nullopt when no
+        // end-of-game condition is detected.
+
+        // Settings for the draw adjudication.
+        // Todo: Make this as an option.
+
+        // start the adjudication when ply reaches this value
+        constexpr int adj_draw_ply = 80;
+
+        // number of trailing move scores that have to be checked,
+        // i.e. 4 move scores for each side
+        constexpr int adj_draw_cnt = 8;
+
+        // a move score (in CP) counts as a draw score when
+        // its absolute value does not exceed this value
+        constexpr int adj_draw_score = 0;
+
+        // The number of recorded move scores serves as the current ply.
+        const int ply = move_hist_scores.size();
+
+        // For the time being, reaching the maximum number of plies
+        // to write is treated as a draw, just like an actual draw.
+        const bool reached_max_length = ply >= params.write_maxply;
+        if (reached_max_length || pos.is_draw(ply))
+        {
+            return 0;
+        }
+
+        // If there is no legal move
+        if (pos.this_thread()->rootMoves.empty())
+        {
+            if (pos.checkers())
+            {
+                return -1; /* mate */
+            }
+
+            return 0; /* stalemate */
+        }
+
+        // Adjudicate game to a draw if the last 4 scores of each engine is 0.
+        if (params.detect_draw_by_consecutive_low_score && ply >= adj_draw_ply)
+        {
+            // Walk the score history backwards and count the draw scores.
+            // Draw scores must happen on consecutive plies, so the walk
+            // stops at the first score outside of the draw range.
+            int num_cons_draw_plies = 0;
+            auto score_it = move_hist_scores.rbegin();
+            while (num_cons_draw_plies < adj_draw_cnt
+                && score_it != move_hist_scores.rend()
+                && abs(*score_it) <= adj_draw_score)
+            {
+                ++num_cons_draw_plies;
+                ++score_it;
+            }
+
+            if (num_cons_draw_plies >= adj_draw_cnt)
+            {
+                return 0;
+            }
+        }
+
+        // Draw by insufficient mating material
+        if (params.detect_draw_by_insufficient_mating_material)
+        {
+            switch (pos.count<ALL_PIECES>())
+            {
+            // (1) KvK
+            case 2:
+                return 0;
+
+            // (2) KvK + 1 minor piece
+            case 3:
+            {
+                const int minor_pc =
+                    pos.count<BISHOP>(WHITE) + pos.count<KNIGHT>(WHITE) +
+                    pos.count<BISHOP>(BLACK) + pos.count<KNIGHT>(BLACK);
+                if (minor_pc == 1)
+                {
+                    return 0;
+                }
+                break;
+            }
+
+            // (3) KBvKB, bishops of the same color
+            case 4:
+            {
+                if (pos.count<BISHOP>(WHITE) == 1 && pos.count<BISHOP>(BLACK) == 1)
+                {
+                    // Each side has exactly one bishop here, so the bishops
+                    // share a square color exactly when both of them or
+                    // neither of them stands on a dark square.
+                    const bool white_on_dark = (pos.pieces(WHITE, BISHOP) & DarkSquares) != 0;
+                    const bool black_on_dark = (pos.pieces(BLACK, BISHOP) & DarkSquares) != 0;
+                    if (white_on_dark == black_on_dark)
+                    {
+                        return 0;
+                    }
+                }
+                break;
+            }
+
+            // Any other number of pieces is not adjudicated.
+            default:
+                break;
+            }
+        }
+
+        // No end-of-game condition was detected.
+        return nullopt;
+    }
+
+    vector<uint8_t> Gensfen::generate_random_move_flags()
+    {
+        // Depending on the random move selection parameters, set up
+        // the array of flags that indicates whether a random move
+        // is to be taken at a given ply.
+
+        // Make an array of the candidate plies like a[0] = 0, a[1] = 1, ...
+        // random_move_minply, random_move_maxply are specified by 1 origin,
+        // note that we are handling 0 origin here.
+        vector<int> candidate_plies;
+        candidate_plies.reserve((size_t)params.random_move_maxply);
+
+        const int first_ply = std::max(params.random_move_minply - 1, 0);
+        for (int cand = first_ply; cand < params.random_move_maxply; ++cand)
+        {
+            candidate_plies.push_back(cand);
+        }
+
+        // In case of Apery random move, insert() may be called random_move_count times.
+        // Size the flags (all cleared at first) considering it.
+        vector<uint8_t> random_move_flag(
+            (size_t)params.random_move_maxply + params.random_move_count);
+
+        // Only N of the candidates are wanted, so it is enough to
+        // shuffle the first N items with Fisher-Yates and flag them.
+        // N cannot exceed the number of candidates, so limit it.
+        const int num_picks = std::min(params.random_move_count, (int)candidate_plies.size());
+        for (int i = 0; i < num_picks; ++i)
+        {
+            auto& picked = candidate_plies[prng.rand((uint64_t)candidate_plies.size() - i) + i];
+            swap(candidate_plies[i], picked);
+            random_move_flag[candidate_plies[i]] = true;
+        }
+
+        return random_move_flag;
+    }
+
+    optional<Move> Gensfen::choose_random_move(
+        Position& pos,
+        std::vector<uint8_t>& random_move_flag,
+        int ply,
+        int& random_move_c)
+    {
+        // Decides whether a random move is due at this ply and, if so, picks it.
+        // An empty optional is returned when no random move is due.
+        // 1. Random move of random_move_count times from random_move_minply to random_move_maxply
+        const bool is_flagged_ply = params.random_move_minply != -1
+            && ply < (int)random_move_flag.size() && random_move_flag[ply];
+        // 2. A mode to perform random move of random_move_count times after leaving the startpos
+        const bool is_leaving_startpos = params.random_move_minply == -1
+            && random_move_c < params.random_move_count;
+
+        if (!is_flagged_ply && !is_leaving_startpos)
+        {
+            return nullopt;
+        }
+
+        ++random_move_c;
+
+        if (params.random_multi_pv != 0)
+        {
+            Search::search(pos, params.random_multi_pv_depth, params.random_multi_pv);
+
+            // Select one from the top N moves of rootMoves
+            auto& rm = pos.this_thread()->rootMoves;
+            const uint64_t top_n = min((uint64_t)rm.size(), (uint64_t)params.random_multi_pv);
+
+            // The difference from the evaluation value of rm[0] must
+            // be within the range of random_multi_pv_diff, so the
+            // candidates end before the first move outside of that range.
+            // It can be assumed that rm[x].score is arranged in descending order.
+            uint64_t num_candidates = top_n;
+            for (uint64_t i = 1; i < top_n; ++i)
+            {
+                if (rm[0].score > rm[i].score + params.random_multi_pv_diff)
+                {
+                    num_candidates = i;
+                    break;
+                }
+            }
+
+            return rm[prng.rand(num_candidates)].pv[0];
+        }
+
+        // Normal random move.
+        // It's not a mate, so there should be one legal move...
+        MoveList<LEGAL> list(pos);
+        optional<Move> random_move;
+
+        // I don't really know the goodness and badness of making this the Apery method.
+        const bool try_king_move = params.random_move_like_apery != 0
+            && prng.rand(params.random_move_like_apery) == 0;
+        if (try_king_move)
+        {
+            // if you can move the king, move the king
+            Move king_moves[8]; // Near 8
+            size_t num_king_moves = 0;
+            for (auto& m : list)
+            {
+                if (type_of(pos.moved_piece(m)) == KING)
+                {
+                    king_moves[num_king_moves++] = m;
+                }
+            }
+
+            if (num_king_moves != 0)
+            {
+                random_move = king_moves[prng.rand(num_king_moves)];
+
+                // In Apery method, at this time there is a 1/2 chance
+                // that the opponent will also move randomly
+                if (prng.rand(2) == 0)
+                {
+                    // Is it a simple hack to add a "1" next to random_move_flag[ply]?
+                    random_move_flag.insert(random_move_flag.begin() + ply + 1, 1, true);
+                }
+            }
+        }
+
+        if (!random_move.has_value())
+        {
+            // Normally one move from legal move
+            random_move = list.at((size_t)prng.rand((uint64_t)list.size()));
+        }
+
+        return random_move;
+    }
+
+    // Stamp the game outcome on the positions in sfens and write them out to a file.
+    // result: outcome of the game as seen by result_color,
+    // 1 when winning. -1 when losing. Pass 0 for a draw.
+    // Return value: true if the specified number of
+    // sfens has already been reached and the process ends.
+    bool Gensfen::commit_psv(
+        Thread& th,
+        PSVector& sfens,
+        int8_t result,
+        std::atomic<uint64_t>& counter,
+        uint64_t limit,
+        Color result_color)
+    {
+        const bool is_draw = (result == 0);
+        if (is_draw && !params.write_out_draw_game_in_training_data_generation)
+        {
+            // We didn't write anything so why quit.
+            return false;
+        }
+
+        // Give every position the game outcome from the point of view of its side to move.
+        for (auto& psv : sfens)
+        {
+            // The side to move is packed as the lowest bit of the first byte
+            const Color side_to_move = (Color)(psv.sfen.data[0] & 1);
+            if (side_to_move == result_color)
+                psv.game_result = result;
+            else
+                psv.game_result = -result;
+        }
+
+        // Write sfens in move order to make potential compression easier
+        for (auto it = sfens.begin(); it != sfens.end(); ++it)
+        {
+            // Claim the next slot of the shared counter.
+            const uint64_t num_done_before = counter.fetch_add(1);
+            // Return true if there is already enough data generated.
+            if (num_done_before >= limit)
+                return true;
+
+            // because `num_done_before` was done, now we do one more
+            maybe_report(num_done_before + 1);
+
+            // Write out one sfen.
+            sfen_writer.write(th.thread_idx(), *it);
+        }
+
+        return false;
+    }
+
+    void Gensfen::report(uint64_t done, uint64_t new_done)
+    {
+        // Time since the previous report; the extra 1 presumably
+        // keeps the divisor of the speed computation from being zero.
+        const auto current_time = now();
+        const TimePoint elapsed_ms = current_time - last_stats_report_time + 1;
+        const auto sfens_per_second = new_done * 1000 / elapsed_ms;
+
+        out << endl << done << " sfens, " << sfens_per_second << " sfens/second, "
+            << "at " << now_string() << sync_endl;
+
+        // Start a new measurement period and a new output region.
+        last_stats_report_time = current_time;
+        out = sync_region_cout.new_region();
+    }
+
+    void Gensfen::maybe_report(uint64_t done)
+    {
+        // Progress is only reported once every REPORT_DOT_EVERY sfens.
+        if (done % REPORT_DOT_EVERY != 0)
+            return;
+
+        std::lock_guard lock(stats_mutex);
+
+        // No report time recorded yet: start the clock and open an output region.
+        if (last_stats_report_time == 0)
+        {
+            last_stats_report_time = now();
+            out = sync_region_cout.new_region();
+        }
+
+        if (done == 0)
+            return;
+
+        // One dot per reported step, full statistics once every REPORT_STATS_EVERY sfens.
+        out << '.';
+        if (done % REPORT_STATS_EVERY == 0)
+            report(done, REPORT_STATS_EVERY);
+    }
+
+    // Command to generate training data: parses the options from `is`, then runs the generator
+    void gensfen(istringstream& is)
+    {
+        // Number of sfens to generate, default = 8 billion positions (Ponanza specification); set with "loop"
+        uint64_t loop_max = 8000000000UL;
+
+        Gensfen::Params params;
+
+        // Add a random number to the end of the file name.
+        bool random_file_name = false;
+        std::string sfen_format = "binpack";
+
+        string token;
+        while (true)
+        {
+            token = "";
+            is >> token;
+            if (token == "") // nothing could be read: end of the option list
+                break;
+
+            if (token == "depth")
+                is >> params.search_depth_min;
+            else if (token == "depth2")
+                is >> params.search_depth_max;
+            else if (token == "nodes")
+                is >> params.nodes;
+            else if (token == "loop")
+                is >> loop_max;
+            else if (token == "output_file_name")
+                is >> params.output_file_name;
+            else if (token == "eval_limit")
+                is >> params.eval_limit;
+            else if (token == "random_move_minply")
+                is >> params.random_move_minply;
+            else if (token == "random_move_maxply")
+                is >> params.random_move_maxply;
+            else if (token == "random_move_count")
+                is >> params.random_move_count;
+            else if (token == "random_move_like_apery")
+                is >> params.random_move_like_apery;
+            else if (token == "random_multi_pv")
+                is >> params.random_multi_pv;
+            else if (token == "random_multi_pv_diff")
+                is >> params.random_multi_pv_diff;
+            else if (token == "random_multi_pv_depth")
+                is >> params.random_multi_pv_depth;
+            else if (token == "write_minply")
+                is >> params.write_minply;
+            else if (token == "write_maxply")
+                is >> params.write_maxply;
+            else if (token == "save_every")
+                is >> params.save_every;
+            else if (token == "book")
+                is >> params.book;
+            else if (token == "random_file_name")
+                is >> random_file_name;
+            // Accept also the old option name.
+            else if (token == "use_draw_in_training_data_generation" || token == "write_out_draw_game_in_training_data_generation")
+                is >> params.write_out_draw_game_in_training_data_generation;
+            // Accept also the old option name.
+            else if (token == "use_game_draw_adjudication" || token == "detect_draw_by_consecutive_low_score")
+                is >> params.detect_draw_by_consecutive_low_score;
+            else if (token == "detect_draw_by_insufficient_mating_material")
+                is >> params.detect_draw_by_insufficient_mating_material;
+            else if (token == "sfen_format")
+                is >> sfen_format;
+            else if (token == "seed")
+                is >> params.seed;
+            else if (token == "set_recommended_uci_options")
+            {
+                UCI::setoption("Contempt", "0");
+                UCI::setoption("Skill Level", "20");
+                UCI::setoption("UCI_Chess960", "false");
+                UCI::setoption("UCI_AnalyseMode", "false");
+                UCI::setoption("UCI_LimitStrength", "false");
+                UCI::setoption("PruneAtShallowDepth", "false");
+                UCI::setoption("EnableTranspositionTable", "true");
+            }
+            else if (token == "ensure_quiet")
+            {
+                params.ensure_quiet = true;
+            }
+            else
+                cout << "ERROR: Ignoring unknown option " << token << endl;
+        }
+
+        if (!sfen_format.empty())
+        {
+            if (sfen_format == "bin")
+                params.sfen_format = SfenOutputType::Bin;
+            else if (sfen_format == "binpack")
+                params.sfen_format = SfenOutputType::Binpack;
+            else
+                cout << "WARNING: Unknown sfen format `" << sfen_format << "`. Using bin\n"; // NOTE(review): params.sfen_format is not assigned here - confirm that its default really is Bin
+        }
+
+        if (params.ensure_quiet)
+        {
+            // Otherwise we can't ensure quiet positions...
+            UCI::setoption("EnableTranspositionTable", "false");
+        }
+
+        if (random_file_name)
+        {
+            // Give a random number to output_file_name at this point.
+            // Do not use std::random_device(), because it always yields the same integers on MinGW.
+            PRNG r(params.seed);
+
+            // Just in case, discard the first few random numbers.
+            for (int i = 0; i < 10; ++i)
+                r.rand(1);
+
+            auto to_hex = [](uint64_t u) {
+                std::stringstream ss;
+                ss << std::hex << u;
+                return ss.str();
+            };
+
+            // I don't want two 64bit numbers to collide by accident, so I'm going to append two 64bit numbers just in case.
+            params.output_file_name += "_" + to_hex(r.rand<uint64_t>()) + to_hex(r.rand<uint64_t>());
+        }
+
+        params.enforce_constraints();
+
+        std::cout << "INFO: Executing gensfen command\n";
+
+        std::cout << "INFO: Parameters:\n";
+        std::cout
+            << "  - search_depth_min       = " << params.search_depth_min << endl
+            << "  - search_depth_max       = " << params.search_depth_max << endl
+            << "  - nodes                  = " << params.nodes << endl
+            << "  - num sfens to generate  = " << loop_max << endl
+            << "  - eval_limit             = " << params.eval_limit << endl
+            << "  - num threads (UCI)      = " << params.num_threads << endl
+            << "  - random_move_minply     = " << params.random_move_minply << endl
+            << "  - random_move_maxply     = " << params.random_move_maxply << endl
+            << "  - random_move_count      = " << params.random_move_count << endl
+            << "  - random_move_like_apery = " << params.random_move_like_apery << endl
+            << "  - random_multi_pv        = " << params.random_multi_pv << endl
+            << "  - random_multi_pv_diff   = " << params.random_multi_pv_diff << endl
+            << "  - random_multi_pv_depth  = " << params.random_multi_pv_depth << endl
+            << "  - write_minply           = " << params.write_minply << endl
+            << "  - write_maxply           = " << params.write_maxply << endl
+            << "  - book                   = " << params.book << endl
+            << "  - output_file_name       = " << params.output_file_name << endl
+            << "  - save_every             = " << params.save_every << endl
+            << "  - random_file_name       = " << random_file_name << endl
+            << "  - write_drawn_games      = " << params.write_out_draw_game_in_training_data_generation << endl
+            << "  - draw by low score      = " << params.detect_draw_by_consecutive_low_score << endl
+            << "  - draw by insuff. mat.   = " << params.detect_draw_by_insufficient_mating_material << endl;
+
+        // Show if the training data generator uses NNUE.
+        Eval::NNUE::verify_eval_file_loaded();
+
+        Threads.main()->ponder = false;
+
+        Gensfen gensfen(params);
+        gensfen.generate(loop_max);
+
+        std::cout << "INFO: Gensfen finished." << endl;
+    }
+}
@@ -0,0 +1,14 @@
+#ifndef _GENSFEN_H_
+#define _GENSFEN_H_
+
+#include "position.h"
+
+#include <sstream>
+
+namespace Learner {
+
+    // "gensfen" command: automatic generation of teacher positions; `is` holds the command options
+    void gensfen(std::istringstream& is);
+}
+
+#endif
@@ -1 +0,0 @@
-// just a place holder
@@ -7,126 +7,126 @@
 // Floating point operation by 16bit type
 // Assume that the float type code generated by the compiler is in IEEE 754 format and use it.

-#include "../types.h"
+#include "types.h"

 namespace HalfFloat
 {
-	// IEEE 754 float 32 format is :
-	//   sign(1bit) + exponent(8bits) + fraction(23bits) = 32bits
-	//
-	// Our float16 format is :
-	//   sign(1bit) + exponent(5bits) + fraction(10bits) = 16bits
-	union float32_converter
-	{
-		int32_t n;
-		float f;
-	};
+    // IEEE 754 float 32 format is :
+    //   sign(1bit) + exponent(8bits) + fraction(23bits) = 32bits
+    //
+    // Our float16 format is :
+    //   sign(1bit) + exponent(5bits) + fraction(10bits) = 16bits
+    union float32_converter
+    {
+        int32_t n;
+        float f;
+    };


-	// 16-bit float
-	struct float16
-	{
-		// --- constructors
+    // 16-bit float
+    struct float16
+    {
+        // --- constructors

-		float16() {}
-		float16(int16_t n) { from_float((float)n);  }
-		float16(int32_t n) { from_float((float)n); }
-		float16(float n) { from_float(n); }
-		float16(double n) { from_float((float)n); }
+        float16() {}
+        float16(int16_t n) { from_float((float)n);  }
+        float16(int32_t n) { from_float((float)n); }
+        float16(float n) { from_float(n); }
+        float16(double n) { from_float((float)n); }

-		// build from a float
-		void from_float(float f) { *this = to_float16(f); }
+        // build from a float
+        void from_float(float f) { *this = to_float16(f); }

-		// --- implicit converters
+        // --- implicit converters

-		operator int32_t() const { return (int32_t)to_float(*this); }
-		operator float() const { return to_float(*this); }
-		operator double() const { return double(to_float(*this)); }
+        operator int32_t() const { return (int32_t)to_float(*this); }
+        operator float() const { return to_float(*this); }
+        operator double() const { return double(to_float(*this)); }

-		// --- operators
+        // --- operators

-		float16 operator += (float16 rhs) { from_float(to_float(*this) + to_float(rhs)); return *this; }
-		float16 operator -= (float16 rhs) { from_float(to_float(*this) - to_float(rhs)); return *this; }
-		float16 operator *= (float16 rhs) { from_float(to_float(*this) * to_float(rhs)); return *this; }
-		float16 operator /= (float16 rhs) { from_float(to_float(*this) / to_float(rhs)); return *this; }
-		float16 operator + (float16 rhs) const { return float16(*this) += rhs; }
-		float16 operator - (float16 rhs) const { return float16(*this) -= rhs; }
-		float16 operator * (float16 rhs) const { return float16(*this) *= rhs; }
-		float16 operator / (float16 rhs) const { return float16(*this) /= rhs; }
-		float16 operator - () const { return float16(-to_float(*this)); }
-		bool operator == (float16 rhs) const { return this->v_ == rhs.v_; }
-		bool operator != (float16 rhs) const { return !(*this == rhs); }
+        float16 operator += (float16 rhs) { from_float(to_float(*this) + to_float(rhs)); return *this; }
+        float16 operator -= (float16 rhs) { from_float(to_float(*this) - to_float(rhs)); return *this; }
+        float16 operator *= (float16 rhs) { from_float(to_float(*this) * to_float(rhs)); return *this; }
+        float16 operator /= (float16 rhs) { from_float(to_float(*this) / to_float(rhs)); return *this; }
+        float16 operator + (float16 rhs) const { return float16(*this) += rhs; }
+        float16 operator - (float16 rhs) const { return float16(*this) -= rhs; }
+        float16 operator * (float16 rhs) const { return float16(*this) *= rhs; }
+        float16 operator / (float16 rhs) const { return float16(*this) /= rhs; }
+        float16 operator - () const { return float16(-to_float(*this)); }
+        bool operator == (float16 rhs) const { return this->v_ == rhs.v_; }
+        bool operator != (float16 rhs) const { return !(*this == rhs); }

-		static void UnitTest() { unit_test(); }
+        static void UnitTest() { unit_test(); }

-	private:
+    private:

-		// --- entity
+        // --- entity

-		uint16_t v_;
+        uint16_t v_;

-		// --- conversion between float and float16
+        // --- conversion between float and float16

-		static float16 to_float16(float f)
-		{
-			float32_converter c;
-			c.f = f;
-			u32 n = c.n;
+        static float16 to_float16(float f)
+        {
+            float32_converter c;
+            c.f = f;
+            u32 n = c.n;

-			// The sign bit is MSB in common.
-			uint16_t sign_bit = (n >> 16) & 0x8000;
+            // The sign bit is MSB in common.
+            uint16_t sign_bit = (n >> 16) & 0x8000;

-			// The exponent of IEEE 754's float 32 is biased +127 , so we change this bias into +15 and limited to 5-bit.
-			uint16_t exponent = (((n >> 23) - 127 + 15) & 0x1f) << 10;
+            // The exponent of IEEE 754's float 32 is biased +127 , so we change this bias into +15 and limited to 5-bit.
+            uint16_t exponent = (((n >> 23) - 127 + 15) & 0x1f) << 10;

-			// The fraction is limited to 10-bit.
-			uint16_t fraction = (n >> (23-10)) & 0x3ff;
+            // The fraction is limited to 10-bit.
+            uint16_t fraction = (n >> (23-10)) & 0x3ff;

-			float16 f_;
-			f_.v_ = sign_bit | exponent | fraction;
+            float16 f_;
+            f_.v_ = sign_bit | exponent | fraction;

-			return f_;
-		}
+            return f_;
+        }

-		static float to_float(float16 v)
-		{
-			u32 sign_bit = (v.v_ & 0x8000) << 16;
-			u32 exponent = ((((v.v_ >> 10) & 0x1f) - 15 + 127) & 0xff) << 23;
-			u32 fraction = (v.v_ & 0x3ff) << (23 - 10);
+        static float to_float(float16 v)
+        {
+            u32 sign_bit = (v.v_ & 0x8000) << 16;
+            u32 exponent = ((((v.v_ >> 10) & 0x1f) - 15 + 127) & 0xff) << 23;
+            u32 fraction = (v.v_ & 0x3ff) << (23 - 10);

-			float32_converter c;
-			c.n = sign_bit | exponent | fraction;
-			return c.f;
-		}
+            float32_converter c;
+            c.n = sign_bit | exponent | fraction;
+            return c.f;
+        }

-		// It is not a unit test, but I confirmed that it can be calculated. I'll fix the code later (maybe).
-		static void unit_test()
-		{
-			float16 a, b, c, d;
-			a = 1;
-			std::cout << (float)a << std::endl;
-			b = -118.625;
-			std::cout << (float)b << std::endl;
-			c = 2.5;
-			std::cout << (float)c << std::endl;
-			d = a + c;
-			std::cout << (float)d << std::endl;
+        // It is not a unit test, but I confirmed that it can be calculated. I'll fix the code later (maybe).
+        static void unit_test()
+        {
+            float16 a, b, c, d;
+            a = 1;
+            std::cout << (float)a << std::endl;
+            b = -118.625;
+            std::cout << (float)b << std::endl;
+            c = 2.5;
+            std::cout << (float)c << std::endl;
+            d = a + c;
+            std::cout << (float)d << std::endl;

-			c *= 1.5;
-			std::cout << (float)c << std::endl;
+            c *= 1.5;
+            std::cout << (float)c << std::endl;

-			b /= 3;
-			std::cout << (float)b << std::endl;
+            b /= 3;
+            std::cout << (float)b << std::endl;

-			float f1 = 1.5;
-			a += f1;
-			std::cout << (float)a << std::endl;
+            float f1 = 1.5;
+            a += f1;
+            std::cout << (float)a << std::endl;

-			a += f1 * (float)a;
-			std::cout << (float)a << std::endl;
-		}
+            a += f1 * (float)a;
+            std::cout << (float)a << std::endl;
+        }

-	};
+    };

 }

@@ -1,101 +1,6 @@
 #ifndef _LEARN_H_
 #define _LEARN_H_

-#if defined(EVAL_LEARN)
-
-#include <vector>
-
-// =====================
-// Settings for learning
-// =====================
-
-// If you select one of the following, the details after that will be automatically selected.
-// If you don't select any of them, you need to set the subsequent details one by one.
-
-// Learning setting by elmo method. This is the default setting.
-// To make a standard squeeze diaphragm, specify "lambda 1" with the learn command.
-#define LEARN_ELMO_METHOD
-
-
-// ----------------------
-// update formula
-// ----------------------
-
-// Ada Grad. Recommended because it is stable.
-// #define ADA_GRAD_UPDATE
-
-// SGD looking only at the sign of the gradient. It requires less memory, but the accuracy is...
-// #define SGD_UPDATE
-
-// ----------------------
-// Settings for learning
-// ----------------------
-
-// mini-batch size.
-// Calculate the gradient by combining this number of phases.
-// If you make it smaller, the number of update_weights() will increase and the convergence will be faster. The gradient is incorrect.
-// If you increase it, the number of update_weights() decreases, so the convergence will be slow. The slope will come out accurately.
-// I don't think you need to change this value in most cases.
-
-#define LEARN_MINI_BATCH_SIZE (1000 * 1000 * 1)
-
-// The number of phases to read from the file at one time. After reading this much, shuffle.
-// It is better to have a certain size, but this number x 40 bytes x 3 times as much memory is consumed. 400MB*3 is consumed in the 10M phase.
-// Must be a multiple of THREAD_BUFFER_SIZE(=10000).
-
-#define LEARN_SFEN_READ_SIZE (1000 * 1000 * 10)
-
-// Saving interval of evaluation function at learning. Save each time you learn this number of phases.
-// Needless to say, the longer the saving interval, the shorter the learning time.
-// Folder name is incremented for each save like 0/, 1/, 2/...
-// By default, once every 1 billion phases.
-#define LEARN_EVAL_SAVE_INTERVAL (1000000000ULL)
-
-
-// ----------------------
-// Select the objective function
-// ----------------------
-
-// The objective function is the sum of squares of the difference in winning percentage
-// See learner.cpp for more information.
-
-//#define LOSS_FUNCTION_IS_WINNING_PERCENTAGE
-
-// Objective function is cross entropy
-// See learner.cpp for more information.
-// So-called ordinary "rag cloth squeezer"
-//#define LOSS_FUNCTION_IS_CROSS_ENTOROPY
-
-// A version in which the objective function is cross entropy, but the win rate function is not passed
-// #define LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE
-
-// elmo (WCSC27) method
-// #define LOSS_FUNCTION_IS_ELMO_METHOD
-
-// ※ Other things may be added.
-
-
-// ----------------------
-// debug settings for learning
-// ----------------------
-
-// Reduce the output of rmse during learning to 1 for this number of times.
-// rmse calculation is done in one thread, so it takes some time, so reducing the output is effective.
-#define LEARN_RMSE_OUTPUT_INTERVAL 1
-
-
-// ----------------------
-// learning from zero vector
-// ----------------------
-
-// Start learning the evaluation function parameters from the zero vector.
-// Initialize to zero, generate a game, learn from zero vector,
-// Game generation → If you repeat learning, you will get parameters that do not depend on the professional game. (maybe)
-// (very time consuming)
-
-//#define RESET_TO_ZERO_VECTOR
-
-
 // ----------------------
 // Floating point for learning
 // ----------------------
@@ -105,7 +10,7 @@
 // Even if it is a double type, there is almost no difference in the way of convergence, so fix it to float.

 // when using float
-typedef float LearnFloatType;
+using LearnFloatType = float;

 // when using double
 //typedef double LearnFloatType;
@@ -114,59 +19,6 @@ typedef float LearnFloatType;
 //#include "half_float.h"
 //typedef HalfFloat::float16 LearnFloatType;

-// ----------------------
-// save memory
-// ----------------------
-
-// Use a triangular array for the Weight array (of which is KPP) to save memory.
-// If this is used, the weight array for learning will be about 3 times as large as the evaluation function file.
-
-#define USE_TRIANGLE_WEIGHT_ARRAY
-
-// ----------------------
-// dimension down
-// ----------------------
-
-// Dimension reduction for mirrors (left/right symmetry) and inverse (forward/backward symmetry).
-// All on by default.
-
-// Dimension reduction using mirror and inverse for KK. (Unclear effect)
-// USE_KK_MIRROR_WRITE must be on when USE_KK_INVERSE_WRITE is on.
-#define USE_KK_MIRROR_WRITE
-#define USE_KK_INVERSE_WRITE
-
-// Dimension reduction using Mirror and Inverse for KKP. (Inverse is not so effective)
-// When USE_KKP_INVERSE_WRITE is turned on, USE_KKP_MIRROR_WRITE must also be turned on.
-#define USE_KKP_MIRROR_WRITE
-#define USE_KKP_INVERSE_WRITE
-
-// Perform dimension reduction using a mirror for KPP. (Turning this off requires double the teacher position)
-// KPP has no inverse. (Because there is only K on the front side)
-#define USE_KPP_MIRROR_WRITE
-
-// Perform a dimension reduction using a mirror for KPPP. (Turning this off requires double the teacher position)
-// KPPP has no inverse. (Because there is only K on the front side)
-#define USE_KPPP_MIRROR_WRITE
-
-// Reduce the dimension by KPP for learning the KKPP component.
-// Learning is very slow.
-// Do not use as it is not debugged.
-//#define USE_KKPP_LOWER_DIM
-
-
-// ======================
-// Settings for creating teacher phases
-// ======================
-
-// ----------------------
-// write out the draw
-// ----------------------
-
-// When you reach a draw, write it out as a teacher position
-// It's subtle whether it's better to do this.
-// #define LEARN_GENSFEN_USE_DRAW_RESULT
-
-
 // ======================
 // configure
 // ======================
@@ -175,63 +27,122 @@ typedef float LearnFloatType;
 // Learning with the method of elmo (WCSC27)
 // ----------------------

-#if defined( LEARN_ELMO_METHOD )
-#define LOSS_FUNCTION_IS_ELMO_METHOD
-#define ADA_GRAD_UPDATE
-#endif
-
+#define LOSS_FUNCTION "ELMO_METHOD(WCSC27)"

 // ----------------------
 // Definition of struct used in Learner
 // ----------------------
-#include "../position.h"
+
+#include "autograd.h"
+#include "packed_sfen.h"
+
+#include "position.h"
+
+#include <sstream>
+#include <vector>
+#include <mutex>
+#include <string>

 namespace Learner
 {
-	//Structure in which PackedSfen and evaluation value are integrated
-	// If you write different contents for each option, it will be a problem when reusing the teacher game
-	// For the time being, write all the following members regardless of the options.
-	struct PackedSfenValue
-	{
-		// phase
-		PackedSfen sfen;
+    // ----------------------
+    // Settings for learning
+    // ----------------------

-		// Evaluation value returned from Learner::search()
-		int16_t score;
+    // Mini-batch size.
+    // The gradient is calculated over this many positions at a time.
+    // A smaller value means more calls to update_weights() and faster convergence, but a less accurate gradient.
+    // A larger value means fewer calls to update_weights() and slower convergence, but a more accurate gradient.
+    // In most cases there should be no need to change this value.

-		// PV first move
-		// Used when finding the match rate with the teacher
-		uint16_t move;
+    constexpr std::size_t LEARN_MINI_BATCH_SIZE = 1000 * 1000 * 1;

-		// Trouble of the phase from the initial phase.
-		uint16_t gamePly;
+    // Saving interval of the evaluation function during learning: save each time this many positions have been learned.
+    // Needless to say, the longer the saving interval, the shorter the learning time.
+    // The folder name is incremented for each save, like 0/, 1/, 2/...
+    // By default, once every 100 million positions.
+    constexpr std::size_t LEARN_EVAL_SAVE_INTERVAL = 100'000'000ULL;

-		// 1 if the player on this side ultimately wins the game. -1 if you are losing.
-		// 0 if a draw is reached.
-		// The draw is in the teacher position generation command gensfen,
-		// Only write if LEARN_GENSFEN_DRAW_RESULT is enabled.
-		int8_t game_result;
+    // Output the rmse during learning only once per this many times.
+    // The rmse calculation is done in one thread and takes some time, so outputting it less often is effective.
+    constexpr std::size_t LEARN_RMSE_OUTPUT_INTERVAL = 1;

-		// When exchanging the file that wrote the teacher aspect with other people
-		//Because this structure size is not fixed, pad it so that it is 40 bytes in any environment.
-		uint8_t padding;
+    // Learning from the generated game record
+    void learn(std::istringstream& is);

-		// 32 + 2 + 2 + 2 + 1 + 1 = 40bytes
-	};
+    using CalcLossFunc = ValueWithGrad<double>(Value, Value, int, int);

-	// Type that returns the reading line and the evaluation value at that time
-	// Used in Learner::search(), Learner::qsearch().
-	typedef std::pair<Value, std::vector<Move> > ValueAndPV;
+    struct Loss
+    {
+        double value() const // running total of the loss values added so far (not divided by count()); read without taking m_mutex
+        {
+            return m_loss.value;
+        }

-	// So far, only Yaneura King 2018 Otafuku has this stub
-	// This stub is required if EVAL_LEARN is defined.
-	extern Learner::ValueAndPV  search(Position& pos, int depth , size_t multiPV = 1 , uint64_t NodesLimit = 0);
-	extern Learner::ValueAndPV qsearch(Position& pos);
+        double grad() const // running total of the grad components added so far (not divided by count()); read without taking m_mutex
+        {
+            return m_loss.grad;
+        }

-	double calc_grad(Value shallow, const PackedSfenValue& psv);
+        uint64_t count() const // number of samples accumulated since construction or the last reset(); read without taking m_mutex
+        {
+            return m_count;
+        }

+        Loss() = default;
+
+        Loss(const Loss& other) : // copies the accumulated totals and the sample count of `other`
+            m_loss(other.m_loss),
+            m_count(other.m_count)
+        { // m_mutex is not in the initializer list, so the copy gets its own fresh mutex; `other` is read here without locking it
+        }
+
+        Loss& operator += (const ValueWithGrad<double>& rhs) // adds one sample to the running totals, under m_mutex
+        {
+            std::unique_lock lock(m_mutex); // released when the function returns
+
+            m_loss += rhs.abs(); // abs() is declared with ValueWithGrad, outside this view; presumably takes absolute values — TODO confirm
+            m_count += 1;
+
+            return *this;
+        }
+
+        Loss& operator += (const Loss& rhs) // merges the totals and the sample count of another accumulator into this one
+        {
+            std::unique_lock lock(m_mutex); // only this object's mutex is taken; rhs is read without locking rhs.m_mutex
+
+            m_loss += rhs.m_loss.abs(); // abs() is declared with ValueWithGrad, outside this view — TODO confirm its exact effect
+            m_count += rhs.m_count;
+
+            return *this;
+        }
+
+        void reset() // clears the running totals and the sample count, under m_mutex
+        {
+            std::unique_lock lock(m_mutex); // released when the function returns
+
+            m_loss = ValueWithGrad<double>{ 0.0, 0.0 };
+            m_count = 0;
+        }
+
+        template <typename StreamT> // StreamT: anything that accepts operator<< with strings, doubles and std::endl
+        void print_with_grad(const std::string& prefix, StreamT& s) const // writes the per-sample averages of value and grad to s
+        {
+            s << "  - " << prefix << "_loss       = " << m_loss.value / (double)m_count << std::endl; // m_count is not checked for 0; m_mutex is not taken
+            s << "  - " << prefix << "_grad_norm  = " << m_loss.grad / (double)m_count << std::endl;
+        }
+
+        template <typename StreamT> // StreamT: anything that accepts operator<< with strings, doubles and std::endl
+        void print_only_loss(const std::string& prefix, StreamT& s) const // writes only the per-sample average of value to s
+        {
+            s << "  - " << prefix << "_loss       = " << m_loss.value / (double)m_count << std::endl; // m_count is not checked for 0; m_mutex is not taken
+        }
+
+    private:
+        ValueWithGrad<double> m_loss{ 0.0, 0.0 };
+        uint64_t m_count{0};
+        std::mutex m_mutex;
+    };
 }

-#endif
-
 #endif // ifndef _LEARN_H_
@@ -1,25 +0,0 @@
-#include "learning_tools.h"
-
-#if defined (EVAL_LEARN)
-
-#if defined(_OPENMP)
-#include <omp.h>
-#endif
-#include "../misc.h"
-
-using namespace Eval;
-
-namespace EvalLearningTools
-{
-
-	// --- static variables
-
-	double Weight::eta;
-	double Weight::eta1;
-	double Weight::eta2;
-	double Weight::eta3;
-	uint64_t Weight::eta1_epoch;
-	uint64_t Weight::eta2_epoch;
-}
-
-#endif
@@ -1,200 +0,0 @@
-#ifndef __LEARN_WEIGHT_H__
-#define __LEARN_WEIGHT_H__
-
-// A set of machine learning tools related to the weight array used for machine learning of evaluation functions
-
-#include "learn.h"
-#if defined (EVAL_LEARN)
-#include <array>
-
-#if defined(SGD_UPDATE) || defined(USE_KPPP_MIRROR_WRITE)
-#include "../misc.h"  // PRNG , my_insertion_sort
-#endif
-
-#include <cmath>	// std::sqrt()
-
-namespace EvalLearningTools
-{
-	// -------------------------------------------------
-	//   Array for learning that stores gradients etc.
-	// -------------------------------------------------
-
-#if defined(_MSC_VER)
-#pragma pack(push,2)
-#elif defined(__GNUC__)
-#pragma pack(2)
-#endif
-	struct Weight
-	{
-		// cumulative value of one mini-batch gradient
-		LearnFloatType g = LearnFloatType(0);
-
-		// When ADA_GRAD_UPDATE. LearnFloatType == float,
-		// total 4*2 + 4*2 + 1*2 = 18 bytes
-		// It suffices to secure a Weight array that is 4.5 times the size of the evaluation function parameter of 1GB.
-		// However, sizeof(Weight)==20 code is generated if the structure alignment is in 4-byte units, so
-		// Specify pragma pack(2).
-
-		// For SGD_UPDATE, this structure is reduced by 10 bytes to 8 bytes.
-
-		// Learning rate η(eta) such as AdaGrad.
-		// It is assumed that eta1,2,3,eta1_epoch,eta2_epoch have been set by the time updateFV() is called.
-		// The epoch of update_weights() gradually changes from eta1 to eta2 until eta1_epoch.
-		// After eta2_epoch, gradually change from eta2 to eta3.
-		static double eta;
-		static double eta1;
-		static double eta2;
-		static double eta3;
-		static uint64_t eta1_epoch;
-		static uint64_t eta2_epoch;
-
-		// Batch initialization of eta. If 0 is passed, the default value will be set.
-		static void init_eta(double eta1, double eta2, double eta3, uint64_t eta1_epoch, uint64_t eta2_epoch)
-		{
-			Weight::eta1 = (eta1 != 0) ? eta1 : 30.0;
-			Weight::eta2 = (eta2 != 0) ? eta2 : 30.0;
-			Weight::eta3 = (eta3 != 0) ? eta3 : 30.0;
-			Weight::eta1_epoch = (eta1_epoch != 0) ? eta1_epoch : 0;
-			Weight::eta2_epoch = (eta2_epoch != 0) ? eta2_epoch : 0;
-		}
-
-		// Set eta according to epoch.
-		static void calc_eta(uint64_t epoch)
-		{
-			if (Weight::eta1_epoch == 0) // Exclude eta2
-				Weight::eta = Weight::eta1;
-			else if (epoch < Weight::eta1_epoch)
-				// apportion
-				Weight::eta = Weight::eta1 + (Weight::eta2 - Weight::eta1) * epoch / Weight::eta1_epoch;
-			else if (Weight::eta2_epoch == 0) // Exclude eta3
-				Weight::eta = Weight::eta2;
-			else if (epoch < Weight::eta2_epoch)
-				Weight::eta = Weight::eta2 + (Weight::eta3 - Weight::eta2) * (epoch - Weight::eta1_epoch) / (Weight::eta2_epoch - Weight::eta1_epoch);
-			else
-				Weight::eta = Weight::eta3;
-		}
-
-		template <typename T> void updateFV(T& v) { updateFV(v, 1.0); }
-
-#if defined (ADA_GRAD_UPDATE)
-
-		// Since the maximum value that can be accurately calculated with float is INT16_MAX*256-1
-		// Keep the small value as a marker.
-		const LearnFloatType V0_NOT_INIT = (INT16_MAX * 128);
-
-		// What holds v internally. The previous implementation kept a fixed decimal with only a fractional part to save memory,
-		// Since it is doubtful in accuracy and the visibility is bad, it was abolished.
-		LearnFloatType v0 = LearnFloatType(V0_NOT_INIT);
-
-		// AdaGrad g2
-		LearnFloatType g2 = LearnFloatType(0);
-
-		// update with AdaGrad
-		// When executing this function, the value of g and the member do not change
-		// Guaranteed by the caller. It does not have to be an atomic operation.
-		// k is a coefficient for eta. 1.0 is usually sufficient. If you want to lower eta for your turn item, set this to 1/8.0 etc.
-		template <typename T>
-		void updateFV(T& v,double k)
-		{
-			// AdaGrad update formula
-			// Gradient vector is g, vector to be updated is v, η(eta) is a constant,
-			//     g2 = g2 + g^2
-			//     v = v - ηg/sqrt(g2)
-
-			constexpr double epsilon = 0.000001;
-
-			if (g == LearnFloatType(0))
-				return;
-
-			g2 += g * g;
-
-			// If v0 is V0_NOT_INIT, it means that the value is not initialized with the value of KK/KKP/KPP array,
-			// In this case, read the value of v from the one passed in the argument.
-			double V = (v0 == V0_NOT_INIT) ? v : v0;
-
-			V -= k * eta * (double)g / sqrt((double)g2 + epsilon);
-
-			// Limit the value of V to be within the range of types.
-			// By the way, windows.h defines the min and max macros, so to avoid it,
-			// Here, it is enclosed in parentheses so that it is not treated as a function-like macro.
-			V = (std::min)((double)(std::numeric_limits<T>::max)() , V);
-			V = (std::max)((double)(std::numeric_limits<T>::min)() , V);
-
-			v0 = (LearnFloatType)V;
-			v = (T)round(V);
-
-			// Clear g because one update of mini-batch for this element is over
-			// g[i] = 0;
-			// → There is a problem of dimension reduction, so this will be done by the caller.
-		}
-
-#elif defined(SGD_UPDATE)
-
-		// See only the sign of the gradient Update with SGD
-		// When executing this function, the value of g and the member do not change
-		// Guaranteed by the caller. It does not have to be an atomic operation.
-		template <typename T>
-		void updateFV(T & v , double k)
-		{
-			if (g == 0)
-				return;
-
-			// See only the sign of g and update.
-			// If g <0, add v a little.
-			// If g> 0, subtract v slightly.
-
-			// Since we only add integers, no decimal part is required.
-
-			// It's a good idea to move around 0-5.
-			// It is better to have a Gaussian distribution, so generate a 5-bit random number (each bit has a 1/2 probability of 1),
-			// Pop_count() it. At this time, it has a binomial distribution.
-			//int16_t diff = (int16_t)POPCNT32((u32)prng.rand(31));
-			// → If I do this with 80 threads, this AsyncPRNG::rand() locks, so I slowed down. This implementation is not good.
-			int16_t diff = 1;
-
-			double V = v;
-			if (g > 0.0)
-				V-= diff;
-			else
-				V+= diff;
-
-			V = (std::min)((double)(std::numeric_limits<T>::max)(), V);
-			V = (std::max)((double)(std::numeric_limits<T>::min)(), V);
-
-			v = (T)V;
-		}
-
-#endif
-
-		// grad setting
-		template <typename T> void set_grad(const T& g_) { g = g_; }
-
-		// Add grad
-		template <typename T> void add_grad(const T& g_) { g += g_; }
-
-		LearnFloatType get_grad() const { return g; }
-	};
-#if defined(_MSC_VER)
-#pragma pack(pop)
-#elif defined(__GNUC__)
-#pragma pack(0)
-#endif
-
-	// Turned weight array
-	// In order to be able to handle it transparently, let's have the same member as Weight.
-	struct Weight2
-	{
-		Weight w[2];
-
-		//Evaluate your turn, eta 1/8.
-		template <typename T> void updateFV(std::array<T, 2>& v) { w[0].updateFV(v[0] , 1.0); w[1].updateFV(v[1],1.0/8.0); }
-
-		template <typename T> void set_grad(const std::array<T, 2>& g) { for (int i = 0; i<2; ++i) w[i].set_grad(g[i]); }
-		template <typename T> void add_grad(const std::array<T, 2>& g) { for (int i = 0; i<2; ++i) w[i].add_grad(g[i]); }
-
-		std::array<LearnFloatType, 2> get_grad() const { return std::array<LearnFloatType, 2>{w[0].get_grad(), w[1].get_grad()}; }
-	};
-}
-
-#endif // defined (EVAL_LEARN)
-#endif
@@ -1,123 +0,0 @@
-#include "../types.h"
-
-#if defined(EVAL_LEARN)
-
-#include "multi_think.h"
-#include "../tt.h"
-#include "../uci.h"
-
-#include <thread>
-
-void MultiThink::go_think()
-{
-	// Keep a copy to restore the Options settings later.
-	auto oldOptions = Options;
-
-	// When using the constant track, it takes a lot of time to perform on the fly & the part to access the file is
-	// Since it is not thread safe, it is guaranteed here that it is being completely read in memory.
-	Options["BookOnTheFly"] = std::string("false");
-
-	// Read evaluation function, etc.
-	// In the case of the learn command, the value of the evaluation function may be corrected after reading the evaluation function, so
-	// Skip memory corruption check.
-	Eval::init_NNUE();
-
-	// Call the derived class's init().
-	init();
-
-	// The loop upper limit is set with set_loop_max().
-	loop_count = 0;
-	done_count = 0;
-
-	// Create threads as many as Options["Threads"] and start thinking.
-	std::vector<std::thread> threads;
-	auto thread_num = (size_t)Options["Threads"];
-
-	// Secure end flag of worker thread
-	thread_finished.resize(thread_num);
-	
-	// start worker thread
-	for (size_t i = 0; i < thread_num; ++i)
-	{
-		thread_finished[i] = 0;
-		threads.push_back(std::thread([i, this]
-		{ 
-			// exhaust all processor threads.
-			WinProcGroup::bindThisThread(i);
-
-			// execute the overridden process
-			this->thread_worker(i);
-
-			// Set the end flag because the thread has ended
-			this->thread_finished[i] = 1;
-		}));
-	}
-
-	// wait for all threads to finish
-	// for (auto& th :threads)
-	// th.join();
-	// If you write like, the thread will rush here while it is still working,
-	// During that time, callback_func() cannot be called and you cannot save.
-	// Therefore, you need to check the end flag yourself.
-
-	// function to determine if all threads have finished
-	auto threads_done = [&]()
-	{
-		// returns false if no one is finished
-		for (auto& f : thread_finished)
-			if (!f)
-				return false;
-		return true;
-	};
-
-	// Call back if the callback function is set.
-	auto do_a_callback = [&]()
-	{
-		if (callback_func)
-			callback_func();
-	};
-
-
-	for (uint64_t i = 0 ; ; )
-	{
-		// If all threads have finished, exit the loop.
-		if (threads_done())
-			break;
-
-		sleep(1000);
-
-		// callback_func() is called every callback_seconds.
-		if (++i == callback_seconds)
-		{
-			do_a_callback();
-			// Since I am returning from ↑, I reset the counter, so
-			// no matter how long it takes to save() etc. in do_a_callback()
-			// The next call will take a certain amount of time.
-			i = 0;
-		}
-	}
-
-	// Last save.
-	std::cout << std::endl << "finalize..";
-
-	// do_a_callback();
-	// → It should be saved by the caller, so I feel that it is not necessary here.
-
-	// It is possible that the exit code of the thread is running but the exit code of the thread is running, so
-	// We need to wait for the end with join().
-	for (auto& th : threads)
-		th.join();
-
-	// The file writing thread etc. are still running only when all threads are finished
-	// Since the work itself may not have completed, output only that all threads have finished.
-	std::cout << "all threads are joined." << std::endl;
-
-	// Restored because Options were rewritten.
-	// Restore the handler because the handler will not start unless you assign a value.
-	for (auto& s : oldOptions)
-		Options[s.first] = std::string(s.second);
-
-}
-
-
-#endif // defined(EVAL_LEARN)
@@ -1,152 +0,0 @@
-#ifndef _MULTI_THINK_
-#define _MULTI_THINK_
-
-#if defined(EVAL_LEARN)
-
-#include <functional>
-#include <mutex>
-
-#include "../misc.h"
-#include "../learn/learn.h"
-#include "../thread_win32_osx.h"
-
-#include <atomic>
-
-// Learning from a game record, when making yourself think and generating a fixed track, etc.
-// Helper class used when multiple threads want to call Search::think() individually.
-// Derive and use this class.
-struct MultiThink
-{
-	MultiThink() : prng(std::chrono::system_clock::now().time_since_epoch().count())
-	{
-		loop_count = 0;
-	}
-
-	// Call this function from the master thread, each thread will think,
-	// Return control when the thought ending condition is satisfied.
-	// Do something else.
-	// ・It is safe for each thread to call Learner::search(),qsearch()
-	// Separates the substitution table for each thread. (It will be restored after the end.)
-	// ・Book is not thread safe when in on the fly mode, so temporarily change this mode.
-	// Turn it off.
-	// [Requirements]
-	// 1) Override thread_worker()
-	// 2) Set the loop count with set_loop_max()
-	// 3) set a function to be called back periodically (if necessary)
-	// callback_func and callback_interval
-	void go_think();
-
-	// If there is something you want to initialize on the derived class side, override this,
-	// Called when initialization is completed with go_think().
-	// It is better to read the fixed trace at that timing.
-	virtual void init() {}
-
-	// A thread worker that is called by creating a thread when you go_think()
-	// Override and use this.
-	virtual void thread_worker(size_t thread_id) = 0;
-
-	// Called back every callback_seconds [seconds] when go_think().
-	std::function<void()> callback_func;
-	uint64_t callback_seconds = 600;
-
-	// Set the number of times worker processes (calls Search::think()).
-	void set_loop_max(uint64_t loop_max_) { loop_max = loop_max_; }
-
-	// Get the value set by set_loop_max().
-	uint64_t get_loop_max() const { return loop_max; }
-
-	// [ASYNC] Take the value of the loop counter and add the loop counter after taking it out.
-	// If the loop counter has reached loop_max, return UINT64_MAX.
-	// If you want to generate a phase, you must call this function at the time of generating the phase,
-	// Please note that the number of generated phases and the value of the counter will not match.
-	uint64_t get_next_loop_count() {
-		std::unique_lock<std::mutex> lk(loop_mutex);
-		if (loop_count >= loop_max)
-			return UINT64_MAX;
-		return loop_count++;
-	}
-
-	// [ASYNC] For returning the processed number. Each time it is called, it returns a counter that is incremented.
-	uint64_t get_done_count() {
-		std::unique_lock<std::mutex> lk(loop_mutex);
-		return ++done_count;
-	}
-
-	// Mutex when worker thread accesses I/O
-	std::mutex io_mutex;
-
-protected:
-	// Random number generator body
-	AsyncPRNG prng;
-
-private:
-	// number of times worker processes (calls Search::think())
-	std::atomic<uint64_t> loop_max;
-	// number of times the worker has processed (calls Search::think())
-	std::atomic<uint64_t> loop_count;
-	// To return the number of times it has been processed.
-	std::atomic<uint64_t> done_count;
-
-	// Mutex when changing the variables in ↑
-	std::mutex loop_mutex;
-
-	// Thread end flag.
-	// vector<bool> may not be reflected properly when trying to rewrite from multiple threads...
-	typedef uint8_t Flag;
-	std::vector<Flag> thread_finished;
-
-};
-
-// Mechanism to process task during idle time.
-// master passes the task with push_task_async() whenever you like.
-// When slave executes on_idle() in its spare time, it retrieves one task and continues execution until there is no queue.
-// Convenient to use when you want to write MultiThink thread worker in master-slave method.
-struct TaskDispatcher
-{
-	typedef std::function<void(size_t /* thread_id */)> Task;
-
-	// slave calls this function during idle.
-	void on_idle(size_t thread_id)
-	{
-		Task task;
-		while ((task = get_task_async()) != nullptr)
-			task(thread_id);
-
-		sleep(1);
-	}
-
-	// Stack [ASYNC] task.
-	void push_task_async(Task task)
-	{
-		std::unique_lock<std::mutex> lk(task_mutex);
-		tasks.push_back(task);
-	}
-
-	// Allocate size array elements for task in advance.
-	void task_reserve(size_t size)
-	{
-		tasks.reserve(size);
-	}
-
-protected:
-	// set of tasks
-	std::vector<Task> tasks;
-
-	// Take out one [ASYNC] task. Called from on_idle().
-	Task get_task_async()
-	{
-		std::unique_lock<std::mutex> lk(task_mutex);
-		if (tasks.size() == 0)
-			return nullptr;
-		Task task = *tasks.rbegin();
-		tasks.pop_back();
-		return task;
-	}
-
-	// a mutex for accessing tasks
-	std::mutex task_mutex;
-};
-
-#endif // defined(EVAL_LEARN) && defined(YANEURAOU_2018_OTAFUKU_ENGINE)
-
-#endif
@@ -0,0 +1,43 @@
+#include "opening_book.h"
+
+#include <fstream>
+
+namespace Learner {
+
+    EpdOpeningBook::EpdOpeningBook(const std::string& file, PRNG& prng) : // builds the book from the text file `file`
+        OpeningBook(file)
+    {
+        std::ifstream in(file);
+        if (!in)
+        {
+            return; // the file could not be opened: the book is left empty and no error is reported
+        }
+        // Every non-empty line is stored verbatim as one entry of fens; nothing is parsed or validated here.
+        std::string line;
+        while (std::getline(in, line))
+        {
+            if (line.empty())
+                continue;
+
+            fens.emplace_back(line);
+        }
+        // Algo::shuffle is defined elsewhere; presumably it reorders fens randomly using prng — TODO confirm.
+        Algo::shuffle(fens, prng);
+    }
+
+    static bool ends_with(const std::string& lhs, const std::string& end) // true when `lhs` finishes with `end` (exact character match)
+    {
+        if (end.size() > lhs.size()) return false; // also keeps std::equal below from walking past the start of lhs
+        // Compare both strings back to front over the length of `end`.
+        return std::equal(end.rbegin(), end.rend(), lhs.rbegin());
+    }
+
+    std::unique_ptr<OpeningBook> open_opening_book(const std::string& filename, PRNG& prng) // factory: picks the book type from the file name suffix
+    {
+        if (ends_with(filename, ".epd"))
+            return std::make_unique<EpdOpeningBook>(filename, prng);
+        // Any other suffix is not supported: the caller receives nullptr and has to check for it.
+        return nullptr;
+    }
+
+}
@@ -0,0 +1,56 @@
+#ifndef LEARN_OPENING_BOOK_H
+#define LEARN_OPENING_BOOK_H
+
+#include "misc.h"
+#include "position.h"
+#include "thread.h"
+
+#include <vector>
+#include <random>
+#include <optional>
+#include <string>
+#include <cstdint>
+#include <memory>
+
+namespace Learner {
+
+    struct OpeningBook {
+        // Returns the current entry and advances the cursor, wrapping back to the first entry after the last one.
+        const std::string& next_fen()
+        {
+            assert(fens.size() > 0); // an empty book is a caller error; this check disappears when assert is compiled out
+
+            auto& fen = fens[current_index++];
+            if (current_index >= fens.size())
+                current_index = 0;
+
+            return fen; // reference into fens: valid only as long as fens is not modified
+        }
+        // Number of entries in the book.
+        std::size_t size() const { return fens.size(); }
+        // The file name that was passed to the constructor.
+        const std::string& get_filename() const { return filename; }
+
+    protected:
+        OpeningBook(const std::string& file) : // protected: only derived classes can create a book; fens starts out empty
+            filename(file),
+            current_index(0)
+        {
+        }
+
+        // State accessible to derived classes, which fill fens. No locking is done anywhere in this struct.
+        std::string filename;
+        std::vector<std::string> fens;
+        std::size_t current_index;
+    };
+
+    struct EpdOpeningBook : OpeningBook {
+        // Fills the book with one entry per non-empty line of `file`, then hands the entries to Algo::shuffle with `prng`.
+        EpdOpeningBook(const std::string& file, PRNG& prng);
+    };
+
+    std::unique_ptr<OpeningBook> open_opening_book(const std::string& filename, PRNG& prng);
+
+}
+
+#endif
@@ -0,0 +1,46 @@
+#ifndef _PACKED_SFEN_H_
+#define _PACKED_SFEN_H_
+
+#include <vector>
+#include <cstdint>
+
+namespace Learner {
+
+    // A position in packed sfen form: a fixed buffer of 32 bytes.
+    struct PackedSfen { std::uint8_t data[32]; };
+
+    // A packed position bundled with its evaluation value and the other per-position training data.
+    // Writing different members depending on the options would be a problem when reusing saved teacher games,
+    // so for the time being all of the following members are written regardless of the options.
+    struct PackedSfenValue
+    {
+        // The position, in packed form.
+        PackedSfen sfen;
+
+        // Evaluation value returned from Learner::search()
+        std::int16_t score;
+
+        // First move of the PV.
+        // Used when measuring the move match rate against the teacher.
+        std::uint16_t move;
+
+        // Ply count of this position, counted from the initial position.
+        std::uint16_t gamePly;
+
+        // 1 if the player on this side ultimately wins the game, -1 if that player loses,
+        // 0 if a draw is reached.
+        // NOTE(review): the original comment says the gensfen command writes draws only when
+        // LEARN_GENSFEN_DRAW_RESULT is enabled; no such macro is visible here — confirm what controls it.
+        std::int8_t game_result;
+
+        // Explicit padding byte, so that files of teacher positions can be exchanged with other people:
+        // without it the structure size would not be fixed; with it the size is 40 bytes in any environment.
+        std::uint8_t padding;
+
+        // 32 + 2 + 2 + 2 + 1 + 1 = 40bytes
+    };
+
+    // Array of positions: PSVector stands for packed sfen vector.
+    using PSVector = std::vector<PackedSfenValue>;
+}
+#endif
@@ -0,0 +1,386 @@
+#include "sfen_packer.h"
+
+#include "packed_sfen.h"
+
+#include "misc.h"
+#include "position.h"
+
+#include <sstream>
+#include <fstream>
+#include <cstring> // std::memset()
+
+using namespace std;
+
+namespace Learner {
+
+    // Minimal bit-level reader/writer on top of a caller-owned byte buffer.
+    //
+    // Bits are stored least-significant-bit first inside each byte: stream
+    // bit k lives in bit (k & 7) of byte k / 8. No bounds checking is done;
+    // the caller must provide a buffer large enough for everything it reads
+    // or writes.
+    struct BitStream
+    {
+        // Attach the stream to `data_` and rewind the cursor.
+        // Writing only ORs bits in, so the buffer must be zeroed beforehand.
+        void set_data(std::uint8_t* data_) { data = data_; reset(); }
+
+        // Pointer passed to set_data(), or nullptr if none was set yet.
+        std::uint8_t* get_data() const { return data; }
+
+        // Index of the next bit that will be read or written.
+        int get_cursor() const { return bit_cursor; }
+
+        // Rewind the cursor to the first bit.
+        void reset() { bit_cursor = 0; }
+
+        // Append one bit: 1 if b is non-zero, 0 otherwise.
+        void write_one_bit(int b)
+        {
+            if (b)
+                data[bit_cursor / 8] |= 1 << (bit_cursor & 7);
+
+            ++bit_cursor;
+        }
+
+        // Consume one bit and return it as 0 or 1.
+        int read_one_bit()
+        {
+            const int b = (data[bit_cursor / 8] >> (bit_cursor & 7)) & 1;
+            ++bit_cursor;
+
+            return b;
+        }
+
+        // Append the n low-order bits of d, lowest bit first.
+        void write_n_bit(int d, int n)
+        {
+            for (int i = 0; i < n; ++i)
+                write_one_bit(d & (1 << i));
+        }
+
+        // Consume n bits and reassemble them; inverse of write_n_bit().
+        int read_n_bit(int n)
+        {
+            int result = 0;
+            for (int i = 0; i < n; ++i)
+                result |= read_one_bit() ? (1 << i) : 0;
+
+            return result;
+        }
+
+    private:
+        // Next bit position to read/write. Initialised so that a stream
+        // queried before set_data() reports a defined state.
+        int bit_cursor = 0;
+
+        // Buffer the bits are read from / written to (not owned).
+        std::uint8_t* data = nullptr;
+    };
+
+    // Packs a position into, and unpacks it from, a 256-bit (32-byte) blob.
+    // The board is Huffman coded so that any position fits.
+    //
+    // Layout, in the order pack() writes it:
+    //   Side to move (White = 0, Black = 1)      1 bit
+    //   White king square                        6 bits
+    //   Black king square                        6 bits
+    //   Board without the kings                  Huffman coded
+    //   Castling availability                    1 bit x 4
+    //   En passant square                        1 bit, or 1 + 6 bits if set
+    //   Rule 50 counter, low 6 bits              6 bits
+    //   Fullmove number, low 8 bits              8 bits
+    //   Fullmove number, high 8 bits             8 bits
+    //   Rule 50 counter, bit 6                   1 bit
+    //
+    // TODO(someone): Rename SFEN to FEN.
+    //
+    struct SfenPacker
+    {
+        // Pack `pos` into the buffer `data` points to.
+        void pack(const Position& pos);
+
+        // Destination buffer for pack() (256bit = 32bytes).
+        // Not owned; it has to be set by the caller before pack() is used.
+        uint8_t *data; // uint8_t[32];
+
+        // Bit cursor over the buffer that is being written or read.
+        BitStream stream;
+
+        // Write one board piece (or an empty square) to the stream.
+        void write_board_piece_to_stream(Piece pc);
+
+        // Read one board piece (or an empty square) from the stream.
+        Piece read_board_piece_from_stream();
+    };
+
+
+    // Huffman coding
+    // * is simplified from mini encoding to make conversion easier.
+    //
+    // Huffman Encoding
+    //
+    // Empty  xxxxxxx0
+    // Pawn   xxxx0001 + 1 bit (Color)
+    // Knight xxxx0011 + 1 bit (Color)
+    // Bishop xxxx0101 + 1 bit (Color)
+    // Rook   xxxx0111 + 1 bit (Color)
+    // Queen  xxxx1001 + 1 bit (Color)
+    //
+    // Worst case:
+    // - side to move         1 bit
+    // - 32 empty squares    32 bits
+    // - 30 pieces           150 bits
+    // - 2 kings             12 bits
+    // - castling rights     4 bits
+    // - ep square           7 bits
+    // - rule50              7 bits
+    // - fullmove number     16 bits
+    // - TOTAL               229 bits < 256 bits
+
+    // One entry of the Huffman table used for the board encoding.
+    struct HuffmanedPiece
+    {
+        int code; // Code word; it is written/read lowest bit first
+        int bits; // Length of the code word in bits
+    };
+
+    // Huffman codes, indexed by piece type. There is no entry for the king:
+    // king squares are stored separately, in front of the board.
+    // Every code except the one for the empty square is followed in the
+    // stream by one extra bit holding the piece colour.
+    constexpr HuffmanedPiece huffman_table[] =
+    {
+        {0b0000,1}, // NO_PIECE
+        {0b0001,4}, // PAWN
+        {0b0011,4}, // KNIGHT
+        {0b0101,4}, // BISHOP
+        {0b0111,4}, // ROOK
+        {0b1001,4}, // QUEEN
+    };
+
+    // Pack `pos` into the 32 bytes that `data` points to.
+    // `data` has to be set by the caller before this is called.
+    void SfenPacker::pack(const Position& pos)
+    {
+        // The bit stream only ORs bits in, so start from an all-zero buffer.
+        memset(data, 0, 32 /* 256bit */);
+        stream.set_data(data);
+
+        // Side to move (1 bit).
+        stream.write_one_bit((int)(pos.side_to_move()));
+
+        // King squares, 6 bits for each colour.
+        for (auto color : Colors)
+            stream.write_n_bit(pos.king_square(color), 6);
+
+        // Every remaining square, from rank 8 down to rank 1 and from file A
+        // to file H. Kings are skipped: their squares were stored above.
+        for (Rank rank = RANK_8; rank >= RANK_1; --rank)
+            for (File file = FILE_A; file <= FILE_H; ++file)
+            {
+                const Piece piece = pos.piece_on(make_square(file, rank));
+                if (type_of(piece) != KING)
+                    write_board_piece_to_stream(piece);
+            }
+
+        // Castling rights, one bit each.
+        // TODO(someone): Support chess960.
+        stream.write_one_bit(pos.can_castle(WHITE_OO));
+        stream.write_one_bit(pos.can_castle(WHITE_OOO));
+        stream.write_one_bit(pos.can_castle(BLACK_OO));
+        stream.write_one_bit(pos.can_castle(BLACK_OOO));
+
+        // En passant: a presence bit, followed by the square if there is one.
+        const Square ep = pos.ep_square();
+        if (ep != SQ_NONE)
+        {
+            stream.write_one_bit(1);
+            stream.write_n_bit(static_cast<int>(ep), 6);
+        }
+        else
+        {
+            stream.write_one_bit(0);
+        }
+
+        // Low 6 bits of the rule 50 counter.
+        const auto rule50 = pos.state()->rule50;
+        stream.write_n_bit(rule50, 6);
+
+        // Low 8 bits of the fullmove number.
+        const int fm = 1 + (pos.game_ply()-(pos.side_to_move() == BLACK)) / 2;
+        stream.write_n_bit(fm, 8);
+
+        // High 8 bits of the fullmove number. Appended after the original
+        // fields to extend the counter's range while staying backwards
+        // compatible.
+        stream.write_n_bit(fm >> 8, 8);
+
+        // Bit 6 of the rule 50 counter goes last, for the same reason: the
+        // original field only had room for 6 bits. Old parsers simply
+        // ignore this bit.
+        stream.write_n_bit(rule50 >> 6, 1);
+
+        assert(stream.get_cursor() <= 256);
+    }
+
+    // Write one square's content to the stream: the Huffman code of the
+    // piece type and, unless the square is empty, one bit for the colour.
+    // Must not be called with a king (the table has no entry for it).
+    void SfenPacker::write_board_piece_to_stream(Piece pc)
+    {
+        const auto& entry = huffman_table[type_of(pc)];
+        stream.write_n_bit(entry.code, entry.bits);
+
+        // An empty square is fully described by its code.
+        if (pc != NO_PIECE)
+            stream.write_one_bit(color_of(pc));
+    }
+
+    // Read one square's content from the stream; inverse of
+    // write_board_piece_to_stream().
+    //
+    // Bits are accumulated until code and length match a table entry.
+    // NOTE(review): a bit pattern that matches no entry (possible with
+    // corrupted input) never terminates this loop in builds where the
+    // assert is compiled out.
+    Piece SfenPacker::read_board_piece_from_stream()
+    {
+        PieceType decoded = NO_PIECE_TYPE;
+        int code = 0;
+        int bits = 0;
+        bool matched = false;
+
+        while (!matched)
+        {
+            code |= stream.read_one_bit() << bits;
+            ++bits;
+
+            assert(bits <= 6);
+
+            for (decoded = NO_PIECE_TYPE; decoded < KING; ++decoded)
+            {
+                const auto& entry = huffman_table[decoded];
+                if (entry.code == code && entry.bits == bits)
+                {
+                    matched = true;
+                    break;
+                }
+            }
+        }
+
+        // Empty squares carry no colour bit.
+        if (decoded == NO_PIECE_TYPE)
+            return NO_PIECE;
+
+        const Color color = (Color)stream.read_one_bit();
+
+        return make_piece(color, decoded);
+    }
+
+    // Rebuild `pos` from a packed position; the inverse of SfenPacker::pack().
+    //
+    // si : state object that `pos` will point to; it is zeroed here.
+    // th : stored as pos.thisThread.
+    //
+    // Returns 0 on success, or 1 if the bit cursor moved past 256 bits while
+    // the board was being read.
+    int set_from_packed_sfen(Position& pos, const PackedSfen& sfen, StateInfo* si, Thread* th)
+    {
+        SfenPacker packer;
+        auto& stream = packer.stream;
+
+        // TODO: separate streams for writing and reading. Here we actually have to
+        // const_cast which is not safe in the long run.
+        stream.set_data(const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(&sfen)));
+
+        pos.clear();
+        std::memset(si, 0, sizeof(StateInfo));
+        std::fill_n(&pos.pieceList[0][0], sizeof(pos.pieceList) / sizeof(Square), SQ_NONE);
+        pos.st = si;
+
+        // Active color
+        pos.sideToMove = (Color)stream.read_one_bit();
+
+        pos.pieceList[W_KING][0] = SQUARE_NB;
+        pos.pieceList[B_KING][0] = SQUARE_NB;
+
+        // King squares come first. The kings are only marked on the board
+        // here; they are placed properly in the loop below.
+        for (auto c : Colors)
+            pos.board[stream.read_n_bit(6)] = make_piece(c, KING);
+
+        // Piece placement
+        for (Rank r = RANK_8; r >= RANK_1; --r)
+        {
+            for (File f = FILE_A; f <= FILE_H; ++f)
+            {
+                auto sq = make_square(f, r);
+
+                // King squares were not written to the board part of the
+                // stream, so only read a piece if no king is marked here.
+                Piece pc;
+                if (type_of(pos.board[sq]) != KING)
+                {
+                    assert(pos.board[sq] == NO_PIECE);
+                    pc = packer.read_board_piece_from_stream();
+                }
+                else
+                {
+                    pc = pos.board[sq];
+                    // Clear the marker first: put_piece() would trip an
+                    // assert if the square were still occupied.
+                    pos.board[sq] = NO_PIECE;
+                }
+
+                // There may be no pieces, so skip in that case.
+                if (pc == NO_PIECE)
+                    continue;
+
+                pos.put_piece(Piece(pc), sq);
+
+                // Reading beyond 256 bits means the data is not a valid
+                // packed position.
+                if (stream.get_cursor()> 256)
+                    return 1;
+            }
+        }
+
+        // Castling availability.
+        // For each right that is set, the rook is searched for starting at
+        // the corner square and moving towards the other side of the rank.
+        // TODO(someone): Support chess960.
+        pos.st->castlingRights = 0;
+        if (stream.read_one_bit()) {
+            Square rsq;
+            for (rsq = relative_square(WHITE, SQ_H1); pos.piece_on(rsq) != W_ROOK; --rsq) {}
+            pos.set_castling_right(WHITE, rsq);
+        }
+        if (stream.read_one_bit()) {
+            Square rsq;
+            for (rsq = relative_square(WHITE, SQ_A1); pos.piece_on(rsq) != W_ROOK; ++rsq) {}
+            pos.set_castling_right(WHITE, rsq);
+        }
+        if (stream.read_one_bit()) {
+            Square rsq;
+            for (rsq = relative_square(BLACK, SQ_H1); pos.piece_on(rsq) != B_ROOK; --rsq) {}
+            pos.set_castling_right(BLACK, rsq);
+        }
+        if (stream.read_one_bit()) {
+            Square rsq;
+            for (rsq = relative_square(BLACK, SQ_A1); pos.piece_on(rsq) != B_ROOK; ++rsq) {}
+            pos.set_castling_right(BLACK, rsq);
+        }
+
+        // En passant square. Ignore if no pawn capture is possible
+        if (stream.read_one_bit()) {
+            Square ep_square = static_cast<Square>(stream.read_n_bit(6));
+            pos.st->epSquare = ep_square;
+
+            if (!(pos.attackers_to(pos.st->epSquare) & pos.pieces(pos.sideToMove, PAWN))
+                || !(pos.pieces(~pos.sideToMove, PAWN) & (pos.st->epSquare + pawn_push(~pos.sideToMove))))
+                pos.st->epSquare = SQ_NONE;
+        }
+        else {
+            pos.st->epSquare = SQ_NONE;
+        }
+
+        // Halfmove clock (low 6 bits)
+        pos.st->rule50 = stream.read_n_bit(6);
+
+        // Fullmove number (low 8 bits)
+        pos.gamePly = stream.read_n_bit(8);
+
+        // Read the high 8 bits of the fullmove number. These were added to
+        // extend the range of the counter.
+        // In older entries these will just be zero bits.
+        pos.gamePly |= stream.read_n_bit(8) << 8;
+
+        // Read the highest bit of rule50. This was added as a fix for rule50
+        // counter having only 6 bits stored.
+        // In older entries this will just be a zero bit.
+        pos.st->rule50 |= stream.read_n_bit(1) << 6;
+
+        // Convert from fullmove starting from 1 to gamePly starting from 0,
+        // handle also common incorrect FEN with fullmove = 0.
+        pos.gamePly = std::max(2 * (pos.gamePly - 1), 0) + (pos.sideToMove == BLACK);
+
+        assert(stream.get_cursor() <= 256);
+
+        pos.chess960 = false;
+        pos.thisThread = th;
+        pos.set_state(pos.st);
+
+        assert(pos.pos_is_ok());
+
+        return 0;
+    }
+
+    // Pack `pos` and return the resulting 32-byte blob by value.
+    PackedSfen sfen_pack(Position& pos)
+    {
+        PackedSfen packed;
+
+        // The packer writes straight into the bytes of `packed`.
+        SfenPacker packer;
+        packer.data = reinterpret_cast<uint8_t*>(&packed);
+        packer.pack(pos);
+
+        return packed;
+    }
+}
@@ -0,0 +1,20 @@
+#ifndef _SFEN_PACKER_H_
+#define _SFEN_PACKER_H_
+
+#include "types.h"
+
+#include "learn/packed_sfen.h"
+
+#include <cstdint>
+
+class Position;
+struct StateInfo;
+class Thread;
+
+namespace Learner {
+
+    // Rebuilds `pos` from the packed position `sfen`. `si` is zeroed and
+    // becomes the position's state, `th` its owning thread.
+    // Returns 0 on success and 1 if the packed data could not be decoded.
+    int set_from_packed_sfen(Position& pos, const PackedSfen& sfen, StateInfo* si, Thread* th);
+
+    // Packs `pos` into a 32-byte PackedSfen.
+    PackedSfen sfen_pack(Position& pos);
+}
+
+#endif
@@ -0,0 +1,365 @@
+#include "sfen_stream.h"
+
+#include "packed_sfen.h"
+
+#include "misc.h"
+
+#include <string>
+#include <vector>
+#include <deque>
+#include <memory>
+#include <mutex>
+#include <list>
+#include <atomic>
+#include <optional>
+#include <iostream>
+#include <cstdint>
+#include <thread>
+
+namespace Learner{
+
+    // How SfenReader walks through its list of files.
+    enum class SfenReaderMode
+    {
+        // Every file is read once.
+        Sequential,
+
+        // A file that contained data is put back at the end of the queue
+        // once it has been read, so the files are cycled through.
+        Cyclic
+    };
+
+    // Reads training positions from a list of files in the background.
+    //
+    // A worker thread (file_read_worker) reads the files batch by batch,
+    // optionally shuffles each batch and cuts it into buffers that are queued
+    // in packed_sfens_pool. Consumer threads fetch positions one at a time
+    // with read_to_thread_buffer(); each consumer works on its own buffer and
+    // only takes the lock when it needs a new one from the pool.
+    struct SfenReader
+    {
+        // Default number of positions handed to a consumer thread at once.
+        static constexpr size_t DEFAULT_THREAD_BUFFER_SIZE = 10 * 1000;
+
+        // Default number of positions read from the files per batch.
+        // A larger batch gives a wider shuffle at the cost of more memory.
+        // It should be a multiple of the thread buffer size; if it is not,
+        // the last buffer of a batch is simply shorter than the others.
+        static constexpr const size_t DEFAULT_SFEN_READ_SIZE = 1000 * 1000 * 10;
+
+        // filenames_  : files to read, in order.
+        // do_shuffle  : shuffle every batch before handing it out.
+        // mode_       : read every file once, or cycle through the files.
+        // thread_num  : number of consumer threads (valid thread ids are
+        //               0 .. thread_num - 1).
+        // seed        : seed for the shuffling PRNG. std::random_device is
+        //               deliberately not used: it always produces the same
+        //               integers on MinGW.
+        // read_size   : positions per batch.
+        // buffer_size : positions per consumer buffer; must be non-zero.
+        //
+        // The worker thread is started right away.
+        SfenReader(
+            const std::vector<std::string>& filenames_,
+            bool do_shuffle,
+            SfenReaderMode mode_,
+            int thread_num,
+            const std::string& seed,
+            size_t read_size = DEFAULT_SFEN_READ_SIZE,
+            size_t buffer_size = DEFAULT_THREAD_BUFFER_SIZE
+        ) :
+            filenames(filenames_.begin(), filenames_.end()),
+            mode(mode_),
+            sfen_read_size(read_size),
+            thread_buffer_size(buffer_size),
+            prng(seed)
+        {
+            packed_sfens.resize(thread_num);
+            total_read = 0;
+            end_of_files = false;
+            shuffle = do_shuffle;
+            stop_flag = false;
+
+            // Started last, so the worker only ever sees fully set-up members.
+            file_worker_thread = std::thread([this] {
+                file_read_worker();
+            });
+        }
+
+        // Asks the worker to stop and waits for it to finish.
+        ~SfenReader()
+        {
+            stop_flag = true;
+
+            if (file_worker_thread.joinable())
+                file_worker_thread.join();
+        }
+
+        // Reads up to `count` positions through the buffer of thread 0, for
+        // calculations such as mse. If the data runs out early, an error is
+        // printed and the positions read so far are returned.
+        PSVector read_for_mse(uint64_t count)
+        {
+            PSVector sfen_for_mse;
+            sfen_for_mse.reserve(count);
+
+            for (uint64_t i = 0; i < count; ++i)
+            {
+                PackedSfenValue ps;
+                if (!read_to_thread_buffer(0, ps))
+                {
+                    std::cout << "ERROR (sfen_reader): Reading failed." << std::endl;
+                    return sfen_for_mse;
+                }
+
+                sfen_for_mse.push_back(ps);
+            }
+
+            return sfen_for_mse;
+        }
+
+        // Loads a validation set directly from `file_name`, bypassing the
+        // worker thread. Positions with |score| above `eval_limit` are
+        // dropped, as are drawn games unless `use_draw_games` is set.
+        // Returns an empty set (after printing an error) if the file type is
+        // not supported.
+        PSVector read_validation_set(const std::string& file_name, int eval_limit, bool use_draw_games)
+        {
+            PSVector sfen_for_mse;
+
+            auto input = open_sfen_input_file(file_name);
+
+            // open_sfen_input_file() returns nullptr for file names whose
+            // extension it does not know; do not dereference that.
+            if (input == nullptr)
+            {
+                std::cout << "ERROR (sfen_reader): Unsupported validation set file: " << file_name << std::endl;
+                return sfen_for_mse;
+            }
+
+            while(!input->eof())
+            {
+                std::optional<PackedSfenValue> p_opt = input->next();
+                if (!p_opt.has_value())
+                    break;
+
+                auto& p = *p_opt;
+
+                if (eval_limit < abs(p.score))
+                    continue;
+
+                if (!use_draw_games && p.game_result == 0)
+                    continue;
+
+                sfen_for_mse.push_back(p);
+            }
+
+            return sfen_for_mse;
+        }
+
+        // [ASYNC] Fetches one position for consumer `thread_id` into `ps`.
+        // Returns false once no more data is available.
+        bool read_to_thread_buffer(size_t thread_id, PackedSfenValue& ps)
+        {
+            auto& thread_ps = packed_sfens[thread_id];
+
+            // Get a new buffer from the pool if this thread has none left;
+            // give up if there is nothing more to get.
+            if ((thread_ps == nullptr || thread_ps->empty())
+                && !read_to_thread_buffer_impl(thread_id))
+                return false;
+
+            // Buffers placed in the pool are never empty, so at this point
+            // back() is valid.
+            ps = thread_ps->back();
+            thread_ps->pop_back();
+
+            // Release the buffer as soon as it has been used up.
+            if (thread_ps->empty())
+            {
+                thread_ps.reset();
+            }
+
+            return true;
+        }
+
+        // [ASYNC] Moves the next buffer from the pool into the slot of
+        // consumer `thread_id`, waiting for the worker if the pool is empty.
+        // Returns false once the worker is done and the pool is drained.
+        bool read_to_thread_buffer_impl(size_t thread_id)
+        {
+            while (true)
+            {
+                // Sample the flag BEFORE looking at the pool. The worker
+                // publishes its last buffers before it raises the flag, so
+                // if the flag is seen set here, the pool check below sees
+                // everything that will ever be produced.
+                const bool no_more_input = end_of_files;
+
+                {
+                    std::unique_lock<std::mutex> lk(mutex);
+
+                    if (!packed_sfens_pool.empty())
+                    {
+                        packed_sfens[thread_id] = std::move(packed_sfens_pool.front());
+                        packed_sfens_pool.pop_front();
+
+                        // The final buffer may be shorter than the others,
+                        // so count what was actually received.
+                        total_read += packed_sfens[thread_id]->size();
+
+                        return true;
+                    }
+                }
+
+                if (no_more_input)
+                    return false;
+
+                // Waiting for file worker to fill packed_sfens_pool.
+                // The mutex isn't locked, so it should fill up soon.
+                // Poor man's condition variable.
+                sleep(1);
+            }
+        }
+
+        // Body of the worker thread: reads batches from the files and feeds
+        // them to packed_sfens_pool until the files run out or stop_flag is
+        // set.
+        void file_read_worker()
+        {
+            std::string currentFilename;
+            uint64_t numEntriesReadFromCurrentFile = 0;
+
+            // Opens the next file in the queue that can be read and is not
+            // empty. Returns false once the queue is exhausted.
+            auto open_next_file = [&]() {
+                for(;;)
+                {
+                    sfen_input_stream.reset();
+
+                    if (filenames.empty())
+                        return false;
+
+                    // Get the next file name.
+                    currentFilename = filenames.front();
+                    filenames.pop_front();
+
+                    numEntriesReadFromCurrentFile = 0;
+
+                    sfen_input_stream = open_sfen_input_file(currentFilename);
+
+                    auto out = sync_region_cout.new_region();
+                    if (sfen_input_stream == nullptr)
+                    {
+                        out << "INFO (sfen_reader): File does not exist: " << currentFilename << '\n';
+                    }
+                    else
+                    {
+                        out << "INFO (sfen_reader): Opened file for reading: " << currentFilename << '\n';
+
+                        // in case the file is empty or was deleted.
+                        if (sfen_input_stream->eof())
+                        {
+                            out << "  - File empty, nothing to read.\n";
+                        }
+                        else
+                        {
+                            return true;
+                        }
+                    }
+                }
+            };
+
+            // Reports the end of the input and lets the consumers know.
+            auto announce_end_of_files = [&]() {
+                auto out = sync_region_cout.new_region();
+                out << "INFO (sfen_reader): End of files." << std::endl;
+                end_of_files = true;
+            };
+
+            // The pool is modified by the consumers, so even a plain size()
+            // query has to be made under the lock.
+            auto pool_is_full = [&]() {
+                std::unique_lock<std::mutex> lk(mutex);
+                return packed_sfens_pool.size() >= sfen_read_size / thread_buffer_size;
+            };
+
+            if (sfen_input_stream == nullptr && !open_next_file())
+            {
+                announce_end_of_files();
+                return;
+            }
+
+            bool files_exhausted = false;
+            while (!files_exhausted)
+            {
+                // Wait until the consumers have made room in the pool.
+                while (!stop_flag && pool_is_full())
+                    sleep(100);
+
+                if (stop_flag)
+                    return;
+
+                PSVector sfens;
+                sfens.reserve(sfen_read_size);
+
+                // Read one batch, moving on to the next file whenever the
+                // current one is used up.
+                while (sfens.size() < sfen_read_size)
+                {
+                    std::optional<PackedSfenValue> p = sfen_input_stream->next();
+                    if (p.has_value())
+                    {
+                        sfens.push_back(*p);
+                        ++numEntriesReadFromCurrentFile;
+                        continue;
+                    }
+
+                    if (mode == SfenReaderMode::Cyclic
+                        && numEntriesReadFromCurrentFile > 0)
+                    {
+                        // The file contained data so we add it again to the end of the queue.
+                        filenames.emplace_back(currentFilename);
+                    }
+
+                    if (!open_next_file())
+                    {
+                        // No more files. Whatever has been read so far is
+                        // still handed out below instead of being dropped.
+                        files_exhausted = true;
+                        break;
+                    }
+                }
+
+                if (sfens.empty())
+                    continue;
+
+                // Shuffle the batch.
+                if (shuffle)
+                {
+                    Algo::shuffle(sfens, prng);
+                }
+
+                // Cut the batch into buffers of thread_buffer_size positions.
+                // Only the very last buffer can be shorter.
+                const size_t total = sfens.size();
+                std::vector<std::unique_ptr<PSVector>> buffers;
+                buffers.reserve((total + thread_buffer_size - 1) / thread_buffer_size);
+
+                for (size_t offset = 0; offset < total; offset += thread_buffer_size)
+                {
+                    const size_t remaining = total - offset;
+                    const size_t count =
+                        remaining < thread_buffer_size ? remaining : thread_buffer_size;
+
+                    const PackedSfenValue* first = sfens.data() + offset;
+                    buffers.emplace_back(std::make_unique<PSVector>(first, first + count));
+                }
+
+                {
+                    // The lock is required because the contents of
+                    // packed_sfens_pool are changed.
+                    std::unique_lock<std::mutex> lk(mutex);
+
+                    for (auto& buf : buffers)
+                        packed_sfens_pool.emplace_back(std::move(buf));
+                }
+            }
+
+            // Raised only after the last buffers were published (see
+            // read_to_thread_buffer_impl()).
+            announce_end_of_files();
+        }
+
+    protected:
+
+        // worker thread reading file in background
+        std::thread file_worker_thread;
+
+        // Files still to be read; only touched by the worker thread once
+        // it is running.
+        std::deque<std::string> filenames;
+
+        // Set by the destructor to make the worker thread exit.
+        std::atomic<bool> stop_flag;
+
+        // Number of positions handed out from the pool to consumer buffers.
+        std::atomic<uint64_t> total_read;
+
+        // Whether every batch is shuffled before it is handed out.
+        bool shuffle;
+
+        SfenReaderMode mode;
+
+        // Positions per batch / per consumer buffer.
+        size_t sfen_read_size;
+        size_t thread_buffer_size;
+
+        // Random number generator used for shuffling.
+        PRNG prng;
+
+        // Set by the worker once all files have been read and the last
+        // buffers have been published.
+        std::atomic<bool> end_of_files;
+
+        // File that is currently being read.
+        std::unique_ptr<BasicSfenInputStream> sfen_input_stream;
+
+        // Current buffer of each consumer thread, indexed by thread id.
+        // A slot is reset as soon as its buffer has been used up.
+        std::vector<std::unique_ptr<PSVector>> packed_sfens;
+
+        // Mutex when accessing packed_sfens_pool
+        std::mutex mutex;
+
+        // Buffers produced by the worker thread and not yet claimed.
+        // Each consumer fills its own packed_sfens[thread_id] from here.
+        // * Lock the mutex before accessing it.
+        std::list<std::unique_ptr<PSVector>> packed_sfens_pool;
+    };
+}
@@ -0,0 +1,222 @@
+#ifndef _SFEN_STREAM_H_
+#define _SFEN_STREAM_H_
+
+#include "packed_sfen.h"
+
+#include "extra/nnue_data_binpack_format.h"
+
+#include <optional>
+#include <fstream>
+#include <string>
+#include <memory>
+
+namespace Learner {
+
+    // File formats that training data can be written in.
+    enum class SfenOutputType
+    {
+        // Plain sequence of fixed-size PackedSfenValue records (".bin").
+        Bin,
+
+        // Compressed format provided by the binpack library (".binpack").
+        Binpack
+    };
+
+    // Returns true if `lhs` ends with the character sequence `end`.
+    static bool ends_with(const std::string& lhs, const std::string& end)
+    {
+        if (end.size() > lhs.size()) return false;
+
+        return lhs.compare(lhs.size() - end.size(), end.size(), end) == 0;
+    }
+
+    // Returns true if `filename` ends with "." followed by `extension`.
+    // `extension` is given without the leading dot.
+    static bool has_extension(const std::string& filename, const std::string& extension)
+    {
+        return ends_with(filename, "." + extension);
+    }
+
+    // Returns `filename` unchanged if it already has the extension `ext`
+    // (given without the dot), otherwise `filename` with "." + ext appended.
+    //
+    // The check includes the dot, exactly like has_extension(): a name such
+    // as "cabin" does not count as having the extension "bin". Otherwise the
+    // file would be created under a name that open_sfen_input_file() and
+    // create_new_sfen_output() do not recognise later on.
+    static std::string filename_with_extension(const std::string& filename, const std::string& ext)
+    {
+        if (has_extension(filename, ext))
+            return filename;
+
+        return filename + "." + ext;
+    }
+
+    // Interface of a source of training positions.
+    struct BasicSfenInputStream
+    {
+        virtual ~BasicSfenInputStream() = default;
+
+        // Returns the next position, or std::nullopt if there is none.
+        virtual std::optional<PackedSfenValue> next() = 0;
+
+        // True once the stream is known to have nothing more to deliver.
+        virtual bool eof() const = 0;
+    };
+
+    // Reads PackedSfenValue records from a plain ".bin" file.
+    struct BinSfenInputStream : BasicSfenInputStream
+    {
+        static constexpr auto openmode = std::ios::in | std::ios::binary;
+        static inline const std::string extension = "bin";
+
+        // Opens `filename`; if that fails, the stream starts out at eof.
+        BinSfenInputStream(std::string filename) :
+            m_stream(filename, openmode),
+            m_eof(!m_stream)
+        {
+        }
+
+        // Reads one record. If a complete record cannot be read, eof is
+        // latched and std::nullopt is returned.
+        std::optional<PackedSfenValue> next() override
+        {
+            PackedSfenValue record;
+            const bool complete = static_cast<bool>(
+                m_stream.read(reinterpret_cast<char*>(&record), sizeof(PackedSfenValue)));
+
+            if (!complete)
+            {
+                m_eof = true;
+                return std::nullopt;
+            }
+
+            return record;
+        }
+
+        bool eof() const override { return m_eof; }
+
+        ~BinSfenInputStream() override = default;
+
+    private:
+        std::fstream m_stream;
+        bool m_eof;
+    };
+
+    // Reads training positions from a compressed ".binpack" file and
+    // converts them to PackedSfenValue records.
+    struct BinpackSfenInputStream : BasicSfenInputStream
+    {
+        static constexpr auto openmode = std::ios::in | std::ios::binary;
+        static inline const std::string extension = "binpack";
+
+        // Opens `filename`; the stream starts out at eof if the reader has
+        // no entry to offer.
+        BinpackSfenInputStream(std::string filename) :
+            m_stream(filename, openmode),
+            m_eof(!m_stream.hasNext())
+        {
+        }
+
+        // Returns the next entry converted to a PackedSfenValue, or latches
+        // eof and returns std::nullopt if there are no more entries.
+        std::optional<PackedSfenValue> next() override
+        {
+            static_assert(sizeof(binpack::nodchip::PackedSfenValue) == sizeof(PackedSfenValue));
+
+            if (m_stream.hasNext())
+            {
+                auto entry = m_stream.next();
+                auto converted = binpack::trainingDataEntryToPackedSfenValue(entry);
+
+                // The library has its own record type with the same layout,
+                // so the bytes can be copied over as they are.
+                PackedSfenValue result;
+                std::memcpy(&result, &converted, sizeof(PackedSfenValue));
+
+                return result;
+            }
+
+            m_eof = true;
+            return std::nullopt;
+        }
+
+        bool eof() const override { return m_eof; }
+
+        ~BinpackSfenInputStream() override = default;
+
+    private:
+        binpack::CompressedTrainingDataEntryReader m_stream;
+        bool m_eof;
+    };
+
+    // Interface of a sink for training positions.
+    struct BasicSfenOutputStream
+    {
+        virtual ~BasicSfenOutputStream() = default;
+
+        // Writes all positions in `sfens` to the output.
+        virtual void write(const PSVector& sfens) = 0;
+    };
+
+    // Appends PackedSfenValue records to a plain ".bin" file.
+    struct BinSfenOutputStream : BasicSfenOutputStream
+    {
+        static constexpr auto openmode = std::ios::out | std::ios::binary | std::ios::app;
+        static inline const std::string extension = "bin";
+
+        // Opens the file in append mode; the extension is added to
+        // `filename` if it is missing.
+        BinSfenOutputStream(std::string filename) :
+            m_stream(filename_with_extension(filename, extension), openmode)
+        {
+        }
+
+        // Writes the records as one contiguous block of raw bytes.
+        void write(const PSVector& sfens) override
+        {
+            const auto* bytes = reinterpret_cast<const char*>(sfens.data());
+            const auto byte_count = sizeof(PackedSfenValue) * sfens.size();
+
+            m_stream.write(bytes, byte_count);
+        }
+
+        ~BinSfenOutputStream() override = default;
+
+    private:
+        std::fstream m_stream;
+    };
+
+    // Appends training positions to a compressed ".binpack" file.
+    struct BinpackSfenOutputStream : BasicSfenOutputStream
+    {
+        static constexpr auto openmode = std::ios::out | std::ios::binary | std::ios::app;
+        static inline const std::string extension = "binpack";
+
+        // Opens the file in append mode; the extension is added to
+        // `filename` if it is missing.
+        BinpackSfenOutputStream(std::string filename) :
+            m_stream(filename_with_extension(filename, extension), openmode)
+        {
+        }
+
+        // Converts every record to the library's entry type and adds it to
+        // the compressed stream.
+        void write(const PSVector& sfens) override
+        {
+            static_assert(sizeof(binpack::nodchip::PackedSfenValue) == sizeof(PackedSfenValue));
+
+            for (const auto& record : sfens)
+            {
+                // The library has its own record type with the same layout,
+                // so the bytes can be copied over as they are.
+                binpack::nodchip::PackedSfenValue library_record;
+                std::memcpy(&library_record, &record, sizeof(binpack::nodchip::PackedSfenValue));
+
+                m_stream.addTrainingDataEntry(binpack::packedSfenValueToTrainingDataEntry(library_record));
+            }
+        }
+
+        ~BinpackSfenOutputStream() override = default;
+
+    private:
+        binpack::CompressedTrainingDataEntryWriter m_stream;
+    };
+
+    // Opens `filename` with the reader that matches its extension.
+    // Returns nullptr if the extension is neither ".bin" nor ".binpack".
+    inline std::unique_ptr<BasicSfenInputStream> open_sfen_input_file(const std::string& filename)
+    {
+        const bool is_bin = has_extension(filename, BinSfenInputStream::extension);
+        if (is_bin)
+            return std::make_unique<BinSfenInputStream>(filename);
+
+        const bool is_binpack = has_extension(filename, BinpackSfenInputStream::extension);
+        if (is_binpack)
+            return std::make_unique<BinpackSfenInputStream>(filename);
+
+        return nullptr;
+    }
+
+    // Creates the writer for the requested format. The writers themselves
+    // add the matching extension to `filename` if it is missing.
+    inline std::unique_ptr<BasicSfenOutputStream> create_new_sfen_output(const std::string& filename, SfenOutputType sfen_output_type)
+    {
+        if (sfen_output_type == SfenOutputType::Bin)
+            return std::make_unique<BinSfenOutputStream>(filename);
+
+        if (sfen_output_type == SfenOutputType::Binpack)
+            return std::make_unique<BinpackSfenOutputStream>(filename);
+
+        // Not reachable with a valid SfenOutputType.
+        assert(false);
+        return nullptr;
+    }
+
+    // Creates the writer that matches the extension of `filename`.
+    // Returns nullptr if the extension is neither ".bin" nor ".binpack".
+    inline std::unique_ptr<BasicSfenOutputStream> create_new_sfen_output(const std::string& filename)
+    {
+        const bool is_bin = has_extension(filename, BinSfenOutputStream::extension);
+        if (is_bin)
+            return std::make_unique<BinSfenOutputStream>(filename);
+
+        const bool is_binpack = has_extension(filename, BinpackSfenOutputStream::extension);
+        if (is_binpack)
+            return std::make_unique<BinpackSfenOutputStream>(filename);
+
+        return nullptr;
+    }
+}
+
+#endif
@@ -0,0 +1,206 @@
+#include "packed_sfen.h"
+#include "sfen_stream.h"
+
+#include "misc.h"
+
+#include "extra/nnue_data_binpack_format.h"
+
+#include "syzygy/tbprobe.h"
+
+#include <cstring>
+#include <filesystem>
+#include <fstream>
+#include <limits>
+#include <list>
+#include <memory>
+#include <optional>
+#include <shared_mutex>
+#include <thread>
+#include <atomic>
+
+using namespace std;
+
+namespace Learner {
+
+    // Helper class for exporting sfens.
+    //
+    // Producer threads append packed sfens to a private per-thread buffer via
+    // write(). Full (or flushed) buffers are handed over to a shared pool which
+    // a dedicated worker thread drains and writes to the output stream.
+    struct SfenWriter
+    {
+        // Amount of sfens required to flush the buffer.
+        static constexpr size_t SFEN_WRITE_SIZE = 5000;
+
+        // filename_        : name of the first output file
+        // thread_num       : number of producer threads (one private buffer each)
+        // save_count       : a new output file is started every `save_count` sfens
+        // sfen_output_type : format of the created output files
+        SfenWriter(string filename_, int thread_num, uint64_t save_count, SfenOutputType sfen_output_type)
+        {
+            sfen_buffers_pool.reserve((size_t)thread_num * 10);
+            sfen_buffers.resize(thread_num);
+
+            auto out = sync_region_cout.new_region();
+            out << "INFO (sfen_writer): Creating new data file at " << filename_ << endl;
+
+            sfen_format = sfen_output_type;
+            output_file_stream = create_new_sfen_output(filename_, sfen_format);
+            filename = filename_;
+            save_every = save_count;
+
+            finished = false;
+
+            // Capture `this` explicitly: the worker only ever touches members.
+            file_worker_thread = std::thread([this] { this->file_write_worker(); });
+        }
+
+        // Hands over all partially filled buffers, waits for the worker to
+        // write everything, then closes the output stream.
+        ~SfenWriter()
+        {
+            flush();
+
+            // Must be set only after flush(): the worker relies on this order
+            // to know that an empty pool means there is nothing left to write.
+            finished = true;
+            file_worker_thread.join();
+            output_file_stream.reset();
+
+#if !defined(NDEBUG)
+            {
+                // All buffers should be empty since file_worker_thread
+                // should have written everything before exiting.
+                for (const auto& p : sfen_buffers) { assert(p == nullptr); (void)p ; }
+                assert(sfen_buffers_pool.empty());
+            }
+#endif
+        }
+
+        // Appends `psv` to the buffer owned by producer `thread_id`. Once the
+        // buffer holds SFEN_WRITE_SIZE entries it is moved to the shared pool.
+        void write(size_t thread_id, const PackedSfenValue& psv)
+        {
+            // This buffer is prepared for each thread.
+            auto& buf = sfen_buffers[thread_id];
+
+            // There is no buffer the first time and immediately after the
+            // previous one has been handed over, so allocate one lazily.
+            if (!buf)
+            {
+                buf = std::make_unique<PSVector>();
+                buf->reserve(SFEN_WRITE_SIZE);
+            }
+
+            // Buffer is exclusive to this thread.
+            // There is no need for a critical section.
+            buf->push_back(psv);
+
+            if (buf->size() >= SFEN_WRITE_SIZE)
+            {
+                // Once it is in sfen_buffers_pool, the worker will do the rest.
+
+                // Critical section since sfen_buffers_pool is shared among threads.
+                std::unique_lock<std::mutex> lk(mutex);
+                sfen_buffers_pool.emplace_back(std::move(buf));
+            }
+        }
+
+        // Hands over the remaining content of every per-thread buffer.
+        void flush()
+        {
+            for (size_t i = 0; i < sfen_buffers.size(); ++i)
+            {
+                flush(i);
+            }
+        }
+
+        // Move what remains in the buffer for your thread to a buffer for writing to a file.
+        void flush(size_t thread_id)
+        {
+            std::unique_lock<std::mutex> lk(mutex);
+
+            auto& buf = sfen_buffers[thread_id];
+
+            // There is a case that buf==nullptr, so that check is necessary.
+            if (buf && buf->size() != 0)
+            {
+                sfen_buffers_pool.emplace_back(std::move(buf));
+            }
+        }
+
+        // Dedicated thread to write to file
+        void file_write_worker()
+        {
+            for (;;)
+            {
+                // Sample the flag BEFORE draining the pool. The destructor
+                // flushes first and sets `finished` afterwards, so if the flag
+                // is already set here, the drain below sees every buffer.
+                // The pool itself is only ever inspected under the mutex.
+                const bool no_more_input = finished;
+
+                vector<std::unique_ptr<PSVector>> buffers;
+                {
+                    std::unique_lock<std::mutex> lk(mutex);
+
+                    // Atomically take the filled buffers and
+                    // create a new buffer pool for threads to fill.
+                    buffers = std::move(sfen_buffers_pool);
+                    sfen_buffers_pool = std::vector<std::unique_ptr<PSVector>>();
+                }
+
+                if (buffers.empty())
+                {
+                    if (no_more_input)
+                        break;
+
+                    // Poor man's condition variable.
+                    sleep(100);
+                    continue;
+                }
+
+                for (auto& buf : buffers)
+                {
+                    output_file_stream->write(*buf);
+
+                    sfen_write_count += buf->size();
+
+                    // Add the processed number here, and if it exceeds save_every,
+                    // change the file name and reset this counter.
+                    sfen_write_count_current_file += buf->size();
+                    if (sfen_write_count_current_file >= save_every)
+                    {
+                        sfen_write_count_current_file = 0;
+
+                        // Sequential number attached to the file
+                        int n = (int)(sfen_write_count / save_every);
+
+                        // Switch to a new file named "<filename>_<n>"; replacing
+                        // the stream object releases the previous one.
+                        string new_filename = filename + "_" + std::to_string(n);
+                        output_file_stream = create_new_sfen_output(new_filename, sfen_format);
+
+                        auto out = sync_region_cout.new_region();
+                        out << "INFO (sfen_writer): Creating new data file at " << new_filename << endl;
+                    }
+                }
+            }
+        }
+
+    private:
+
+        std::unique_ptr<BasicSfenOutputStream> output_file_stream;
+
+        // A new output file is started after every save_every sfens are written.
+        uint64_t save_every = std::numeric_limits<uint64_t>::max();
+
+        // File name passed in the constructor
+        std::string filename;
+
+        // Thread to write to the file
+        std::thread file_worker_thread;
+
+        // Set by the destructor once all remaining data has been flushed.
+        atomic<bool> finished;
+
+        SfenOutputType sfen_format;
+
+        // Buffers before writing to file:
+        // sfen_buffers holds one private buffer per producer thread,
+        // sfen_buffers_pool holds the buffers waiting to be written.
+        // A private buffer is moved to the pool once it holds
+        // SFEN_WRITE_SIZE entries or when it is flushed.
+        std::vector<std::unique_ptr<PSVector>> sfen_buffers;
+        std::vector<std::unique_ptr<PSVector>> sfen_buffers_pool;
+
+        // Mutex required to access sfen_buffers_pool
+        std::mutex mutex;
+
+        // Number of sfens written in total, and the
+        // number of sfens written in the current file.
+        uint64_t sfen_write_count = 0;
+        uint64_t sfen_write_count_current_file = 0;
+    };
+}
@@ -0,0 +1,242 @@
+#include "transform.h"
+
+#include "sfen_stream.h"
+#include "packed_sfen.h"
+
+#include "thread.h"
+#include "position.h"
+#include "evaluate.h"
+
+#include "nnue/evaluate_nnue.h"
+
+#include <string>
+#include <map>
+#include <iostream>
+#include <cmath>
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+
+namespace Learner
+{
+    // Signature shared by all transform subcommands: each handler receives
+    // the command stream and parses its own arguments from it.
+    using CommandFunc = void(*)(std::istringstream&);
+
+    // Selects how nudge() moves the static evaluation towards the deep one.
+    enum struct NudgedStaticMode
+    {
+        Absolute,    // shift by at most `absolute_nudge`
+        Relative,    // scale by a factor within [1 - relative_nudge, 1 + relative_nudge]
+        Interpolate  // linear blend weighted by `interpolate_nudge`
+    };
+
+    // Settings of the `nudged_static` transform, pre-filled with defaults.
+    struct NudgedStaticParams
+    {
+        std::string input_filename = "in.binpack";
+        std::string output_filename = "out.binpack";
+        NudgedStaticMode mode = NudgedStaticMode::Absolute;
+        int absolute_nudge = 5;
+        float relative_nudge = 0.1;
+        float interpolate_nudge = 0.1;
+
+        // Raises negative absolute/relative nudges to zero.
+        // interpolate_nudge is left untouched.
+        void enforce_constraints()
+        {
+            if (relative_nudge < 0.0f)
+            {
+                relative_nudge = 0.0f;
+            }
+
+            if (absolute_nudge < 0)
+            {
+                absolute_nudge = 0;
+            }
+        }
+    };
+
+    // Returns the static evaluation moved towards the deep evaluation
+    // according to `params.mode`, saturated to the int16 range.
+    //
+    //   Absolute    : static + clamp(deep - static, -absolute_nudge, +absolute_nudge)
+    //   Relative    : static * clamp(deep / static, 1 - relative_nudge, 1 + relative_nudge)
+    //   Interpolate : static * (1 - interpolate_nudge) + deep * interpolate_nudge
+    [[nodiscard]] std::int16_t nudge(NudgedStaticParams& params, std::int16_t static_eval_i16, std::int16_t deep_eval_i16)
+    {
+        auto saturate_i32_to_i16 = [](int v) {
+            return static_cast<std::int16_t>(
+                std::clamp(
+                    v,
+                    (int)std::numeric_limits<std::int16_t>::min(),
+                    (int)std::numeric_limits<std::int16_t>::max()
+                )
+            );
+        };
+
+        // Saturate in the float domain first: converting a float that does not
+        // fit the destination integer type (or a NaN) is undefined behaviour.
+        // In-range values are truncated towards zero, as a plain cast would do.
+        auto saturate_f32_to_i16 = [](float v) {
+            if (std::isnan(v))
+                return std::int16_t{0};
+
+            return static_cast<std::int16_t>(
+                std::clamp(
+                    v,
+                    (float)std::numeric_limits<std::int16_t>::min(),
+                    (float)std::numeric_limits<std::int16_t>::max()
+                )
+            );
+        };
+
+        int static_eval = static_eval_i16;
+        int deep_eval = deep_eval_i16;
+
+        switch(params.mode)
+        {
+            case NudgedStaticMode::Absolute:
+                return saturate_i32_to_i16(
+                    static_eval + std::clamp(
+                        deep_eval - static_eval,
+                        -params.absolute_nudge,
+                        params.absolute_nudge
+                    )
+                );
+
+            case NudgedStaticMode::Relative:
+                // A zero static eval cannot be scaled: any finite factor
+                // leaves it at zero. Returning early also avoids 0/0 (NaN)
+                // when the deep eval is zero as well.
+                if (static_eval == 0)
+                    return 0;
+
+                return saturate_f32_to_i16(
+                    (float)static_eval * std::clamp(
+                        (float)deep_eval / (float)static_eval,
+                        (1.0f - params.relative_nudge),
+                        (1.0f + params.relative_nudge)
+                    )
+                );
+
+            case NudgedStaticMode::Interpolate:
+                return saturate_f32_to_i16(
+                    (float)static_eval * (1.0f - params.interpolate_nudge)
+                    + (float)deep_eval * params.interpolate_nudge
+                );
+
+            default:
+                assert(false);
+                return 0;
+        }
+    }
+
+    // Reads every position from params.input_filename, replaces its score with
+    // the nudged static evaluation (see nudge()) and writes the result to
+    // params.output_filename. Progress is reported on stdout; an unsupported
+    // input or output file type is reported on stderr and aborts the transform.
+    void do_nudged_static(NudgedStaticParams& params)
+    {
+        // Validate the input before creating the output so that a bad input
+        // name does not leave a freshly created output file behind.
+        auto in = Learner::open_sfen_input_file(params.input_filename);
+        if (in == nullptr)
+        {
+            std::cerr << "Invalid input file type.\n";
+            return;
+        }
+
+        auto out = Learner::create_new_sfen_output(params.output_filename);
+        if (out == nullptr)
+        {
+            std::cerr << "Invalid output file type.\n";
+            return;
+        }
+
+        Thread* th = Threads.main();
+        Position& pos = th->rootPos;
+        StateInfo si;
+
+        PSVector buffer;
+        const uint64_t batch_size = 1'000'000;
+
+        buffer.reserve(batch_size);
+
+        uint64_t num_processed = 0;
+
+        // Writes the buffered positions and reports the running total.
+        auto write_batch = [&]() {
+            num_processed += buffer.size();
+
+            out->write(buffer);
+            buffer.clear();
+
+            std::cout << "Processed " << num_processed << " positions.\n";
+        };
+
+        for (;;)
+        {
+            auto v = in->next();
+            if (!v.has_value())
+                break;
+
+            auto& ps = v.value();
+
+            pos.set_from_packed_sfen(ps.sfen, &si, th);
+            auto static_eval = Eval::evaluate(pos);
+            auto deep_eval = ps.score;
+            ps.score = nudge(params, static_eval, deep_eval);
+
+            buffer.emplace_back(ps);
+            if (buffer.size() >= batch_size)
+                write_batch();
+        }
+
+        // Write whatever is left of the last, partially filled batch.
+        if (!buffer.empty())
+            write_batch();
+
+        std::cout << "Finished.\n";
+    }
+
+    // Parses the arguments of the `nudged_static` subcommand from `is`,
+    // echoes the resulting settings and runs the transform.
+    // Unknown tokens are ignored; parsing stops at the first failed read.
+    void nudged_static(std::istringstream& is)
+    {
+        NudgedStaticParams params{};
+
+        for (std::string token; is >> token; )
+        {
+            if (token == "absolute")
+            {
+                params.mode = NudgedStaticMode::Absolute;
+                is >> params.absolute_nudge;
+            }
+            else if (token == "relative")
+            {
+                params.mode = NudgedStaticMode::Relative;
+                is >> params.relative_nudge;
+            }
+            else if (token == "interpolate")
+            {
+                params.mode = NudgedStaticMode::Interpolate;
+                is >> params.interpolate_nudge;
+            }
+            else if (token == "input_file")
+            {
+                is >> params.input_filename;
+            }
+            else if (token == "output_file")
+            {
+                is >> params.output_filename;
+            }
+        }
+
+        std::cout << "Performing transform nudged_static with parameters:\n";
+        std::cout << "input_file          : " << params.input_filename << '\n';
+        std::cout << "output_file         : " << params.output_filename << '\n';
+        std::cout << "\n";
+
+        // Only the setting belonging to the selected mode is echoed.
+        switch (params.mode)
+        {
+            case NudgedStaticMode::Absolute:
+                std::cout << "mode                : absolute\n";
+                std::cout << "absolute_nudge      : " << params.absolute_nudge << '\n';
+                break;
+
+            case NudgedStaticMode::Relative:
+                std::cout << "mode                : relative\n";
+                std::cout << "relative_nudge      : " << params.relative_nudge << '\n';
+                break;
+
+            case NudgedStaticMode::Interpolate:
+                std::cout << "mode                : interpolate\n";
+                std::cout << "interpolate_nudge   : " << params.interpolate_nudge << '\n';
+                break;
+        }
+        std::cout << '\n';
+
+        params.enforce_constraints();
+        do_nudged_static(params);
+    }
+
+    // Entry point of the transform command: initialises NNUE, reads the
+    // subcommand name from `is` and hands the rest of the stream to its handler.
+    // An unknown subcommand is reported on stdout and nothing else is done.
+    void transform(std::istringstream& is)
+    {
+        const std::map<std::string, CommandFunc> subcommands = {
+            { "nudged_static", &nudged_static }
+        };
+
+        Eval::NNUE::init();
+
+        std::string subcommand;
+        is >> subcommand;
+
+        const auto entry = subcommands.find(subcommand);
+        if (entry != subcommands.end())
+        {
+            const CommandFunc handler = entry->second;
+            handler(is);
+            return;
+        }
+
+        std::cout << "Invalid subcommand " << subcommand << ". Exiting...\n";
+    }
+
+}
@@ -0,0 +1,12 @@
+#ifndef _TRANSFORM_H_
+#define _TRANSFORM_H_
+
+#include <sstream>
+
+namespace Learner {
+
+    // Runs a data transform. The first token read from `is` names the
+    // subcommand; the remainder of the stream is passed on to that
+    // subcommand as its arguments. An unknown subcommand is reported on
+    // stdout and nothing is executed.
+    void transform(std::istringstream& is);
+
+}
+
+#endif
@@ -18,6 +18,8 @@

 #include <iostream>

+#include "nnue/evaluate_nnue.h"
+
 #include "bitboard.h"
 #include "endgame.h"
 #include "position.h"
@@ -35,6 +37,7 @@ int main(int argc, char* argv[]) {

  std::cout << engine_info() << std::endl;

+  CommandLine::init(argc, argv);
  UCI::init(Options);
  Tune::init();
  PSQT::init();
@@ -44,7 +47,7 @@ int main(int argc, char* argv[]) {
  Endgames::init();
  Threads.set(size_t(Options["Threads"]));
  Search::clear(); // After threads are up
-  Eval::init_NNUE();
+  Eval::NNUE::init();

  UCI::loop(argc, argv);

@@ -61,6 +61,8 @@ typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);

 using namespace std;

+SynchronizedRegionLogger sync_region_cout(std::cout);
+
 namespace {

 /// Version number. If Version is left empty, then compile date in the format
@@ -132,6 +134,7 @@ public:

 } // namespace

+
 /// engine_info() returns the full name of the current Stockfish version. This
 /// will be either "Stockfish <Tag> DD-MM-YY" (where DD-MM-YY is the date when
 /// the program was compiled) or "Stockfish <Version>", depending on whether
@@ -356,27 +359,11 @@ void std_aligned_free(void* ptr) {
 #endif
 }

-/// aligned_ttmem_alloc() will return suitably aligned memory, if possible using large pages.
-/// The returned pointer is the aligned one, while the mem argument is the one that needs
-/// to be passed to free. With c++17 some of this functionality could be simplified.
+/// aligned_large_pages_alloc() will return suitably aligned memory, if possible using large pages.

-#if defined(__linux__) && !defined(__ANDROID__)
+#if defined(_WIN32)

-void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
-
-  constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page sizes
-  size_t size = ((allocSize + alignment - 1) / alignment) * alignment; // multiple of alignment
-  if (posix_memalign(&mem, alignment, size))
-     mem = nullptr;
-#if defined(MADV_HUGEPAGE)
-  madvise(mem, allocSize, MADV_HUGEPAGE);
-#endif
-  return mem;
-}
-
-#elif defined(_WIN64)
-
-static void* aligned_ttmem_alloc_large_pages(size_t allocSize) {
+static void* aligned_large_pages_alloc_win(size_t allocSize) {

  HANDLE hProcessToken { };
  LUID luid { };
@@ -421,23 +408,10 @@ static void* aligned_ttmem_alloc_large_pages(size_t allocSize) {
  return mem;
 }

-void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
-
-  static bool firstCall = true;
+void* aligned_large_pages_alloc(size_t allocSize) {

  // Try to allocate large pages
-  mem = aligned_ttmem_alloc_large_pages(allocSize);
-
-  // Suppress info strings on the first call. The first call occurs before 'uci'
-  // is received and in that case this output confuses some GUIs.
-  if (!firstCall)
-  {
-      if (mem)
-          sync_cout << "info string Hash table allocation: Windows large pages used." << sync_endl;
-      else
-          sync_cout << "info string Hash table allocation: Windows large pages not used." << sync_endl;
-  }
-  firstCall = false;
+  void* mem = aligned_large_pages_alloc_win(allocSize);

  // Fall back to regular, page aligned, allocation if necessary
  if (!mem)
@@ -448,23 +422,31 @@ void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {

 #else

-void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
+void* aligned_large_pages_alloc(size_t allocSize) {

-  constexpr size_t alignment = 64; // assumed cache line size
-  size_t size = allocSize + alignment - 1; // allocate some extra space
-  mem = malloc(size);
-  void* ret = reinterpret_cast<void*>((uintptr_t(mem) + alignment - 1) & ~uintptr_t(alignment - 1));
-  return ret;
+#if defined(__linux__)
+  constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page size
+#else
+  constexpr size_t alignment = 4096; // assumed small page size
+#endif
+
+  // round up to multiples of alignment
+  size_t size = ((allocSize + alignment - 1) / alignment) * alignment;
+  void *mem = std_aligned_alloc(alignment, size);
+#if defined(MADV_HUGEPAGE)
+  madvise(mem, size, MADV_HUGEPAGE);
+#endif
+  return mem;
 }

 #endif


-/// aligned_ttmem_free() will free the previously allocated ttmem
+/// aligned_large_pages_free() will free the previously allocated ttmem

-#if defined(_WIN64)
+#if defined(_WIN32)

-void aligned_ttmem_free(void* mem) {
+void aligned_large_pages_free(void* mem) {

  if (mem && !VirtualFree(mem, 0, MEM_RELEASE))
  {
@@ -477,8 +459,8 @@ void aligned_ttmem_free(void* mem) {

 #else

-void aligned_ttmem_free(void *mem) {
-  free(mem);
+void aligned_large_pages_free(void *mem) {
+  std_aligned_free(mem);
 }

 #endif
@@ -590,6 +572,62 @@ void bindThisThread(size_t idx) {

 } // namespace WinProcGroup

+#ifdef _WIN32
+#include <direct.h>
+#define GETCWD _getcwd
+#else
+#include <unistd.h>
+#define GETCWD getcwd
+#endif
+
+namespace CommandLine {
+
+string argv0;            // path+name of the executable binary, as given by argv[0]
+string binaryDirectory;  // path of the executable directory
+string workingDirectory; // path of the working directory
+
+/// init() records argv[0], the current working directory and the directory
+/// of the executable (derived from argv[0]) in the variables above.
+void init(int argc, char* argv[]) {
+    string pathSeparator;
+
+    // extract the path+name of the executable binary.
+    // argv[0] can be absent (argc == 0) or null; constructing a std::string
+    // from a null pointer is undefined behaviour, so fall back to "".
+    argv0 = (argc > 0 && argv != nullptr && argv[0] != nullptr) ? argv[0] : "";
+
+#ifdef _WIN32
+    pathSeparator = "\\";
+  #ifdef _MSC_VER
+    // Under windows argv[0] may not have the extension. Also _get_pgmptr() had
+    // issues in some windows 10 versions, so check returned values carefully.
+    char* pgmptr = nullptr;
+    if (!_get_pgmptr(&pgmptr) && pgmptr != nullptr && *pgmptr)
+        argv0 = pgmptr;
+  #endif
+#else
+    pathSeparator = "/";
+#endif
+
+    // extract the working directory (left empty when it cannot be queried)
+    workingDirectory = "";
+    char buff[40000];
+    char* cwd = GETCWD(buff, 40000);
+    if (cwd)
+        workingDirectory = cwd;
+
+    // extract the binary directory path from argv0:
+    // keep everything up to and including the last path separator
+    binaryDirectory = argv0;
+    size_t pos = binaryDirectory.find_last_of("\\/");
+    if (pos == std::string::npos)
+        binaryDirectory = "." + pathSeparator;
+    else
+        binaryDirectory.resize(pos + 1);
+
+    // pattern replacement: "./" at the start of path is replaced by the working directory
+    // (only the leading '.' is replaced, the separator is kept)
+    if (binaryDirectory.find("." + pathSeparator) == 0)
+        binaryDirectory.replace(0, 1, workingDirectory);
+}
+
+
+} // namespace CommandLine
 // Returns a string that represents the current time. (Used when learning evaluation functions)
 std::string now_string()
 {
@@ -627,18 +665,27 @@ void* aligned_malloc(size_t size, size_t align)
    return p;
 }

+// Returns the size of the file behind `fs` in bytes, computed as the
+// distance between the stream's begin and end positions. The read position
+// the stream had on entry is restored before returning.
+std::uint64_t get_file_size(std::fstream& fs)
+{
+    const auto saved_pos = fs.tellg();
+
+    fs.seekg(0, std::ios_base::end);
+    const auto end_offset = static_cast<uint64_t>(fs.tellg());
+
+    fs.clear(); // Otherwise, the next seek may fail.
+
+    fs.seekg(0, std::ios_base::beg);
+    const auto begin_offset = static_cast<uint64_t>(fs.tellg());
+
+    fs.seekg(saved_pos);
+
+    return end_offset - begin_offset;
+}
+
 int read_file_to_memory(std::string filename, std::function<void* (uint64_t)> callback_func)
 {
    fstream fs(filename, ios::in | ios::binary);
    if (fs.fail())
        return 1;

-    fs.seekg(0, fstream::end);
-    uint64_t eofPos = (uint64_t)fs.tellg();
-    fs.clear(); // Otherwise the next seek may fail.
-    fs.seekg(0, fstream::beg);
-    uint64_t begPos = (uint64_t)fs.tellg();
-    uint64_t file_size = eofPos - begPos;
+    const uint64_t file_size = get_file_size(fs);
    //std::cout << "filename = " << filename << " , file_size = " << file_size << endl;

    // I know the file size, so call callback_func to get a buffer for this,
@@ -687,66 +734,3 @@ int write_memory_to_file(std::string filename, void* ptr, uint64_t size)
    fs.close();
    return 0;
 }
-
-// ----------------------------
-//     mkdir wrapper
-// ----------------------------
-
-// Specify relative to the current folder. Returns 0 on success, non-zero on failure.
-// Create a folder. Japanese is not used.
-// In case of gcc under msys2 environment, folder creation fails with _wmkdir(). Cause unknown.
-// Use _mkdir() because there is no help for it.
-
-#if defined(_WIN32)
-// for Windows
-
-#if defined(_MSC_VER)
-#include <codecvt> // I need this because I want wstring to mkdir
-#include <locale> // This is required for wstring_convert.
-
-namespace Dependency {
-    int mkdir(std::string dir_name)
-    {
-        std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> cv;
-        return _wmkdir(cv.from_bytes(dir_name).c_str());
-        //	::CreateDirectory(cv.from_bytes(dir_name).c_str(),NULL);
-    }
-}
-
-#elif defined(__GNUC__) 
-
-#include <direct.h>
-namespace Dependency {
-    int mkdir(std::string dir_name)
-    {
-        return _mkdir(dir_name.c_str());
-    }
-}
-
-#endif
-#elif defined(__linux__)
-
-// In the linux environment, this symbol _LINUX is defined in the makefile.
-
-// mkdir implementation for Linux.
-#include "sys/stat.h"
-
-namespace Dependency {
-    int mkdir(std::string dir_name)
-    {
-        return ::mkdir(dir_name.c_str(), 0777);
-    }
-}
-#else
-
-// In order to judge whether it is a Linux environment, we have to divide the makefile..
-// The function to dig a folder on linux is good for the time being... Only used to save the evaluation function file...
-
-namespace Dependency {
-    int mkdir(std::string dir_name)
-    {
-        return 0;
-    }
-}
-
-#endif
@@ -19,6 +19,7 @@
 #ifndef MISC_H_INCLUDED
 #define MISC_H_INCLUDED

+#include <algorithm>
 #include <cassert>
 #include <chrono>
 #include <functional>
@@ -27,6 +28,12 @@
 #include <string>
 #include <vector>

+#include <cstdint>
+#include <cmath>
+#include <cctype>
+#include <sstream>
+#include <deque>
+
 #include "types.h"

 const std::string engine_info(bool to_uci = false);
@@ -35,8 +42,8 @@ void prefetch(void* addr);
 void start_logger(const std::string& fname);
 void* std_aligned_alloc(size_t alignment, size_t size);
 void std_aligned_free(void* ptr);
-void* aligned_ttmem_alloc(size_t size, void*& mem);
-void aligned_ttmem_free(void* mem); // nop if mem == nullptr
+void* aligned_large_pages_alloc(size_t size); // memory aligned by page size, min alignment: 4096 bytes
+void aligned_large_pages_free(void* mem); // nop if mem == nullptr

 void dbg_hit_on(bool b);
 void dbg_hit_on(bool c, bool b);
@@ -44,9 +51,7 @@ void dbg_mean_of(int v);
 void dbg_print();

 typedef std::chrono::milliseconds::rep TimePoint; // A value in milliseconds
-
 static_assert(sizeof(TimePoint) == sizeof(int64_t), "TimePoint should be 64 bits");
-
 inline TimePoint now() {
  return std::chrono::duration_cast<std::chrono::milliseconds>
        (std::chrono::steady_clock::now().time_since_epoch()).count();
@@ -67,6 +72,232 @@ std::ostream& operator<<(std::ostream&, SyncCout);
 #define sync_cout std::cout << IO_LOCK
 #define sync_endl std::endl << IO_UNLOCK

+// Rounds `ptr` up to the next address that is a multiple of `Alignment`
+// (an already aligned pointer is returned unchanged).
+// The caller must make sure the underlying storage is large enough:
+// at least `sizeof(T) * N + alignment` bytes for `N` elements.
+template <uintptr_t Alignment, typename T>
+T* align_ptr_up(T* ptr)
+{
+  static_assert(alignof(T) < Alignment);
+
+  char* const raw = reinterpret_cast<char*>(ptr);
+  const uintptr_t address = reinterpret_cast<uintptr_t>(raw);
+
+  // Integer round-up: bump past the boundary, then truncate to a multiple.
+  const uintptr_t bumped = address + (Alignment - 1);
+  const uintptr_t aligned = bumped / Alignment * Alignment;
+
+  return reinterpret_cast<T*>(reinterpret_cast<char*>(aligned));
+}
+
+// This logger allows printing many parts in a region atomically
+// but doesn't block the threads trying to append to other regions.
+// Instead, if some region tries to print while another region holds
+// the lock, the messages are queued to be printed as soon as the
+// current region releases the lock.
+struct SynchronizedRegionLogger
+{
+  using RegionId = std::uint64_t;
+
+  // Handle for one output region. Text streamed into it goes straight to
+  // the output while the region is the oldest one still registered;
+  // otherwise it is queued. The region is released by unlock() or by
+  // the destructor. A default-constructed Region discards everything.
+  struct Region
+  {
+    friend struct SynchronizedRegionLogger;
+
+    Region() :
+      logger(nullptr), region_id(0), is_held(false)
+    {
+    }
+
+    Region(const Region&) = delete;
+    Region& operator=(const Region&) = delete;
+
+    Region(Region&& other) noexcept :
+      logger(other.logger), region_id(other.region_id), is_held(other.is_held)
+    {
+      other.logger = nullptr;
+      other.is_held = false;
+    }
+
+    Region& operator=(Region&& other) {
+      // Self-assignment must not release the region we are about to keep.
+      if (this == &other)
+        return *this;
+
+      // Give up the region currently owned before taking over the new one.
+      if (is_held && logger != nullptr)
+      {
+        logger->release_region(region_id);
+      }
+
+      logger = other.logger;
+      region_id = other.region_id;
+      is_held = other.is_held;
+
+      // Leave `other` detached, exactly like the move constructor does.
+      other.logger = nullptr;
+      other.is_held = false;
+
+      return *this;
+    }
+
+    ~Region() { unlock(); }
+
+    // Releases the region; calling it again has no further effect.
+    void unlock() {
+      if (is_held) {
+        is_held = false;
+
+        if (logger != nullptr)
+          logger->release_region(region_id);
+      }
+    }
+
+    // Overload for stream manipulators such as std::endl.
+    Region& operator << (std::ostream&(*pManip)(std::ostream&)) {
+      if (logger != nullptr)
+        logger->write(region_id, pManip);
+
+      return *this;
+    }
+
+    template <typename T>
+    Region& operator << (const T& value) {
+      if (logger != nullptr)
+        logger->write(region_id, value);
+
+      return *this;
+    }
+
+  private:
+    SynchronizedRegionLogger* logger;
+    RegionId region_id;
+    bool is_held;
+
+    Region(SynchronizedRegionLogger& log, RegionId id) :
+      logger(&log), region_id(id), is_held(true)
+    {
+    }
+  };
+
+private:
+  // Per-region state kept in the queue: the text that could not be printed
+  // yet and whether the owning Region handle has been released.
+  struct RegionBookkeeping
+  {
+    RegionBookkeeping(RegionId rid) : id(rid), is_held(true) {}
+
+    std::vector<std::string> pending_parts;
+    RegionId id;
+    bool is_held;
+  };
+
+  // Registers a new region at the back of the queue and returns its id.
+  RegionId init_next_region()
+  {
+    static RegionId next_id = 0;
+
+    std::lock_guard lock(mutex);
+
+    const auto id = next_id++;
+    regions.emplace_back(id);
+
+    return id;
+  }
+
+  void write(RegionId id, std::ostream&(*pManip)(std::ostream&)) {
+    std::lock_guard lock(mutex);
+
+    if (regions.empty())
+      return;
+
+    if (id == regions.front().id) {
+      // We can just directly print to the output because
+      // we are at the front of the region queue.
+      out << *pManip;
+    } else {
+      // We have to schedule the print until previous regions are
+      // processed
+      auto* region = find_region_nolock(id);
+      if (region == nullptr)
+        return;
+
+      std::stringstream ss;
+      ss << *pManip;
+      region->pending_parts.emplace_back(std::move(ss).str());
+    }
+  }
+
+  template <typename T>
+  void write(RegionId id, const T& value) {
+    std::lock_guard lock(mutex);
+
+    if (regions.empty())
+      return;
+
+    if (id == regions.front().id) {
+      // We can just directly print to the output because
+      // we are at the front of the region queue.
+      out << value;
+    } else {
+      // We have to schedule the print until previous regions are
+      // processed
+      auto* region = find_region_nolock(id);
+      if (region == nullptr)
+        return;
+
+      std::stringstream ss;
+      ss << value;
+      region->pending_parts.emplace_back(std::move(ss).str());
+    }
+  }
+
+  std::ostream& out;
+
+  // Regions in creation order; the front one is allowed to print directly.
+  std::deque<RegionBookkeeping> regions;
+
+  // Guards `regions` and all writes to `out`.
+  std::mutex mutex;
+
+  // Caller must hold `mutex`. Returns nullptr when the id is not registered.
+  RegionBookkeeping* find_region_nolock(RegionId id) {
+    // Linear search because the amount of concurrent regions should be small.
+    auto it = std::find_if(
+      regions.begin(),
+      regions.end(),
+      [id](const RegionBookkeeping& r) { return r.id == id; });
+
+    if (it == regions.end())
+      return nullptr;
+    else
+      return &*it;
+  }
+
+  void release_region(RegionId id) {
+    std::lock_guard lock(mutex);
+
+    auto* region = find_region_nolock(id);
+    if (region == nullptr)
+      return;
+
+    region->is_held = false;
+
+    process_backlog_nolock();
+  }
+
+  // Caller must hold `mutex`. Prints the queued text of the front region and
+  // removes released regions from the front until a still-held one is found.
+  void process_backlog_nolock()
+  {
+    while(!regions.empty()) {
+      auto& region = regions.front();
+
+      for(auto& part : region.pending_parts) {
+        out << part;
+      }
+
+      // The backlog has been printed; drop it so that it is not printed
+      // again the next time this function runs while the region is
+      // still at the front of the queue.
+      region.pending_parts.clear();
+
+      // If the region is still held then we don't
+      // want to start printing stuff from the next region.
+      if (region.is_held)
+        break;
+
+      regions.pop_front();
+    }
+  }
+
+public:
+
+  // `s` is the stream all regions print to; it must outlive the logger.
+  SynchronizedRegionLogger(std::ostream& s) :
+    out(s)
+  {
+  }
+
+  // Opens a new region. Its output appears after the output of all
+  // regions opened earlier.
+  [[nodiscard]] Region new_region() {
+    const auto id = init_next_region();
+    return Region(*this, id);
+  }
+
+};
+
+extern SynchronizedRegionLogger sync_region_cout;
+

 /// xorshift64star Pseudo-Random Number Generator
 /// This class is based on original code written and dedicated
@@ -83,6 +314,19 @@ std::ostream& operator<<(std::ostream&, SyncCout);
 /// For further analysis see
 ///   <http://vigna.di.unimi.it/ftp/papers/xorshift.pdf>

+// Hashes `str` to a 64-bit value: starting from a fixed constant, each
+// character is xor-ed in, the state is multiplied by a fixed constant and
+// then mixed with its own upper bits. An empty string yields the start value.
+static uint64_t string_hash(const std::string& str)
+{
+  constexpr uint64_t start_value = 525201411107845655ull;
+  constexpr uint64_t multiplier = 0x5bd1e9955bd1e995ull;
+
+  uint64_t state = start_value;
+
+  for (std::size_t i = 0; i < str.size(); ++i) {
+    state = (state ^ static_cast<uint64_t>(str[i])) * multiplier;
+    state ^= state >> 47;
+  }
+
+  return state;
+}
+
 class PRNG {

  uint64_t s;
@@ -94,7 +338,9 @@ class PRNG {
  }

 public:
+  // Seeds the generator from the current time (see set_seed_from_time()).
+  PRNG() { set_seed_from_time(); }
   PRNG(uint64_t seed) : s(seed) { assert(seed); }
+  // Seeds the generator from text; see set_seed(const std::string&) for
+  // how empty, all-digit and other strings are treated.
+  PRNG(const std::string& seed) { set_seed(seed); }

  template<typename T> T rand() { return T(rand64()); }

@@ -107,6 +353,40 @@ public:

  // Return the random seed used internally.
  uint64_t get_seed() const { return s; }
+
+  // Replaces the generator state with `seed`. Unlike the uint64_t
+  // constructor, no non-zero check is performed here.
+  void set_seed(uint64_t seed)
+  {
+    s = seed;
+  }
+
+  // Builds a seed for another generator out of this one's output.
+  // Each of the 64 rounds draws two numbers: the first selects a bit
+  // position, the second supplies the bit at that position.
+  // Because the shift follows the OR, the lowest bit of the result is
+  // always 0 and the bit gathered in the first round is shifted out.
+  uint64_t next_random_seed()
+  {
+    uint64_t result = 0;
+
+    int rounds_left = 64;
+    while (rounds_left-- > 0)
+    {
+      const uint64_t position = rand64() % 64;
+      const uint64_t sample = rand64();
+
+      result |= (sample >> position) & uint64_t(1);
+      result <<= 1;
+    }
+
+    return result;
+  }
+
+  // Seeds the generator with the system clock's current tick count
+  // (time since the clock's epoch).
+  void set_seed_from_time()
+  {
+      const auto since_epoch = std::chrono::system_clock::now().time_since_epoch();
+      set_seed(since_epoch.count());
+  }
+
+  // Seeds the generator from text:
+  //   - empty string        : seed taken from the current time
+  //   - decimal digits only : the number itself is the seed
+  //   - anything else       : the hash of the string is the seed
+  // NOTE(review): std::stoull throws std::out_of_range for digit strings
+  // that do not fit in unsigned long long; callers currently see that
+  // exception.
+  void set_seed(const std::string& str)
+  {
+    if (str.empty())
+    {
+      set_seed_from_time();
+    }
+    // std::isdigit requires a value representable as unsigned char;
+    // passing a negative plain char is undefined behaviour, hence the cast.
+    else if (std::all_of(str.begin(), str.end(),
+                         [](char c) { return std::isdigit(static_cast<unsigned char>(c)) != 0; })) {
+      set_seed(std::stoull(str));
+    }
+    else
+    {
+      set_seed(string_hash(str));
+    }
+  }
 };

 // Display a random seed. (For debugging)
@@ -130,6 +410,74 @@ inline uint64_t mul_hi64(uint64_t a, uint64_t b) {
 #endif
 }

+// This bitset can be accessed concurrently, provided
+// the concurrent accesses are performed on distinct
+// instances of underlying type. That means the concurrent
+// accesses need to be spaced by at least
+// bits_per_bucket bits.
+// But at least best_concurrent_access_stride bits
+// is recommended to prevent false sharing.
+template <uint64_t N>
+struct LargeBitset
+{
+private:
+    constexpr static uint64_t cache_line_size = 64;
+
+public:
+    using UnderlyingType = uint64_t;
+
+    constexpr static uint64_t num_bits = N;
+    constexpr static uint64_t bits_per_bucket = 8 * sizeof(uint64_t);
+    constexpr static uint64_t num_buckets = (num_bits + bits_per_bucket - 1) / bits_per_bucket;
+    constexpr static uint64_t best_concurrent_access_stride = 8 * cache_line_size;
+    // Starts with every bit cleared.
+    LargeBitset()
+    {
+        std::fill(std::begin(bits), std::end(bits), 0);
+    }
+    // Sets bit idx (idx is not range-checked).
+    void set(uint64_t idx)
+    {
+        const uint64_t bucket = idx / bits_per_bucket;
+        const uint64_t bit = uint64_t(1) << (idx % bits_per_bucket);
+        bits[bucket] |= bit;
+    }
+    // Returns whether bit idx is set (idx is not range-checked).
+    bool test(uint64_t idx) const
+    {
+        const uint64_t bucket = idx / bits_per_bucket;
+        const uint64_t bit = uint64_t(1) << (idx % bits_per_bucket);
+        return bits[bucket] & bit;
+    }
+    // Returns the number of set bits.
+    uint64_t count() const
+    {
+        uint64_t c = 0;
+        uint64_t i = 0;
+        // Unrolled by four; `i + 3 < num_buckets` cannot wrap, unlike `num_buckets - 3` with fewer than 3 buckets.
+        for (; i + 3 < num_buckets; i += 4)
+        {
+            uint64_t c0 = popcount(bits[i+0]);
+            uint64_t c1 = popcount(bits[i+1]);
+            uint64_t c2 = popcount(bits[i+2]);
+            uint64_t c3 = popcount(bits[i+3]);
+            c0 += c1;
+            c2 += c3;
+            c += c0 + c2;
+        }
+        // Remaining 0-3 buckets.
+        for (; i < num_buckets; ++i)
+        {
+            c += popcount(bits[i]);
+        }
+
+        return c;
+    }
+
+private:
+    alignas(cache_line_size) UnderlyingType bits[num_buckets];
+};
+
 /// Under Windows it is not possible for a process to run on more than one
 /// logical processor group. This usually means to be limited to use max 64
 /// cores. To overcome this, some special platform specific API should be
@@ -155,6 +503,7 @@ std::string now_string();
 // Also, if the buffer cannot be allocated in the callback function or if the file size is different from the expected file size,
 // Return nullptr. At this time, read_file_to_memory() interrupts reading and returns with an error.

+std::uint64_t get_file_size(std::fstream& fs);
 int read_file_to_memory(std::string filename, std::function<void* (uint64_t)> callback_func);
 int write_memory_to_file(std::string filename, void* ptr, uint64_t size);

@@ -165,7 +514,9 @@ int write_memory_to_file(std::string filename, void* ptr, uint64_t size);
 // async version of PRNG
 struct AsyncPRNG
 {
+  AsyncPRNG() : prng() { }
  AsyncPRNG(uint64_t seed) : prng(seed) { assert(seed); }
+  AsyncPRNG(const std::string& seed) : prng(seed) { }
  // [ASYNC] Extract one random number.
  template<typename T> T rand() {
    std::unique_lock<std::mutex> lk(mutex);
@@ -199,20 +550,51 @@ inline std::ostream& operator<<(std::ostream& os, AsyncPRNG& prng)

 // Mathematical function used for progress calculation and learning
 namespace Math {
-	// Sigmoid function
-	// = 1.0 / (1.0 + std::exp(-x))
-	double sigmoid(double x);
+    inline double sigmoid(double x)  // logistic function: 1 / (1 + e^(-x))
+    {
+        return 1.0 / (1.0 + std::exp(-x));
+    }

-	// Differentiation of sigmoid function
-	// = sigmoid(x) * (1.0-sigmoid(x))
-	double dsigmoid(double x);
+    inline double dsigmoid(double x)
+    {
+        // Derivative of the sigmoid function
+        //   f(x)  = 1 / (1 + exp(-x))
+        //   f'(x) = f(x) * (1 - f(x))
+        // f(x) is evaluated once and reused for both factors.
+        const double fx = sigmoid(x);
+
+        return fx * (1.0 - fx);
+    }

 	// Clip v so that it fits between [lo,hi].
 	// * In Stockfish, this function is written in bitboard.h.
 	template<class T> constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
 		return v < lo ? lo : v > hi ? hi : v;
 	}
+}

+namespace Algo {
+    // In-place Fisher-Yates shuffle; prng.rand(k) is expected to return an index in [0, k).
+    template <typename Rng, typename T>
+    void shuffle(std::vector<T>& buf, Rng&& prng)
+    {
+        const uint64_t count = buf.size();
+        for (uint64_t pos = 0, left = count; pos < count; ++pos, --left)
+            std::swap(buf[pos], buf[pos + prng.rand(left)]);
+    }
+
+    // Split `input` on `delimiter`. Empty fields between delimiters are kept, but a
+    // trailing delimiter does not produce a final empty field (std::getline semantics).
+    inline std::vector<std::string> split(const std::string& input, char delimiter) {
+        std::vector<std::string> parts;
+        std::istringstream reader(input);
+
+        for (std::string token; std::getline(reader, token, delimiter); ) {
+            parts.push_back(token);
+        }
+
+        return parts;
+    }
 }

 // --------------------
@@ -225,7 +607,7 @@ struct Path
 {
 	// Combine the path name and file name and return it.
 	// If the folder name is not an empty string, append it if there is no'/' or'\\' at the end.
-	static std::string Combine(const std::string& folder, const std::string& filename)
+	static std::string combine(const std::string& folder, const std::string& filename)
 	{
 		if (folder.length() >= 1 && *folder.rbegin() != '/' && *folder.rbegin() != '\\')
 			return folder + "/" + filename;
@@ -234,7 +616,7 @@ struct Path
 	}

 	// Get the file name part (excluding the folder name) from the full path expression.
-	static std::string GetFileName(const std::string& path)
+	static std::string get_file_name(const std::string& path)
 	{
 		// I don't know which "\" or "/" is used.
 		auto path_index1 = path.find_last_of("\\") + 1;
@@ -259,7 +641,24 @@ public:
  template <typename U> AlignedAllocator(const AlignedAllocator<U>&) {}

  T* allocate(std::size_t n) { return (T*)std_aligned_alloc(alignof(T), n * sizeof(T)); }
-  void deallocate(T* p, std::size_t n) { std_aligned_free(p); }
+  void deallocate(T* p, std::size_t ) { std_aligned_free(p); }
+};
+
+// Allocator that requests every block from std_aligned_alloc with a fixed 64-byte alignment.
+template <typename T>
+class CacheLineAlignedAllocator {
+public:
+    using value_type = T;
+    constexpr static uint64_t cache_line_size = 64;
+    // The allocator holds no state, so every constructor is empty.
+    CacheLineAlignedAllocator() {}
+    CacheLineAlignedAllocator(const CacheLineAlignedAllocator&) {}
+    CacheLineAlignedAllocator(CacheLineAlignedAllocator&&) {}
+    // Converting constructor from the same allocator for another element type.
+    template <typename U> CacheLineAlignedAllocator(const CacheLineAlignedAllocator<U>&) {}
+    // NOTE(review): the alignment is always cache_line_size, even if alignof(T) is larger.
+    T* allocate(std::size_t n) { return (T*)std_aligned_alloc(cache_line_size, n * sizeof(T)); }
+    void deallocate(T* p, std::size_t) { std_aligned_free(p); }
 };

 // --------------------
@@ -273,11 +672,13 @@ namespace Dependency
  // So when calling getline() on fstream,
  // just write getline() instead of std::getline() and use this function.
  extern bool getline(std::ifstream& fs, std::string& s);
+}

-  // Create a folder.
-  // Specify relative to the current folder. Japanese is not used for dir_name.
-  // Returns 0 on success, non-zero on failure.
-  extern int mkdir(std::string dir_name);
+namespace CommandLine {
+  void init(int argc, char* argv[]);
+
+  extern std::string binaryDirectory;  // path of the executable directory
+  extern std::string workingDirectory; // path of the working directory
 }

 #endif // #ifndef MISC_H_INCLUDED
@@ -73,8 +73,9 @@ MovePicker::MovePicker(const Position& p, Move ttm, Depth d, const ButterflyHist
  assert(d <= 0);

  stage = (pos.checkers() ? EVASION_TT : QSEARCH_TT) +
-           !(ttm && (depth > DEPTH_QS_RECAPTURES || to_sq(ttm) == recaptureSquare)
-                 && pos.pseudo_legal(ttm));
+          !(   ttm
+            && (pos.checkers() || depth > DEPTH_QS_RECAPTURES || to_sq(ttm) == recaptureSquare)
+            && pos.pseudo_legal(ttm));
 }

 /// MovePicker constructor for ProbCut: we generate captures with SEE greater
@@ -0,0 +1,54 @@
+/*
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// Definition of input features and network structure used in NNUE evaluation function
+
+#ifndef NNUE_HALFKA_256X2_32_32_H_INCLUDED
+#define NNUE_HALFKA_256X2_32_32_H_INCLUDED
+
+#include "nnue/features/feature_set.h"
+#include "nnue/features/half_ka.h"
+
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"
+
+namespace Eval::NNUE {
+
+    // Input feature set for this architecture: a single HalfKA block (kFriend side).
+    using RawFeatures = Features::FeatureSet<
+        Features::HalfKA<Features::Side::kFriend>>;
+
+    // Width of the transformed feature vector; the input slice below is twice this (512).
+    constexpr IndexType kTransformedFeatureDimensions = 256;
+
+    namespace Layers {
+
+        // Stack: 512 inputs -> Affine 32 + ClippedReLU -> Affine 32 + ClippedReLU -> Affine 1
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+
+    }  // namespace Layers
+    // The network is named by its last layer, which nests the earlier layers as template arguments.
+    using Network = Layers::OutputLayer;
+
+}  // namespace Eval::NNUE
+
+#endif // #ifndef NNUE_HALFKA_256X2_32_32_H_INCLUDED
@@ -1,42 +1,57 @@
+/*
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
 // Definition of input features and network structure used in NNUE evaluation function

-#ifndef HALFKP_CR_EP_256X2_32_32_H
-#define HALFKP_CR_EP_256X2_32_32_H
+#ifndef NNUE_HALFKP_CR_EP_256X2_32_32_H_INCLUDED
+#define NNUE_HALFKP_CR_EP_256X2_32_32_H_INCLUDED

-#include "../features/feature_set.h"
-#include "../features/half_kp.h"
-#include "../features/castling_right.h"
-#include "../features/enpassant.h"
+#include "nnue/features/feature_set.h"
+#include "nnue/features/half_kp.h"
+#include "nnue/features/castling_right.h"
+#include "nnue/features/enpassant.h"

-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"

-namespace Eval {
-
-  namespace NNUE {
+namespace Eval::NNUE {

    // Input features used in evaluation function
    using RawFeatures = Features::FeatureSet<
-      Features::HalfKP<Features::Side::kFriend>, Features::CastlingRight,
-      Features::EnPassant>;
+        Features::HalfKP<Features::Side::kFriend>, Features::CastlingRight,
+        Features::EnPassant>;

    // Number of input feature dimensions after conversion
    constexpr IndexType kTransformedFeatureDimensions = 256;

    namespace Layers {

-      // define network structure
-      using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-      using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-      using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-      using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+        // Define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;

    }  // namespace Layers

    using Network = Layers::OutputLayer;

-  }  // namespace NNUE
+}  // namespace Eval::NNUE

-}  // namespace Eval
-#endif // HALFKP_CR_EP_256X2_32_32_H
+#endif // #ifndef NNUE_HALFKP_CR_EP_256X2_32_32_H_INCLUDED
@@ -0,0 +1,37 @@
+// Definition of input features and network structure used in NNUE evaluation function
+
+#ifndef NNUE_HALFKP_CR_256X2_32_32_H_INCLUDED
+#define NNUE_HALFKP_CR_256X2_32_32_H_INCLUDED
+
+#include "nnue/features/feature_set.h"
+#include "nnue/features/half_kp.h"
+#include "nnue/features/castling_right.h"
+
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"
+
+namespace Eval::NNUE {
+
+    // Input feature set for this architecture: HalfKP (kFriend side) plus CastlingRight.
+    using RawFeatures = Features::FeatureSet<
+        Features::HalfKP<Features::Side::kFriend>, Features::CastlingRight>;
+
+    // Width of the transformed feature vector; the input slice below is twice this (512).
+    constexpr IndexType kTransformedFeatureDimensions = 256;
+
+    namespace Layers {
+
+        // Stack: 512 inputs -> Affine 32 + ClippedReLU -> Affine 32 + ClippedReLU -> Affine 1
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+
+    }  // namespace Layers
+    // The network is named by its last layer, which nests the earlier layers as template arguments.
+    using Network = Layers::OutputLayer;
+
+}  // namespace Eval::NNUE
+
+#endif // #ifndef NNUE_HALFKP_CR_256X2_32_32_H_INCLUDED
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)

-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.

-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.

-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

 // Definition of input features and network structure used in NNUE evaluation function
@@ -21,33 +21,33 @@
 #ifndef NNUE_HALFKP_256X2_32_32_H_INCLUDED
 #define NNUE_HALFKP_256X2_32_32_H_INCLUDED

-#include "../features/feature_set.h"
-#include "../features/half_kp.h"
+#include "nnue/features/feature_set.h"
+#include "nnue/features/half_kp.h"

-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"

 namespace Eval::NNUE {

-// Input features used in evaluation function
-using RawFeatures = Features::FeatureSet<
-    Features::HalfKP<Features::Side::kFriend>>;
+    // Input features used in evaluation function
+    using RawFeatures = Features::FeatureSet<
+        Features::HalfKP<Features::Side::kFriend>>;

-// Number of input feature dimensions after conversion
-constexpr IndexType kTransformedFeatureDimensions = 256;
+    // Number of input feature dimensions after conversion
+    constexpr IndexType kTransformedFeatureDimensions = 256;

-namespace Layers {
+    namespace Layers {

-// Define network structure
-using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+        // Define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;

-}  // namespace Layers
+    }  // namespace Layers

-using Network = Layers::OutputLayer;
+    using Network = Layers::OutputLayer;

 }  // namespace Eval::NNUE

@@ -3,37 +3,33 @@
 #ifndef HALFKP_384X2_32_32_H
 #define HALFKP_384X2_32_32_H

-#include "../features/feature_set.h"
-#include "../features/half_kp.h"
+#include "nnue/features/feature_set.h"
+#include "nnue/features/half_kp.h"

-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"

-namespace Eval {
+namespace Eval::NNUE {

-namespace NNUE {
+    // Input features used in evaluation function
+    using RawFeatures = Features::FeatureSet<
+        Features::HalfKP<Features::Side::kFriend>>;

-// Input features used in evaluation function
-using RawFeatures = Features::FeatureSet<
-    Features::HalfKP<Features::Side::kFriend>>;
+    // Number of input feature dimensions after conversion
+    constexpr IndexType kTransformedFeatureDimensions = 384;

-// Number of input feature dimensions after conversion
-constexpr IndexType kTransformedFeatureDimensions = 384;
+    namespace Layers {

-namespace Layers {
+        // define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;

-// define network structure
-using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+    }  // namespace Layers

-}  // namespace Layers
+    using Network = Layers::OutputLayer;

-using Network = Layers::OutputLayer;
-
-}  // namespace NNUE
-
-}  // namespace Eval
+}  // namespace Eval::NNUE
 #endif // HALFKP_384X2_32_32_H
@@ -1,42 +0,0 @@
-// Definition of input features and network structure used in NNUE evaluation function
-
-#ifndef K_P_CR_EP_256X2_32_32_H
-#define K_P_CR_EP_256X2_32_32_H
-
-#include "../features/feature_set.h"
-#include "../features/k.h"
-#include "../features/p.h"
-#include "../features/castling_right.h"
-#include "../features/enpassant.h"
-
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
-
-namespace Eval {
-
-  namespace NNUE {
-
-    // Input features used in evaluation function
-    using RawFeatures = Features::FeatureSet<Features::K, Features::P,
-      Features::CastlingRight, Features::EnPassant>;
-
-    // Number of input feature dimensions after conversion
-    constexpr IndexType kTransformedFeatureDimensions = 256;
-
-    namespace Layers {
-
-      // define network structure
-      using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-      using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-      using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-      using OutputLayer = AffineTransform<HiddenLayer2, 1>;
-
-    }  // namespace Layers
-
-    using Network = Layers::OutputLayer;
-
-  }  // namespace NNUE
-
-}  // namespace Eval
-#endif // K_P_CR_EP_256X2_32_32_H
@@ -1,41 +0,0 @@
-// Definition of input features and network structure used in NNUE evaluation function
-
-#ifndef K_P_CR_256X2_32_32_H
-#define K_P_CR_256X2_32_32_H
-
-#include "../features/feature_set.h"
-#include "../features/k.h"
-#include "../features/p.h"
-#include "../features/castling_right.h"
-
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
-
-namespace Eval {
-
-  namespace NNUE {
-
-    // Input features used in evaluation function
-    using RawFeatures = Features::FeatureSet<Features::K, Features::P,
-      Features::CastlingRight>;
-
-    // Number of input feature dimensions after conversion
-    constexpr IndexType kTransformedFeatureDimensions = 256;
-
-    namespace Layers {
-
-      // define network structure
-      using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-      using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-      using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-      using OutputLayer = AffineTransform<HiddenLayer2, 1>;
-
-    }  // namespace Layers
-
-    using Network = Layers::OutputLayer;
-
-  }  // namespace NNUE
-
-}  // namespace Eval
-#endif // K_P_CR_256X2_32_32_H
@@ -1,38 +0,0 @@
-// Definition of input features and network structure used in NNUE evaluation function
-#ifndef K_P_256X2_32_32_H
-#define K_P_256X2_32_32_H
-
-#include "../features/feature_set.h"
-#include "../features/k.h"
-#include "../features/p.h"
-
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
-
-namespace Eval {
-
-namespace NNUE {
-
-// Input features used in evaluation function
-using RawFeatures = Features::FeatureSet<Features::K, Features::P>;
-
-// Number of input feature dimensions after conversion
-constexpr IndexType kTransformedFeatureDimensions = 256;
-
-namespace Layers {
-
-// define network structure
-using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-using OutputLayer = AffineTransform<HiddenLayer2, 1>;
-
-}  // namespace Layers
-
-using Network = Layers::OutputLayer;
-
-}  // namespace NNUE
-
-}  // namespace Eval
-#endif // K_P_256X2_32_32_H
@@ -18,20 +18,29 @@

 // Code for calculating NNUE evaluation function

-#include <fstream>
+#include "evaluate_nnue.h"
+
+#include "position.h"
+#include "misc.h"
+#include "uci.h"
+#include "types.h"
+
 #include <iostream>
+#include <string>
+#include <fstream>
 #include <set>

 #include "../evaluate.h"
 #include "../position.h"
 #include "../misc.h"
 #include "../uci.h"
+#include "../types.h"

 #include "evaluate_nnue.h"

 namespace Eval::NNUE {

-  uint32_t kpp_board_index[PIECE_NB][COLOR_NB] = {
+  const uint32_t kpp_board_index[PIECE_NB][COLOR_NB] = {
   // convention: W - us, B - them
   // viewed from other side, W and B are reversed
      { PS_NONE,     PS_NONE     },
@@ -53,7 +62,7 @@ namespace Eval::NNUE {
  };

  // Input feature converter
-  AlignedPtr<FeatureTransformer> feature_transformer;
+  LargePagePtr<FeatureTransformer> feature_transformer;

  // Evaluation function
  AlignedPtr<Network> network;
@@ -65,50 +74,77 @@ namespace Eval::NNUE {
  std::string savedfileName = "nn.bin";

  // Get a string that represents the structure of the evaluation function
-  std::string GetArchitectureString() {
-    return "Features=" + FeatureTransformer::GetStructureString() +
-      ",Network=" + Network::GetStructureString();
+  std::string get_architecture_string() {
+    return "Features=" + FeatureTransformer::get_structure_string() +
+        ",Network=" + Network::get_structure_string();
  }

+  std::string get_layers_info() {  // feature transformer's layer info, a newline, then the network's layer info
+    return
+        FeatureTransformer::get_layers_info()
+        + '\n' + Network::get_layers_info();
+  }
+
+  UseNNUEMode useNNUE;
+  std::string eval_file_loaded = "None";
+
  namespace Detail {

  // Initialize the evaluation function parameters
  template <typename T>
-  void Initialize(AlignedPtr<T>& pointer) {
+  void initialize(AlignedPtr<T>& pointer) {

    pointer.reset(reinterpret_cast<T*>(std_aligned_alloc(alignof(T), sizeof(T))));
    std::memset(pointer.get(), 0, sizeof(T));
  }

+  // Allocate one T with aligned_large_pages_alloc(), hand it to `pointer`, and zero-fill it.
+  template <typename T>
+  void initialize(LargePagePtr<T>& pointer) {
+    static_assert(alignof(T) <= 4096, "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
+    pointer.reset(reinterpret_cast<T*>(aligned_large_pages_alloc(sizeof(T))));
+    std::memset(pointer.get(), 0, sizeof(T));  // NOTE(review): the allocation result is not checked before this write
+  }
+
  // Read evaluation function parameters
  template <typename T>
-  bool ReadParameters(std::istream& stream, const AlignedPtr<T>& pointer) {
+  bool ReadParameters(std::istream& stream, T& reference) {

    std::uint32_t header;
    header = read_little_endian<std::uint32_t>(stream);
    if (!stream || header != T::GetHashValue()) return false;
-    return pointer->ReadParameters(stream);
+    return reference.ReadParameters(stream);
  }

  // write evaluation function parameters
  template <typename T>
  bool WriteParameters(std::ostream& stream, const AlignedPtr<T>& pointer) {
    constexpr std::uint32_t header = T::GetHashValue();
+
    stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
+
    return pointer->WriteParameters(stream);
  }

+  template <typename T>
+  bool WriteParameters(std::ostream& stream, const LargePagePtr<T>& pointer) {
+    constexpr std::uint32_t header = T::GetHashValue();
+    // Write T's 32-bit hash as a header. NOTE(review): raw host byte order; the read side uses read_little_endian.
+    stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
+    // Then let the object write its own parameters; its result is what gets returned.
+    return pointer->WriteParameters(stream);
+  }
  }  // namespace Detail

  // Initialize the evaluation function parameters
-  void Initialize() {
+  void initialize() {

-    Detail::Initialize(feature_transformer);
-    Detail::Initialize(network);
+    Detail::initialize(feature_transformer);
+    Detail::initialize(network);
  }

  // Read network header
-  bool ReadHeader(std::istream& stream, std::uint32_t* hash_value, std::string* architecture)
+  bool read_header(std::istream& stream, std::uint32_t* hash_value, std::string* architecture)
  {
    std::uint32_t version, size;

@@ -122,13 +158,17 @@ namespace Eval::NNUE {
  }

  // write the header
-  bool WriteHeader(std::ostream& stream,
+  bool write_header(std::ostream& stream,
    std::uint32_t hash_value, const std::string& architecture) {
+
    stream.write(reinterpret_cast<const char*>(&kVersion), sizeof(kVersion));
    stream.write(reinterpret_cast<const char*>(&hash_value), sizeof(hash_value));
+
    const std::uint32_t size = static_cast<std::uint32_t>(architecture.size());
+
    stream.write(reinterpret_cast<const char*>(&size), sizeof(size));
    stream.write(architecture.data(), size);
+
    return !stream.fail();
  }

@@ -137,81 +177,176 @@ namespace Eval::NNUE {

    std::uint32_t hash_value;
    std::string architecture;
-    if (!ReadHeader(stream, &hash_value, &architecture)) return false;
+    if (!read_header(stream, &hash_value, &architecture)) return false;
    if (hash_value != kHashValue) return false;
-    if (!Detail::ReadParameters(stream, feature_transformer)) return false;
-    if (!Detail::ReadParameters(stream, network)) return false;
+    if (!Detail::ReadParameters(stream, *feature_transformer)) return false;
+    if (!Detail::ReadParameters(stream, *network)) return false;
    return stream && stream.peek() == std::ios::traits_type::eof();
  }

  // write evaluation function parameters
  bool WriteParameters(std::ostream& stream) {
-    if (!WriteHeader(stream, kHashValue, GetArchitectureString())) return false;
-    if (!Detail::WriteParameters(stream, feature_transformer)) return false;
-    if (!Detail::WriteParameters(stream, network)) return false;
+
+    if (!write_header(stream, kHashValue, get_architecture_string()))
+        return false;
+
+    if (!Detail::WriteParameters(stream, feature_transformer))
+        return false;
+
+    if (!Detail::WriteParameters(stream, network))
+        return false;
+
    return !stream.fail();
-  }
-
-  // Proceed with the difference calculation if possible
-  static void UpdateAccumulatorIfPossible(const Position& pos) {
-
-    feature_transformer->UpdateAccumulatorIfPossible(pos);
-  }
-
-  // Calculate the evaluation value
-  static Value ComputeScore(const Position& pos, bool refresh) {
-
-    auto& accumulator = pos.state()->accumulator;
-    if (!refresh && accumulator.computed_score) {
-      return accumulator.score;
-    }
-
-    alignas(kCacheLineSize) TransformedFeatureType
-        transformed_features[FeatureTransformer::kBufferSize];
-    feature_transformer->Transform(pos, transformed_features, refresh);
-    alignas(kCacheLineSize) char buffer[Network::kBufferSize];
-    const auto output = network->Propagate(transformed_features, buffer);
-
-    auto score = static_cast<Value>(output[0] / FV_SCALE);
-
-    accumulator.score = score;
-    accumulator.computed_score = true;
-    return accumulator.score;
-  }
-
-  // Load the evaluation function file
-  bool load_eval_file(const std::string& evalFile) {
-
-    Initialize();
-
-    if (Options["SkipLoadingEval"])
-    {
-      std::cout << "info string SkipLoadingEval set to true, Net not loaded!" << std::endl;
-      return true;
-    }
-
-    fileName = evalFile;
-
-    std::ifstream stream(evalFile, std::ios::binary);
-
-    const bool result = ReadParameters(stream);
-
-    return result;
-  }
+}

  // Evaluation function. Perform differential calculation.
  Value evaluate(const Position& pos) {
-    return ComputeScore(pos, false);
+
+    // We manually align the arrays on the stack because with gcc < 9.3
+    // overaligning stack variables with alignas() doesn't work correctly.
+
+    constexpr uint64_t alignment = kCacheLineSize;
+
+#if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN)
+    TransformedFeatureType transformed_features_unaligned[
+      FeatureTransformer::kBufferSize + alignment / sizeof(TransformedFeatureType)];
+    char buffer_unaligned[Network::kBufferSize + alignment];
+
+    auto* transformed_features = align_ptr_up<alignment>(&transformed_features_unaligned[0]);
+    auto* buffer = align_ptr_up<alignment>(&buffer_unaligned[0]);
+#else
+    alignas(alignment)
+      TransformedFeatureType transformed_features[FeatureTransformer::kBufferSize];
+    alignas(alignment) char buffer[Network::kBufferSize];
+#endif
+
+    ASSERT_ALIGNED(transformed_features, alignment);
+    ASSERT_ALIGNED(buffer, alignment);
+
+    feature_transformer->Transform(pos, transformed_features);
+    const auto output = network->Propagate(transformed_features, buffer);
+
+    return static_cast<Value>(output[0] / FV_SCALE);
  }

-  // Evaluation function. Perform full calculation.
-  Value compute_eval(const Position& pos) {
-    return ComputeScore(pos, true);
+  // Load eval, from a file stream or a memory stream
+  bool load_eval(std::string name, std::istream& stream) {
+
+    initialize();
+    fileName = name;
+    return ReadParameters(stream);
+}
+
+static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
+{
+  if (mode == "false")
+    return UseNNUEMode::False;
+  else if (mode == "true")
+     return UseNNUEMode::True;
+  else if (mode == "pure")
+    return UseNNUEMode::Pure;
+
+  return UseNNUEMode::False;
+}
+
+void init() {
+
+  useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
+
+  if (Options["SkipLoadingEval"] || useNNUE == UseNNUEMode::False)
+  {
+    eval_file_loaded.clear();
+    return;
  }

-  // Proceed with the difference calculation if possible
-  void update_eval(const Position& pos) {
-    UpdateAccumulatorIfPossible(pos);
+  std::string eval_file = std::string(Options["EvalFile"]);
+
+#if defined(DEFAULT_NNUE_DIRECTORY)
+#define stringify2(x) #x
+#define stringify(x) stringify2(x)
+  std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory , stringify(DEFAULT_NNUE_DIRECTORY) };
+#else
+  std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory };
+#endif
+
+  for (std::string directory : dirs)
+  {
+    if (eval_file_loaded != eval_file)
+    {
+      std::ifstream stream(directory + eval_file, std::ios::binary);
+      if (load_eval(eval_file, stream))
+      {
+        sync_cout << "info string Loaded eval file " << directory + eval_file << sync_endl;
+        eval_file_loaded = eval_file;
+      }
+      else
+      {
+        sync_cout << "info string ERROR: failed to load eval file " << directory + eval_file << sync_endl;
+        eval_file_loaded.clear();
+      }
+    }
  }

+#undef stringify2
+#undef stringify
+}
+
+/// verify_eval_file_loaded() verifies that the net named by the EvalFile option was loaded successfully
+void verify_eval_file_loaded() {
+
+  std::string eval_file = std::string(Options["EvalFile"]);
+
+  if (useNNUE != UseNNUEMode::False && eval_file_loaded != eval_file)
+  {
+    UCI::OptionsMap defaults;
+    UCI::init(defaults);
+
+    std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
+    std::string msg2 = "The option is set to true, but the network file " + eval_file + " was not loaded successfully.";
+    std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
+    std::string msg4 = "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/" + std::string(defaults["EvalFile"]);
+    std::string msg5 = "The engine will be terminated now.";
+
+    sync_cout << "info string ERROR: " << msg1 << sync_endl;
+    sync_cout << "info string ERROR: " << msg2 << sync_endl;
+    sync_cout << "info string ERROR: " << msg3 << sync_endl;
+    sync_cout << "info string ERROR: " << msg4 << sync_endl;
+    sync_cout << "info string ERROR: " << msg5 << sync_endl;
+
+    std::exit(EXIT_FAILURE);
+  }
+
+  if (useNNUE != UseNNUEMode::False)
+    sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
+  else
+    sync_cout << "info string classical evaluation enabled" << sync_endl;
+}
+
+/// Checks only that some net is loaded; useful in training, where the eval file is overridden.
+void verify_any_net_loaded() {
+
+  if (!Options["SkipLoadingEval"] && useNNUE != UseNNUEMode::False && eval_file_loaded.empty())
+  {
+    UCI::OptionsMap defaults;
+    UCI::init(defaults);
+
+    std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
+    std::string msg2 = "The option is set to true, but the network file was not loaded successfully.";
+    std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
+    std::string msg5 = "The engine will be terminated now.";
+
+    sync_cout << "info string ERROR: " << msg1 << sync_endl;
+    sync_cout << "info string ERROR: " << msg2 << sync_endl;
+    sync_cout << "info string ERROR: " << msg3 << sync_endl;
+    sync_cout << "info string ERROR: " << msg5 << sync_endl;
+
+    std::exit(EXIT_FAILURE);
+  }
+
+  if (useNNUE != UseNNUEMode::False)
+    sync_cout << "info string NNUE evaluation using " << eval_file_loaded << " enabled" << sync_endl;
+  else
+    sync_cout << "info string classical evaluation enabled" << sync_endl;
+}
+
 } // namespace Eval::NNUE
@@ -23,10 +23,19 @@

 #include "nnue_feature_transformer.h"

+#include "misc.h"
+
 #include <memory>

 namespace Eval::NNUE {

+  enum struct UseNNUEMode
+  {
+    False,
+    True,
+    Pure
+  };
+
  // Hash value of evaluation function structure
  constexpr std::uint32_t kHashValue =
      FeatureTransformer::GetHashValue() ^ Network::GetHashValue();
@@ -40,11 +49,22 @@ namespace Eval::NNUE {
    }
  };

+  template <typename T>
+  struct LargePageDeleter {
+    void operator()(T* ptr) const {
+      ptr->~T();
+      aligned_large_pages_free(ptr);
+    }
+  };
+
  template <typename T>
  using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;

+  template <typename T>
+  using LargePagePtr = std::unique_ptr<T, LargePageDeleter<T>>;
+
  // Input feature converter
-  extern AlignedPtr<FeatureTransformer> feature_transformer;
+  extern LargePagePtr<FeatureTransformer> feature_transformer;

  // Evaluation function
  extern AlignedPtr<Network> network;
@@ -55,16 +75,22 @@ namespace Eval::NNUE {
  // Saved evaluation function file name
  extern std::string savedfileName;

+  extern UseNNUEMode useNNUE;
+
+  extern std::string eval_file_loaded;
+
  // Get a string that represents the structure of the evaluation function
-  std::string GetArchitectureString();
+  std::string get_architecture_string();
+
+  std::string get_layers_info();

  // read the header
-  bool ReadHeader(std::istream& stream,
-    std::uint32_t* hash_value, std::string* architecture);
+  bool read_header(std::istream& stream,
+      std::uint32_t* hash_value, std::string* architecture);

  // write the header
-  bool WriteHeader(std::ostream& stream,
-    std::uint32_t hash_value, const std::string& architecture);
+  bool write_header(std::ostream& stream,
+      std::uint32_t hash_value, const std::string& architecture);

  // read evaluation function parameters
  bool ReadParameters(std::istream& stream);
@@ -72,6 +98,13 @@ namespace Eval::NNUE {
  // write evaluation function parameters
  bool WriteParameters(std::ostream& stream);

+  Value evaluate(const Position& pos);
+  bool load_eval(std::string name, std::istream& stream);
+  void init();
+
+  void verify_eval_file_loaded();
+  void verify_any_net_loaded();
+
 }  // namespace Eval::NNUE

 #endif // #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
@@ -1,231 +1,342 @@
-// Code for learning NNUE evaluation function
-
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
-
-#include <random>
+#include <random>
 #include <fstream>
-
-#include "../learn/learn.h"
-#include "../learn/learning_tools.h"
-
-#include "../position.h"
-#include "../uci.h"
-#include "../misc.h"
-#include "../thread_win32_osx.h"
-
-#include "../eval/evaluate_common.h"
+#include <filesystem>

 #include "evaluate_nnue.h"
 #include "evaluate_nnue_learner.h"
-#include "trainer/features/factorizer_feature_set.h"
-#include "trainer/features/factorizer_half_kp.h"
+
+#include "trainer/features/all_factorizers.h"
+
 #include "trainer/trainer_feature_transformer.h"
 #include "trainer/trainer_input_slice.h"
 #include "trainer/trainer_affine_transform.h"
 #include "trainer/trainer_clipped_relu.h"
 #include "trainer/trainer_sum.h"

-namespace Eval {
+#include "position.h"
+#include "uci.h"
+#include "misc.h"
+#include "thread_win32_osx.h"
+#include "thread.h"

-namespace NNUE {
+// Code for learning NNUE evaluation function
+namespace Eval::NNUE {

-namespace {
+    namespace {

-// learning data
-std::vector<Example> examples;
+        // learning data
+        std::vector<Example> examples;

-// Mutex for exclusive control of examples
-std::mutex examples_mutex;
+        // Mutex for exclusive control of examples
+        std::mutex examples_mutex;

-// number of samples in mini-batch
-uint64_t batch_size;
+        // number of samples in mini-batch
+        uint64_t batch_size;

-// random number generator
-std::mt19937 rng;
+        // random number generator
+        std::mt19937 rng;

-// learner
-std::shared_ptr<Trainer<Network>> trainer;
+        // learner
+        std::shared_ptr<Trainer<Network>> trainer;

-// Learning rate scale
-double global_learning_rate_scale;
+        // Tell the learner options such as hyperparameters
+        void send_messages(std::vector<Message> messages) {
+            for (auto& message : messages) {
+                trainer->send_message(&message);
+                assert(message.num_receivers > 0);
+            }
+        }

-// Get the learning rate scale
-double GetGlobalLearningRateScale() {
-  return global_learning_rate_scale;
-}
+    }  // namespace

-// Tell the learner options such as hyperparameters
-void SendMessages(std::vector<Message> messages) {
-  for (auto& message : messages) {
-    trainer->SendMessage(&message);
-    assert(message.num_receivers > 0);
-  }
-}
+    // Initialize learning
+    void initialize_training(
+        const std::string& seed,
+        SynchronizedRegionLogger::Region& out) {

-}  // namespace
+#if defined (OPENBLAS_VERSION)
+        openblas_set_num_threads(1);
+#elif defined (INTEL_MKL_VERSION)
+        mkl_set_num_threads(1);
+#endif

-// Initialize learning
-void InitializeTraining(double eta1, uint64_t eta1_epoch,
-                        double eta2, uint64_t eta2_epoch, double eta3) {
-  std::cout << "Initializing NN training for "
-            << GetArchitectureString() << std::endl;
+        out << "INFO (initialize_training): Initializing NN training for "
+            << get_architecture_string() << std::endl;

-  assert(feature_transformer);
-  assert(network);
-  trainer = Trainer<Network>::Create(network.get(), feature_transformer.get());
+        out << std::endl;

-  if (Options["SkipLoadingEval"]) {
-    trainer->Initialize(rng);
-  }
+        out << "Layers:\n"
+            << get_layers_info() << std::endl;

-  global_learning_rate_scale = 1.0;
-  EvalLearningTools::Weight::init_eta(eta1, eta2, eta3, eta1_epoch, eta2_epoch);
-}
+        out << std::endl;

-// set the number of samples in the mini-batch
-void SetBatchSize(uint64_t size) {
-  assert(size > 0);
-  batch_size = size;
-}
+        out << "Factorizers:\n"
+            << Features::Factorizer<RawFeatures>::get_factorizers_string() << std::endl;

-// set the learning rate scale
-void SetGlobalLearningRateScale(double scale) {
-  global_learning_rate_scale = scale;
-}
+        out << std::endl;

-// Set options such as hyperparameters
-void SetOptions(const std::string& options) {
-  std::vector<Message> messages;
-  for (const auto& option : Split(options, ',')) {
-    const auto fields = Split(option, '=');
-    assert(fields.size() == 1 || fields.size() == 2);
-    if (fields.size() == 1) {
-      messages.emplace_back(fields[0]);
-    } else {
-      messages.emplace_back(fields[0], fields[1]);
-    }
-  }
-  SendMessages(std::move(messages));
-}
+        assert(feature_transformer);
+        assert(network);

-// Reread the evaluation function parameters for learning from the file
-void RestoreParameters(const std::string& dir_name) {
-  const std::string file_name = Path::Combine(dir_name, NNUE::savedfileName);
-  std::ifstream stream(file_name, std::ios::binary);
-  bool result = ReadParameters(stream);
-  assert(result);
+        trainer = Trainer<Network>::create(network.get(), feature_transformer.get());
+        rng.seed(PRNG(seed).rand<uint64_t>());

-  SendMessages({{"reset"}});
-}
-
-// Add 1 sample of learning data
-void AddExample(Position& pos, Color rootColor,
-                const Learner::PackedSfenValue& psv, double weight) {
-  Example example;
-  if (rootColor == pos.side_to_move()) {
-    example.sign = 1;
-  } else {
-    example.sign = -1;
-  }
-  example.psv = psv;
-  example.weight = weight;
-
-  Features::IndexList active_indices[2];
-  for (const auto trigger : kRefreshTriggers) {
-    RawFeatures::AppendActiveIndices(pos, trigger, active_indices);
-  }
-  if (pos.side_to_move() != WHITE) {
-    active_indices[0].swap(active_indices[1]);
-  }
-  for (const auto color : Colors) {
-    std::vector<TrainingFeature> training_features;
-    for (const auto base_index : active_indices[color]) {
-      static_assert(Features::Factorizer<RawFeatures>::GetDimensions() <
-                    (1 << TrainingFeature::kIndexBits), "");
-      Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
-          base_index, &training_features);
-    }
-    std::sort(training_features.begin(), training_features.end());
-
-    auto& unique_features = example.training_features[color];
-    for (const auto& feature : training_features) {
-      if (!unique_features.empty() &&
-          feature.GetIndex() == unique_features.back().GetIndex()) {
-        unique_features.back() += feature;
-      } else {
-        unique_features.push_back(feature);
-      }
-    }
-  }
-
-  std::lock_guard<std::mutex> lock(examples_mutex);
-  examples.push_back(std::move(example));
-}
-
-// update the evaluation function parameters
-void UpdateParameters(uint64_t epoch) {
-  assert(batch_size > 0);
-
-  EvalLearningTools::Weight::calc_eta(epoch);
-  const auto learning_rate = static_cast<LearnFloatType>(
-      get_eta() / batch_size);
-
-  std::lock_guard<std::mutex> lock(examples_mutex);
-  std::shuffle(examples.begin(), examples.end(), rng);
-  while (examples.size() >= batch_size) {
-    std::vector<Example> batch(examples.end() - batch_size, examples.end());
-    examples.resize(examples.size() - batch_size);
-
-    const auto network_output = trainer->Propagate(batch);
-
-    std::vector<LearnFloatType> gradients(batch.size());
-    for (std::size_t b = 0; b < batch.size(); ++b) {
-      const auto shallow = static_cast<Value>(Round<std::int32_t>(
-          batch[b].sign * network_output[b] * kPonanzaConstant));
-      const auto& psv = batch[b].psv;
-      const double gradient = batch[b].sign * Learner::calc_grad(shallow, psv);
-      gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
+        if (Options["SkipLoadingEval"]) {
+            out << "INFO (initialize_training): Performing random net initialization.\n";
+            trainer->initialize(rng);
+        }
    }

-    trainer->Backpropagate(gradients.data(), learning_rate);
-  }
-  SendMessages({{"quantize_parameters"}});
-}
+    // set the number of samples in the mini-batch
+    void set_batch_size(uint64_t size) {
+        assert(size > 0);
+        batch_size = size;
+    }

-// Check if there are any problems with learning
-void CheckHealth() {
-  SendMessages({{"check_health"}});
-}
+    // Set options such as hyperparameters
+    void set_options(const std::string& options) {
+        std::vector<Message> messages;
+        for (const auto& option : Algo::split(options, ',')) {
+          const auto fields = Algo::split(option, '=');
+          assert(fields.size() == 1 || fields.size() == 2);

-}  // namespace NNUE
+          if (fields.size() == 1) {
+              messages.emplace_back(fields[0]);
+          } else {
+              messages.emplace_back(fields[0], fields[1]);
+          }
+        }

-// save merit function parameters to a file
-void save_eval(std::string dir_name) {
-  auto eval_dir = Path::Combine(Options["EvalSaveDir"], dir_name);
-  std::cout << "save_eval() start. folder = " << eval_dir << std::endl;
+        send_messages(std::move(messages));
+    }

-  // mkdir() will fail if this folder already exists, but
-  // Apart from that. If not, I just want you to make it.
-  // Also, assume that the folders up to EvalSaveDir have been dug.
-  Dependency::mkdir(eval_dir);
+    // Reread the evaluation function parameters for learning from the file
+    void restore_parameters(const std::string& dir_name) {
+        const std::string file_name = Path::combine(dir_name, NNUE::savedfileName);
+        std::ifstream stream(file_name, std::ios::binary);
+#ifndef NDEBUG
+        bool result =
+#endif
+        ReadParameters(stream);
+#ifndef NDEBUG
+        assert(result);
+#endif

-  if (Options["SkipLoadingEval"] && NNUE::trainer) {
-    NNUE::SendMessages({{"clear_unobserved_feature_weights"}});
-  }
+        send_messages({{"reset"}});
+    }

-  const std::string file_name = Path::Combine(eval_dir, NNUE::savedfileName);
-  std::ofstream stream(file_name, std::ios::binary);
-  const bool result = NNUE::WriteParameters(stream);
-  assert(result);
+    void finalize_net() {
+        send_messages({{"clear_unobserved_feature_weights"}});
+    }

-  std::cout << "save_eval() finished. folder = " << eval_dir << std::endl;
-}
+    // Add 1 sample of learning data
+    void add_example(
+        Position& pos,
+        Color rootColor,
+        Value discrete_nn_eval,
+        const Learner::PackedSfenValue& psv,
+        double weight) {

-// get the current eta
-double get_eta() {
-  return NNUE::GetGlobalLearningRateScale() * EvalLearningTools::Weight::eta;
-}
+        Example example;
+        if (rootColor == pos.side_to_move()) {
+            example.sign = 1;
+        } else {
+            example.sign = -1;
+        }

-}  // namespace Eval
+        example.discrete_nn_eval = discrete_nn_eval;
+        example.psv = psv;
+        example.weight = weight;

-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+        Features::IndexList active_indices[2];
+        for (const auto trigger : kRefreshTriggers) {
+            RawFeatures::append_active_indices(pos, trigger, active_indices);
+        }
+
+        if (pos.side_to_move() != WHITE) {
+            active_indices[0].swap(active_indices[1]);
+        }
+
+        static thread_local std::vector<TrainingFeature> s_training_features;
+        auto& training_features = s_training_features;
+
+        for (const auto color : Colors) {
+            training_features.clear();
+
+            for (const auto base_index : active_indices[color]) {
+                static_assert(Features::Factorizer<RawFeatures>::get_dimensions() <
+                              (1 << TrainingFeature::kIndexBits), "");
+                Features::Factorizer<RawFeatures>::append_training_features(
+                    base_index, &training_features);
+            }
+
+            std::sort(training_features.begin(), training_features.end());
+
+            auto& unique_features = example.training_features[color];
+            unique_features.reserve(training_features.size());
+            for (const auto& feature : training_features) {
+                if (!unique_features.empty() &&
+                    feature.get_index() == unique_features.back().get_index()) {
+
+                    unique_features.back() += feature;
+                } else {
+                    unique_features.push_back(feature);
+                }
+            }
+        }
+
+        std::lock_guard<std::mutex> lock(examples_mutex);
+        examples.push_back(std::move(example));
+    }
+
+    // update the evaluation function parameters
+    Learner::Loss update_parameters(
+        ThreadPool& thread_pool,
+        uint64_t epoch,
+        bool verbose,
+        double learning_rate,
+        double max_grad,
+        Learner::CalcLossFunc calc_loss)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        assert(batch_size > 0);
+
+        learning_rate /= batch_size;
+
+        std::lock_guard<std::mutex> lock(examples_mutex);
+
+        double abs_eval_diff_sum = 0.0;
+        double abs_discrete_eval_sum = 0.0;
+        double gradient_norm = 0.0;
+
+        bool collect_stats = verbose;
+
+        Learner::Loss loss_sum{};
+
+        std::vector<double> abs_eval_diff_sum_local(thread_pool.size(), 0.0);
+        std::vector<double> abs_discrete_eval_sum_local(thread_pool.size(), 0.0);
+        std::vector<double> gradient_norm_local(thread_pool.size(), 0.0);
+        std::vector<Learner::Loss> loss_sum_local(thread_pool.size());
+
+        auto prev_batch_begin = examples.end();
+        while ((long)(prev_batch_begin - examples.begin()) >= (long)batch_size) {
+            auto batch_begin = prev_batch_begin - batch_size;
+            auto batch_end = prev_batch_begin;
+            auto size = batch_end - batch_begin;
+            const auto network_output = trainer->step_start(thread_pool, batch_begin, batch_end);
+            std::vector<LearnFloatType> gradients(size);
+
+            thread_pool.for_each_index_chunk_with_workers(
+                std::size_t(0), size,
+                [&](Thread& th, std::size_t offset, std::size_t count) {
+                    const auto thread_id = th.thread_idx();
+
+                    trainer->propagate(th, offset, count);
+
+                    for (std::size_t b = offset; b < offset + count; ++b) {
+                        const auto& e = *(batch_begin + b);
+                        const auto shallow = static_cast<Value>(round<std::int32_t>(
+                            e.sign * network_output[b] * kPonanzaConstant));
+                        const auto discrete = e.sign * e.discrete_nn_eval;
+                        const auto& psv = e.psv;
+                        auto loss = calc_loss(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
+                        loss.grad = std::clamp(
+                            loss.grad * e.sign * kPonanzaConstant * e.weight, -max_grad, max_grad);
+                        gradients[b] = static_cast<LearnFloatType>(loss.grad);
+                        loss_sum_local[thread_id] += loss;
+
+                        // The discrete eval will only be valid before first backpropagation,
+                        // that is only for the first batch.
+                        // Similarly, we only want gradients from one batch.
+                        if (collect_stats)
+                        {
+                            abs_eval_diff_sum_local[thread_id] += std::abs(discrete - shallow);
+                            abs_discrete_eval_sum_local[thread_id] += std::abs(discrete);
+                            gradient_norm_local[thread_id] += std::abs(loss.grad);
+                        }
+                    }
+
+                    trainer->backpropagate(th, gradients.data(), offset, count);
+                }
+            );
+
+            // We can asynchronously erase the examples that we used in the previous
+            // step. This can be done safely because we're no longer using these
+            // examples and erase won't invalidate iterators.
+            examples.erase(prev_batch_begin, examples.end());
+            prev_batch_begin = batch_begin;
+
+            thread_pool.wait_for_workers_finished();
+
+            trainer->step_end(thread_pool, learning_rate);
+
+            collect_stats = false;
+        }
+        examples.erase(prev_batch_begin, examples.end());
+
+        if (verbose)
+        {
+            abs_eval_diff_sum = std::accumulate(abs_eval_diff_sum_local.begin(), abs_eval_diff_sum_local.end(), 0.0);
+            abs_discrete_eval_sum = std::accumulate(abs_discrete_eval_sum_local.begin(), abs_discrete_eval_sum_local.end(), 0.0);
+            gradient_norm = std::accumulate(gradient_norm_local.begin(), gradient_norm_local.end(), 0.0);
+
+            const double avg_abs_eval_diff = abs_eval_diff_sum / batch_size;
+            const double avg_abs_discrete_eval = abs_discrete_eval_sum / batch_size;
+
+            auto out = sync_region_cout.new_region();
+
+            out << "INFO (update_parameters):"
+                << " epoch = " << epoch
+                << " , avg_abs(trainer_eval-nnue_eval) = " << avg_abs_eval_diff
+                << " , avg_abs(nnue_eval) = " << avg_abs_discrete_eval
+                << " , avg_relative_error = " << avg_abs_eval_diff / avg_abs_discrete_eval
+                << " , batch_size = " << batch_size
+                << " , grad_norm = " << gradient_norm
+                << std::endl;
+        } else {
+            // Display some progress but don't synchronize as
+            // we can't really decide when to release the output lock here
+            std::cout << '.';
+        }
+
+        send_messages({{"quantize_parameters"}});
+
+        for(auto& loss : loss_sum_local)
+        {
+            loss_sum += loss;
+        }
+
+        return loss_sum;
+    }
+
+    // Check if there are any problems with learning
+    void check_health() {
+        send_messages({{"check_health"}});
+    }
+
+    // Save the evaluation function parameters to a file
+    void save_eval(std::string dir_name) {
+        auto eval_dir = Path::combine(Options["EvalSaveDir"], dir_name);
+
+        auto out = sync_region_cout.new_region();
+
+        out << "INFO (save_eval): Saving current evaluation file in " << eval_dir << std::endl;
+
+        // Create the target directory, including any missing parent
+        // directories. create_directories() does not report an error when
+        // the directory already exists, so this can run on every save.
+        std::filesystem::create_directories(eval_dir);
+
+        const std::string file_name = Path::combine(eval_dir, NNUE::savedfileName);
+        std::ofstream stream(file_name, std::ios::binary);
+#ifndef NDEBUG
+        bool result =
+#endif
+        WriteParameters(stream);
+#ifndef NDEBUG
+        assert(result);
+#endif
+        out << "INFO (save_eval): Finished saving evaluation file in " << eval_dir << std::endl;
+    }
+}  // namespace Eval::NNUE
@@ -1,46 +1,52 @@
-// Interface used for learning NNUE evaluation function
-
-#ifndef _EVALUATE_NNUE_LEARNER_H_
+#ifndef _EVALUATE_NNUE_LEARNER_H_
 #define _EVALUATE_NNUE_LEARNER_H_

-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#include "learn/learn.h"

-#include "../learn/learn.h"
+#include "misc.h"

-namespace Eval {
+struct ThreadPool;

-namespace NNUE {
+// Interface used for learning NNUE evaluation function
+namespace Eval::NNUE {

-// Initialize learning
-void InitializeTraining(double eta1, uint64_t eta1_epoch,
-                        double eta2, uint64_t eta2_epoch, double eta3);
+    // Initialize learning
+    void initialize_training(
+        const std::string& seed,
+        SynchronizedRegionLogger::Region& out);

-// set the number of samples in the mini-batch
-void SetBatchSize(uint64_t size);
+    // set the number of samples in the mini-batch
+    void set_batch_size(uint64_t size);

-// set the learning rate scale
-void SetGlobalLearningRateScale(double scale);
+    // Set options such as hyperparameters
+    void set_options(const std::string& options);

-// Set options such as hyperparameters
-void SetOptions(const std::string& options);
+    // Reread the evaluation function parameters for learning from the file
+    void restore_parameters(const std::string& dir_name);

-// Reread the evaluation function parameters for learning from the file
-void RestoreParameters(const std::string& dir_name);
+    // Add 1 sample of learning data
+    void add_example(
+        Position& pos,
+        Color rootColor,
+        Value discrete_nn_eval,
+    	const Learner::PackedSfenValue& psv,
+        double weight);

-// Add 1 sample of learning data
-void AddExample(Position& pos, Color rootColor,
-                const Learner::PackedSfenValue& psv, double weight);
+    // update the evaluation function parameters
+    Learner::Loss update_parameters(
+        ThreadPool& thread_pool,
+        uint64_t epoch,
+        bool verbose,
+        double learning_rate,
+        double max_grad,
+        Learner::CalcLossFunc calc_loss);

-// update the evaluation function parameters
-void UpdateParameters(uint64_t epoch);
+    // Check if there are any problems with learning
+    void check_health();

-// Check if there are any problems with learning
-void CheckHealth();
+    void finalize_net();

-}  // namespace NNUE
-
-}  // namespace Eval
-
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+    void save_eval(std::string suffix);
+}  // namespace Eval::NNUE

 #endif
@@ -0,0 +1,54 @@
+#include "a.h"
+#include "index_list.h"
+
+// Definition of input feature A of NNUE evaluation function
+namespace Eval::NNUE::Features {
+
+    // Map square `s` into the frame of reference of `perspective`.
+    // XOR-ing the square index with 63 mirrors both file and rank (a 180-degree
+    // rotation, assuming the usual 0..63 numbering); nothing changes when
+    // bool(perspective) is false. Per the original note, "halfka" was designed with
+    // "flip" in mind, but rotation stays so the current master net works for gensfen.
+    inline Square orient(Color perspective, Square s) {
+        return bool(perspective) ? Square(int(s) ^ 63) : Square(int(s));
+    }
+
+    // Feature index = perspective-oriented square + kpp_board_index offset of the piece
+    inline IndexType A::make_index(Color perspective, Square s, Piece pc) {
+        const auto piece_offset = kpp_board_index[pc][perspective];
+        return IndexType(orient(perspective, s) + piece_offset);
+    }
+
+    // Append to `active` one feature index for every piece returned by pos.pieces()
+    void A::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
+        // pop_lsb() presumably extracts and clears one square per iteration - confirm
+        for (Bitboard occupied = pos.pieces(); occupied; ) {
+            const Square sq = pop_lsb(&occupied);
+            const Piece pc = pos.piece_on(sq);
+            active->push_back(make_index(perspective, sq, pc));
+        }
+    }
+
+    // Append the indices switched off (`removed`) and on (`added`) according to the state's dirtyPiece
+    void A::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
+
+        const auto& dirty = pos.state()->dirtyPiece;
+        for (int k = 0; k < dirty.dirty_num; ++k) {
+            const Piece moved = dirty.piece[k];
+            const auto origin = dirty.from[k];
+            const auto target = dirty.to[k];
+
+            // SQ_NONE presumably marks a piece with no origin (appeared) or no target (vanished)
+            if (origin != SQ_NONE) removed->push_back(make_index(perspective, origin, moved));
+            if (target != SQ_NONE) added->push_back(make_index(perspective, target, moved));
+        }
+    }
+
+}  // namespace Eval::NNUE::Features
@@ -0,0 +1,54 @@
+#ifndef _NNUE_FEATURES_A_H_
+#define _NNUE_FEATURES_A_H_
+
+#include "features_common.h"
+
+#include "evaluate.h"
+
+// Definition of input feature A of NNUE evaluation function
+// A is a union of P features and K features, so technically the
+// same effect can be achieved by including both P and K features
+// but it would result in slower index appending because
+// P would conditionally exclude K features and vice versa,
+// where A doesn't have any conditionals.
+namespace Eval::NNUE::Features {
+
+    // Feature A: PieceSquare of all pieces, kings included (union of the P and K features)
+    class A {
+    public:
+        // Feature name
+        static constexpr const char* kName = "A";
+
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue = 0x7A4C414Cu;
+
+        // Number of feature dimensions
+        static constexpr IndexType kDimensions = PS_END2;
+
+        // Maximum number of features that can be active (value 1) at the same time
+        static constexpr IndexType kMaxActiveDimensions = 32;
+
+        // Trigger for a full recalculation instead of a difference calculation (kNone: use the difference whenever possible)
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+
+        // Append the indices of all currently active features to `active`
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
+
+        // Append the indices of features that changed since the previous state to `removed` / `added`
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+
+    private:
+        // Index of the feature for piece `pc` on square `s`, seen from `perspective`
+        static IndexType make_index(Color perspective, Square s, Piece pc);
+    };
+
+}  // namespace Eval::NNUE::Features
+
+#endif // #ifndef _NNUE_FEATURES_A_H_
@@ -1,73 +1,65 @@
-//Definition of input feature quantity K of NNUE evaluation function
-
-#if defined(EVAL_NNUE)
-
 #include "castling_right.h"
 #include "index_list.h"

-namespace Eval {
+// Definition of the CastlingRight input feature of the NNUE evaluation function
+namespace Eval::NNUE::Features {

-  namespace NNUE {
+    // Append to `active` the index i (0..kDimensions-1) of every castling-right bit (1 << i) set from `perspective`'s point of view
+    void CastlingRight::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {

-    namespace Features {
-
-      // Get a list of indices with a value of 1 among the features
-      void CastlingRight::AppendActiveIndices(
-        const Position& pos, Color perspective, IndexList* active) {
        // do nothing if array size is small to avoid compiler warning
        if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;

        int castling_rights = pos.state()->castlingRights;
        int relative_castling_rights;
        if (perspective == WHITE) {
-          relative_castling_rights = castling_rights;
+            relative_castling_rights = castling_rights;
        }
        else {
-          // Invert the perspective.
-          relative_castling_rights = ((castling_rights & 3) << 2)
-            & ((castling_rights >> 2) & 3);
+            // Invert the perspective: swap bits 0-1 with bits 2-3 (presumably white's / black's rights - confirm).
+            relative_castling_rights = ((castling_rights & 3) << 2)
+                | ((castling_rights >> 2) & 3);  // OR: the two halves are disjoint, AND would always yield 0
        }

-        for (int i = 0; i <kDimensions; ++i) {
-          if (relative_castling_rights & (i << 1)) {
-            active->push_back(i);
-          }
+        for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
+            if (relative_castling_rights & (1 << i)) {
+                active->push_back(i);
+            }
        }
-      }
+    }

-      // Get a list of indices whose values have changed from the previous one in the feature quantity
-      void CastlingRight::AppendChangedIndices(
-        const Position& pos, Color perspective,
-        IndexList* removed, IndexList* added) {
+    // Append to `removed` every castling right set in the previous state but cleared in the current one; `added` is never written
+    void CastlingRight::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* /* added */) {

        int previous_castling_rights = pos.state()->previous->castlingRights;
        int current_castling_rights = pos.state()->castlingRights;
        int relative_previous_castling_rights;
        int relative_current_castling_rights;
        if (perspective == WHITE) {
-          relative_previous_castling_rights = previous_castling_rights;
-          relative_current_castling_rights = current_castling_rights;
+            relative_previous_castling_rights = previous_castling_rights;
+            relative_current_castling_rights = current_castling_rights;
        }
        else {
-          // Invert the perspective.
-          relative_previous_castling_rights = ((previous_castling_rights & 3) << 2)
-            & ((previous_castling_rights >> 2) & 3);
-          relative_current_castling_rights = ((current_castling_rights & 3) << 2)
-            & ((current_castling_rights >> 2) & 3);
+            // Invert the perspective: swap bits 0-1 with bits 2-3; OR joins the disjoint halves (AND would always yield 0).
+            relative_previous_castling_rights = ((previous_castling_rights & 3) << 2)
+                | ((previous_castling_rights >> 2) & 3);
+            relative_current_castling_rights = ((current_castling_rights & 3) << 2)
+                | ((current_castling_rights >> 2) & 3);
        }

-        for (int i = 0; i < kDimensions; ++i) {
-          if ((relative_previous_castling_rights & (i << 1)) &&
-            (relative_current_castling_rights & (i << 1)) == 0) {
-            removed->push_back(i);
-          }
+        for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
+            if ((relative_previous_castling_rights & (1 << i)) &&
+                (relative_current_castling_rights & (1 << i)) == 0) {
+                removed->push_back(i);
+            }
        }
-      }
+    }

-    }  // namespace Features
-
-  }  // namespace NNUE
-
-}  // namespace Eval
-
-#endif  // defined(EVAL_NNUE)
+}  // namespace Eval::NNUE::Features
@@ -1,48 +1,44 @@
-//Definition of input feature quantity K of NNUE evaluation function
-
 #ifndef _NNUE_FEATURES_CASTLING_RIGHT_H_
 #define _NNUE_FEATURES_CASTLING_RIGHT_H_

-#if defined(EVAL_NNUE)
-
-#include "../../evaluate.h"
 #include "features_common.h"

-namespace Eval {
+#include "evaluate.h"

-  namespace NNUE {
+// Definition of the CastlingRight input feature of the NNUE evaluation function
+namespace Eval::NNUE::Features {

-    namespace Features {
-
-      // Feature K: Ball position
-      class CastlingRight {
-      public:
+    class CastlingRight {
+    public:
        // feature quantity name
        static constexpr const char* kName = "CastlingRight";
+
        // Hash value embedded in the evaluation function file
        static constexpr std::uint32_t kHashValue = 0x913968AAu;
+
        // number of feature dimensions
        static constexpr IndexType kDimensions = 4;
+
        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
        static constexpr IndexType kMaxActiveDimensions = 4;
+
        // Timing of full calculation instead of difference calculation
        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;

        // Get a list of indices with a value of 1 among the features
-        static void AppendActiveIndices(const Position& pos, Color perspective,
-          IndexList* active);
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);

-        // Get a list of indices whose values ??have changed from the previous one in the feature quantity
-        static void AppendChangedIndices(const Position& pos, Color perspective,
-          IndexList* removed, IndexList* added);
-      };
+        // Get a list of indices whose values have changed from the previous one in the feature quantity
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+    };

-    }  // namespace Features
-
-  }  // namespace NNUE
-
-}  // namespace Eval
-
-#endif  // defined(EVAL_NNUE)
+}  // namespace Eval::NNUE::Features

 #endif
@@ -1,47 +1,49 @@
-//Definition of input feature quantity K of NNUE evaluation function
-
-#if defined(EVAL_NNUE)
-
 #include "enpassant.h"
 #include "index_list.h"

-namespace Eval {
+// Definition of the EnPassant input feature of the NNUE evaluation function
+namespace Eval::NNUE::Features {

-  namespace NNUE {
+    // Append to `active` the file of the current en-passant square, if there is one (perspective is not used)
+    void EnPassant::append_active_indices(
+        const Position& pos,
+        Color /* perspective */,
+        IndexList* active) {

-    namespace Features {
-
-      // Get a list of indices with a value of 1 among the features
-      void EnPassant::AppendActiveIndices(
-        const Position& pos, Color perspective, IndexList* active) {
        // do nothing if array size is small to avoid compiler warning
-        if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
+        if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions)
+            return;

        auto epSquare = pos.state()->epSquare;
-        if (epSquare == SQ_NONE) {
-          return;
-        }
-
-        if (perspective == BLACK) {
-          epSquare = rotate180(epSquare);
-        }
+        if (epSquare == SQ_NONE)
+            return;  // no en-passant square: the feature has no active index

        auto file = file_of(epSquare);
        active->push_back(file);
-      }
+    }

-      // Get a list of indices whose values ??have changed from the previous one in the feature quantity
-      void EnPassant::AppendChangedIndices(
-        const Position& pos, Color perspective,
-        IndexList* removed, IndexList* added) {
-        // Not implemented.
-        assert(false);
-      }
+    // Compare the en-passant square of the previous and current state: the old file goes to `removed`, the new one to `added`
+    void EnPassant::append_changed_indices(
+        const Position& pos,
+        Color /* perspective */,
+        IndexList* removed,
+        IndexList* added) {

-    }  // namespace Features
+        auto previous_epSquare = pos.state()->previous->epSquare;  // NOTE(review): dereferences `previous` unchecked - confirm callers guarantee it
+        auto epSquare = pos.state()->epSquare;

-  }  // namespace NNUE
+        if (previous_epSquare != SQ_NONE) {
+            if (epSquare != SQ_NONE && file_of(epSquare) == file_of(previous_epSquare))
+                return;  // same en-passant file as before: nothing to remove or add

-}  // namespace Eval
+            auto file = file_of(previous_epSquare);
+            removed->push_back(file);  // the feature index is the file of the en-passant square
+        }

-#endif  // defined(EVAL_NNUE)
+        if (epSquare != SQ_NONE) {
+            auto file = file_of(epSquare);
+            added->push_back(file);
+        }
+    }
+
+}  // namespace Eval::NNUE::Features
@@ -1,22 +1,15 @@
-//Definition of input feature quantity K of NNUE evaluation function
-
 #ifndef _NNUE_FEATURES_ENPASSANT_H_
 #define _NNUE_FEATURES_ENPASSANT_H_

-#if defined(EVAL_NNUE)
-
-#include "../../evaluate.h"
 #include "features_common.h"

-namespace Eval {
+#include "evaluate.h"

-  namespace NNUE {
+// Definition of the EnPassant input feature of the NNUE evaluation function
+namespace Eval::NNUE::Features {

-    namespace Features {
-
-      // Feature K: Ball position
-      class EnPassant {
-      public:
+    class EnPassant {
+    public:
        // feature quantity name
        static constexpr const char* kName = "EnPassant";
        // Hash value embedded in the evaluation function file
@@ -26,23 +19,22 @@ namespace Eval {
        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
        static constexpr IndexType kMaxActiveDimensions = 1;
        // Timing of full calculation instead of difference calculation
-        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kAnyPieceMoved;
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;

        // Get a list of indices with a value of 1 among the features
-        static void AppendActiveIndices(const Position& pos, Color perspective,
-          IndexList* active);
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);

-        // Get a list of indices whose values ??have changed from the previous one in the feature quantity
-        static void AppendChangedIndices(const Position& pos, Color perspective,
-          IndexList* removed, IndexList* added);
-      };
+        // Get a list of indices whose values have changed from the previous one in the feature quantity
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+    };

-    }  // namespace Features
-
-  }  // namespace NNUE
-
-}  // namespace Eval
-
-#endif  // defined(EVAL_NNUE)
+}  // namespace Eval::NNUE::Features

 #endif
@@ -26,222 +26,276 @@

 namespace Eval::NNUE::Features {

-  // Class template that represents a list of values
-  template <typename T, T... Values>
-  struct CompileTimeList;
+    // Class template that represents a list of values
+    template <typename T, T... Values>
+    struct CompileTimeList;

-  template <typename T, T First, T... Remaining>
-  struct CompileTimeList<T, First, Remaining...> {
-    static constexpr bool Contains(T value) {
-      return value == First || CompileTimeList<T, Remaining...>::Contains(value);
-    }
-    static constexpr std::array<T, sizeof...(Remaining) + 1>
-        kValues = {{First, Remaining...}};
-  };
-
-  template <typename T, T First, T... Remaining>
-  constexpr std::array<T, sizeof...(Remaining) + 1>
-    CompileTimeList<T, First, Remaining...>::kValues;
-  template <typename T>
-  struct CompileTimeList<T> {
-    static constexpr bool Contains(T /*value*/) {
-      return false;
-    }
-    static constexpr std::array<T, 0> kValues = { {} };
-  };
-
-  // Class template that adds to the beginning of the list
-  template <typename T, typename ListType, T Value>
-  struct AppendToList;
-  template <typename T, T... Values, T AnotherValue>
-  struct AppendToList<T, CompileTimeList<T, Values...>, AnotherValue> {
-    using Result = CompileTimeList<T, AnotherValue, Values...>;
-  };
-
-  // Class template for adding to a sorted, unique list
-  template <typename T, typename ListType, T Value>
-  struct InsertToSet;
-  template <typename T, T First, T... Remaining, T AnotherValue>
-  struct InsertToSet<T, CompileTimeList<T, First, Remaining...>, AnotherValue> {
-    using Result = std::conditional_t<
-      CompileTimeList<T, First, Remaining...>::Contains(AnotherValue),
-      CompileTimeList<T, First, Remaining...>,
-      std::conditional_t<(AnotherValue < First),
-      CompileTimeList<T, AnotherValue, First, Remaining...>,
-      typename AppendToList<T, typename InsertToSet<
-      T, CompileTimeList<T, Remaining...>, AnotherValue>::Result,
-      First>::Result>>;
-  };
-  template <typename T, T Value>
-  struct InsertToSet<T, CompileTimeList<T>, Value> {
-    using Result = CompileTimeList<T, Value>;
-  };
-
-  // Base class of feature set
-  template <typename Derived>
-  class FeatureSetBase {
-
-   public:
-    // Get a list of indices for active features
-    template <typename IndexListType>
-    static void AppendActiveIndices(
-        const Position& pos, TriggerEvent trigger, IndexListType active[2]) {
-
-      for (Color perspective : { WHITE, BLACK }) {
-        Derived::CollectActiveIndices(
-            pos, trigger, perspective, &active[perspective]);
-      }
-    }
-
-    // Get a list of indices for recently changed features
-    template <typename PositionType, typename IndexListType>
-    static void AppendChangedIndices(
-        const PositionType& pos, TriggerEvent trigger,
-        IndexListType removed[2], IndexListType added[2], bool reset[2]) {
-
-      const auto& dp = pos.state()->dirtyPiece;
-      if (dp.dirty_num == 0) return;
-
-      for (Color perspective : { WHITE, BLACK }) {
-        reset[perspective] = false;
-        switch (trigger) {
-          case TriggerEvent::kFriendKingMoved:
-            reset[perspective] = dp.piece[0] == make_piece(perspective, KING);
-            break;
-          default:
-            assert(false);
-            break;
+    template <typename T, T First, T... Remaining>
+    struct CompileTimeList<T, First, Remaining...> {
+        static constexpr bool contains(T value) {
+            return value == First || CompileTimeList<T, Remaining...>::contains(value);
        }
-        if (reset[perspective]) {
-          Derived::CollectActiveIndices(
-              pos, trigger, perspective, &added[perspective]);
-        } else {
-          Derived::CollectChangedIndices(
-              pos, trigger, perspective,
-              &removed[perspective], &added[perspective]);
+
+        static constexpr std::array<T, sizeof...(Remaining) + 1>
+            kValues = {{First, Remaining...}};
+    };
+
+    template <typename T, T First, T... Remaining>
+    constexpr std::array<T, sizeof...(Remaining) + 1>
+        CompileTimeList<T, First, Remaining...>::kValues;
+
+    template <typename T>
+    struct CompileTimeList<T> {
+        static constexpr bool contains(T /*value*/) {
+            return false;
        }
-      }
-    }
-  };
+        static constexpr std::array<T, 0> kValues = { {} };
+    };

-  // Class template that represents the feature set
-  // do internal processing in reverse order of template arguments in order to linearize the amount of calculation at runtime
-  template <typename FirstFeatureType, typename... RemainingFeatureTypes>
-  class FeatureSet<FirstFeatureType, RemainingFeatureTypes...> :
-    public FeatureSetBase<
-    FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
-  private:
-    using Head = FirstFeatureType;
-    using Tail = FeatureSet<RemainingFeatureTypes...>;
+    // Class template that adds to the beginning of the list
+    template <typename T, typename ListType, T Value>
+    struct AppendToList;

-  public:
-    // Hash value embedded in the evaluation function file
-    static constexpr std::uint32_t kHashValue =
-      Head::kHashValue ^ (Tail::kHashValue << 1) ^ (Tail::kHashValue >> 31);
-    // number of feature dimensions
-    static constexpr IndexType kDimensions =
-      Head::kDimensions + Tail::kDimensions;
-    // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-    static constexpr IndexType kMaxActiveDimensions =
-      Head::kMaxActiveDimensions + Tail::kMaxActiveDimensions;
-    // List of timings to perform all calculations instead of difference calculation
-    using SortedTriggerSet = typename InsertToSet<TriggerEvent,
-      typename Tail::SortedTriggerSet, Head::kRefreshTrigger>::Result;
-    static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+    template <typename T, T... Values, T AnotherValue>
+    struct AppendToList<T, CompileTimeList<T, Values...>, AnotherValue> {
+        using Result = CompileTimeList<T, AnotherValue, Values...>;
+    };

-    // Get the feature quantity name
-    static std::string GetName() {
-      return std::string(Head::kName) + "+" + Tail::GetName();
-    }
+    // Class template for adding to a sorted, unique list
+    template <typename T, typename ListType, T Value>
+    struct InsertToSet;

-  private:
-    // Get a list of indices with a value of 1 among the features
-    template <typename IndexListType>
-    static void CollectActiveIndices(
-      const Position& pos, const TriggerEvent trigger, const Color perspective,
-      IndexListType* const active) {
-      Tail::CollectActiveIndices(pos, trigger, perspective, active);
-      if (Head::kRefreshTrigger == trigger) {
-        const auto start = active->size();
-        Head::AppendActiveIndices(pos, perspective, active);
-        for (auto i = start; i < active->size(); ++i) {
-          (*active)[i] += Tail::kDimensions;
+    template <typename T, T First, T... Remaining, T AnotherValue>
+    struct InsertToSet<T, CompileTimeList<T, First, Remaining...>, AnotherValue> {
+        using Result =
+            std::conditional_t<
+                CompileTimeList<T, First, Remaining...>::contains(AnotherValue),
+                CompileTimeList<T, First, Remaining...>,
+                std::conditional_t<
+                    (AnotherValue < First),
+                    CompileTimeList<T, AnotherValue, First, Remaining...>,
+                    typename AppendToList<T, typename InsertToSet<
+                        T, CompileTimeList<T, Remaining...>, AnotherValue>::Result,
+                        First
+                    >::Result
+                >
+            >;
+    };
+
+    template <typename T, T Value>
+    struct InsertToSet<T, CompileTimeList<T>, Value> {
+        using Result = CompileTimeList<T, Value>;
+    };
+
+    // Base class of feature set
+    template <typename Derived>
+    class FeatureSetBase {
+
+       public:
+        // Get a list of indices for active features
+        template <typename IndexListType>
+        static void append_active_indices(
+            const Position& pos, TriggerEvent trigger, IndexListType active[2]) {
+
+            for (Color perspective : { WHITE, BLACK }) {
+                Derived::collect_active_indices(
+                    pos, trigger, perspective, &active[perspective]);
+            }
        }
-      }
-    }

-    // Get a list of indices whose values have changed from the previous one in the feature quantity
-    template <typename IndexListType>
-    static void CollectChangedIndices(
-      const Position& pos, const TriggerEvent trigger, const Color perspective,
-      IndexListType* const removed, IndexListType* const added) {
-      Tail::CollectChangedIndices(pos, trigger, perspective, removed, added);
-      if (Head::kRefreshTrigger == trigger) {
-        const auto start_removed = removed->size();
-        const auto start_added = added->size();
-        Head::AppendChangedIndices(pos, perspective, removed, added);
-        for (auto i = start_removed; i < removed->size(); ++i) {
-          (*removed)[i] += Tail::kDimensions;
+        // For each perspective: set reset[p] if `trigger` demands a full refresh, then collect either all active or only the changed indices
+        template <typename PositionType, typename IndexListType>
+        static void append_changed_indices(
+            const PositionType& pos,
+            TriggerEvent trigger,
+            IndexListType removed[2],
+            IndexListType added[2],
+            bool reset[2]) {
+
+            const auto& dp = pos.state()->dirtyPiece;
+            for (Color perspective : { WHITE, BLACK }) {
+                reset[perspective] = false;  // always write the out-parameter: kNone and the dirty_num == 0 paths never assign it
+                switch (trigger) {
+                    case TriggerEvent::kNone:
+                        break;
+                    case TriggerEvent::kFriendKingMoved:
+                        if (dp.dirty_num == 0) continue;  // no dirty piece: nothing to collect for this trigger
+                        reset[perspective] = dp.piece[0] == make_piece(perspective, KING);
+                        break;
+                    case TriggerEvent::kEnemyKingMoved:
+                        if (dp.dirty_num == 0) continue;
+                        reset[perspective] = dp.piece[0] == make_piece(~perspective, KING);
+                        break;
+                    case TriggerEvent::kAnyKingMoved:
+                        if (dp.dirty_num == 0) continue;
+                        reset[perspective] = type_of(dp.piece[0]) == KING;
+                        break;
+                    case TriggerEvent::kAnyPieceMoved:
+                        reset[perspective] = true;
+                        break;
+                    default:
+                        assert(false);
+                        break;
+                }
+                // On reset every active index is appended to `added`; otherwise only the differences are collected
+                if (reset[perspective]) {
+                    Derived::collect_active_indices(
+                        pos, trigger, perspective, &added[perspective]);
+                } else {
+                    Derived::collect_changed_indices(
+                        pos, trigger, perspective,
+                        &removed[perspective], &added[perspective]);
+                }
+            }
        }
-        for (auto i = start_added; i < added->size(); ++i) {
-          (*added)[i] += Tail::kDimensions;
+    };
+
+    // Class template that represents the feature set
+    // do internal processing in reverse order of template arguments in order to linearize the amount of calculation at runtime
+    template <typename FirstFeatureType, typename... RemainingFeatureTypes>
+    class FeatureSet<FirstFeatureType, RemainingFeatureTypes...> :
+      public FeatureSetBase<
+          FeatureSet<FirstFeatureType, RemainingFeatureTypes...>
+      > {
+
+    private:
+        using Head = FirstFeatureType;
+        using Tail = FeatureSet<RemainingFeatureTypes...>;
+
+    public:
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue =
+            Head::kHashValue ^ (Tail::kHashValue << 1) ^ (Tail::kHashValue >> 31);
+
+        // number of feature dimensions
+        static constexpr IndexType kDimensions =
+            Head::kDimensions + Tail::kDimensions;
+
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions =
+            Head::kMaxActiveDimensions + Tail::kMaxActiveDimensions;
+
+        // List of timings to perform all calculations instead of difference calculation
+        using SortedTriggerSet = typename InsertToSet<TriggerEvent,
+            typename Tail::SortedTriggerSet, Head::kRefreshTrigger>::Result;
+
+        static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+
+        // Get the feature quantity name
+        static std::string get_name() {
+            return std::string(Head::kName) + "+" + Tail::get_name();
        }
-      }
-    }

-    // Make the base class and the class template that recursively uses itself a friend
-    friend class FeatureSetBase<FeatureSet>;
-    template <typename... FeatureTypes>
-    friend class FeatureSet;
-  };
+    private:
+        // Get a list of indices with a value of 1 among the features
+        template <typename IndexListType>
+        static void collect_active_indices(
+            const Position& pos,
+            const TriggerEvent trigger,
+            const Color perspective,
+            IndexListType* const active) {

-  // Class template that represents the feature set
-  template <typename FeatureType>
-  class FeatureSet<FeatureType> : public FeatureSetBase<FeatureSet<FeatureType>> {
+            Tail::collect_active_indices(pos, trigger, perspective, active);
+            if (Head::kRefreshTrigger == trigger) {
+                const auto start = active->size();
+                Head::append_active_indices(pos, perspective, active);

-   public:
-    // Hash value embedded in the evaluation file
-    static constexpr std::uint32_t kHashValue = FeatureType::kHashValue;
-    // Number of feature dimensions
-    static constexpr IndexType kDimensions = FeatureType::kDimensions;
-    // Maximum number of simultaneously active features
-    static constexpr IndexType kMaxActiveDimensions =
-        FeatureType::kMaxActiveDimensions;
-    // Trigger for full calculation instead of difference calculation
-    using SortedTriggerSet =
-        CompileTimeList<TriggerEvent, FeatureType::kRefreshTrigger>;
-    static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+                for (auto i = start; i < active->size(); ++i) {
+                    (*active)[i] += Tail::kDimensions;
+                }
+            }
+        }

-    // Get the feature quantity name
-    static std::string GetName() {
-      return FeatureType::kName;
-    }
+        // Get a list of indices whose values have changed from the previous one in the feature quantity
+        template <typename IndexListType>
+        static void collect_changed_indices(
+            const Position& pos,
+            const TriggerEvent trigger,
+            const Color perspective,
+            IndexListType* const removed,
+            IndexListType* const added) {

-   private:
-    // Get a list of indices for active features
-    static void CollectActiveIndices(
-        const Position& pos, const TriggerEvent trigger, const Color perspective,
-        IndexList* const active) {
-      if (FeatureType::kRefreshTrigger == trigger) {
-        FeatureType::AppendActiveIndices(pos, perspective, active);
-      }
-    }
+            Tail::collect_changed_indices(pos, trigger, perspective, removed, added);
+            if (Head::kRefreshTrigger == trigger) {
+                const auto start_removed = removed->size();
+                const auto start_added = added->size();
+                Head::append_changed_indices(pos, perspective, removed, added);

-    // Get a list of indices for recently changed features
-    static void CollectChangedIndices(
-        const Position& pos, const TriggerEvent trigger, const Color perspective,
-        IndexList* const removed, IndexList* const added) {
+                for (auto i = start_removed; i < removed->size(); ++i) {
+                    (*removed)[i] += Tail::kDimensions;
+                }

-      if (FeatureType::kRefreshTrigger == trigger) {
-        FeatureType::AppendChangedIndices(pos, perspective, removed, added);
-      }
-    }
+                for (auto i = start_added; i < added->size(); ++i) {
+                    (*added)[i] += Tail::kDimensions;
+                }
+            }
+        }

-    // Make the base class and the class template that recursively uses itself a friend
-    friend class FeatureSetBase<FeatureSet>;
-    template <typename... FeatureTypes>
-    friend class FeatureSet;
-  };
+        // Make the base class and the class template that recursively uses itself a friend
+        friend class FeatureSetBase<FeatureSet>;
+
+        template <typename... FeatureTypes>
+        friend class FeatureSet;
+    };
+
+    // Class template that represents the feature set
+    template <typename FeatureType>
+    class FeatureSet<FeatureType> : public FeatureSetBase<FeatureSet<FeatureType>> {
+
+    public:
+        // Hash value embedded in the evaluation file
+        static constexpr std::uint32_t kHashValue = FeatureType::kHashValue;
+
+        // Number of feature dimensions
+        static constexpr IndexType kDimensions = FeatureType::kDimensions;
+
+        // Maximum number of simultaneously active features
+        static constexpr IndexType kMaxActiveDimensions =
+            FeatureType::kMaxActiveDimensions;
+
+        // Trigger for full calculation instead of difference calculation
+        using SortedTriggerSet =
+            CompileTimeList<TriggerEvent, FeatureType::kRefreshTrigger>;
+
+        static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+
+        // Get the feature quantity name
+        static std::string get_name() {
+            return FeatureType::kName;
+        }
+
+    private:
+        // Get a list of indices for active features
+        static void collect_active_indices(
+            const Position& pos,
+            const TriggerEvent trigger,
+            const Color perspective,
+            IndexList* const active) {
+
+            if (FeatureType::kRefreshTrigger == trigger) {
+              FeatureType::append_active_indices(pos, perspective, active);
+            }
+        }
+
+        // Get a list of indices for recently changed features
+        static void collect_changed_indices(
+            const Position& pos,
+            const TriggerEvent trigger,
+            const Color perspective,
+            IndexList* const removed,
+            IndexList* const added) {
+
+            if (FeatureType::kRefreshTrigger == trigger) {
+              FeatureType::append_changed_indices(pos, perspective, removed, added);
+            }
+        }
+
+        // Make the base class and the class template that recursively uses itself a friend
+        friend class FeatureSetBase<FeatureSet>;
+
+        template <typename... FeatureTypes>
+        friend class FeatureSet;
+    };

 }  // namespace Eval::NNUE::Features

@@ -34,10 +34,10 @@ namespace Eval::NNUE::Features {
  // Trigger to perform full calculations instead of difference only
  enum class TriggerEvent {
    kNone, // Calculate the difference whenever possible
-    kFriendKingMoved, // calculate all when own ball moves
-    kEnemyKingMoved, // do all calculations when enemy balls move
-    kAnyKingMoved, // do all calculations if either ball moves
-    kAnyPieceMoved, // always do all calculations
+    kFriendKingMoved, // calculate full evaluation when own king moves
+    kEnemyKingMoved, // calculate full evaluation when opponent king moves
+    kAnyKingMoved, // calculate full evaluation when any king moves
+    kAnyPieceMoved, // always calculate full evaluation
  };

  enum class Side {
@@ -0,0 +1,93 @@
+/*
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+//Definition of input features HalfKA of NNUE evaluation function
+
+#include "half_ka.h"
+#include "index_list.h"
+
+namespace Eval::NNUE::Features {
+
+    // Orient a square according to perspective (rotate the board 180° for black):
+    // XOR with 63 when bool(perspective) is true, otherwise leave it unchanged.
+    // Note for "halfka": this arch was designed with "flip" in mind, although it is
+    // still untested which approach is better. Rotation stays until an arch works with
+    // "flip"; it allows using the current master net for gensfen (higher quality data).
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * 63));
+    }
+
+    // Sum of the oriented square, kpp_board_index[pc][perspective] and PS_END2 * ksq
+    template <Side AssociatedKing>
+    inline IndexType HalfKA<AssociatedKing>::make_index(
+        Color perspective,
+        Square s,
+        Piece pc,
+        Square ksq) {
+        const auto king_offset = PS_END2 * ksq;
+        return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective] + king_offset);
+    }
+
+    // Append one index per square in pos.pieces() (no king mask is applied here),
+    // each combined with the oriented square of the associated king
+    template <Side AssociatedKing>
+    void HalfKA<AssociatedKing>::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
+
+        // Friend: king of `perspective`; Enemy: king of ~perspective
+        const Color king_color =
+            (AssociatedKing == Side::kFriend) ? perspective : ~perspective;
+        const Square king_sq = orient(perspective, pos.square<KING>(king_color));
+
+        for (Bitboard occupied = pos.pieces(); occupied; ) {
+            const Square piece_sq = pop_lsb(&occupied);
+            active->push_back(make_index(perspective, piece_sq, pos.piece_on(piece_sq), king_sq));
+        }
+    }
+
+    // Append the indices turned off/on by the entries of the state's dirtyPiece record
+    template <Side AssociatedKing>
+    void HalfKA<AssociatedKing>::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
+
+        // Friend: king of `perspective`; Enemy: king of ~perspective
+        const Color king_color =
+            (AssociatedKing == Side::kFriend) ? perspective : ~perspective;
+        const Square king_sq = orient(perspective, pos.square<KING>(king_color));
+
+        const auto& dirty = pos.state()->dirtyPiece;
+        for (int k = 0; k < dirty.dirty_num; ++k) {
+            const Piece moved = dirty.piece[k];
+            const auto from = dirty.from[k];
+            const auto to = dirty.to[k];
+            if (from != SQ_NONE)
+                removed->push_back(make_index(perspective, from, moved, king_sq));
+            if (to != SQ_NONE)
+                added->push_back(make_index(perspective, to, moved, king_sq));
+        }
+    }
+
+    template class HalfKA<Side::kFriend>;
+    template class HalfKA<Side::kEnemy>;
+
+}  // namespace Eval::NNUE::Features
@@ -0,0 +1,75 @@
+/*
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef NNUE_FEATURES_HALF_KA_H_INCLUDED
+#define NNUE_FEATURES_HALF_KA_H_INCLUDED
+
+#include "features_common.h"
+
+#include "evaluate.h"
+
+//Definition of input features HalfKA of NNUE evaluation function
+namespace Eval::NNUE::Features {
+
+    // Feature HalfKA: Combination of the position of the associated king (own or
+    // enemy, selected by AssociatedKing) and the position of every piece, kings included
+    template <Side AssociatedKing>
+    class HalfKA {
+
+    public:
+        // Feature name ("HalfKA(Friend)" or "HalfKA(Enemy)" depending on AssociatedKing)
+        static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
+            "HalfKA(Friend)" : "HalfKA(Enemy)";
+
+        // Hash value embedded in the evaluation file (XORed with 1 for the Friend variant)
+        static constexpr std::uint32_t kHashValue =
+            0x5F134CB9u ^ (AssociatedKing == Side::kFriend);
+
+        // Number of feature dimensions: SQUARE_NB king squares times PS_END2
+        static constexpr IndexType kDimensions =
+            static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_END2);
+
+        // Maximum number of simultaneously active features
+        static constexpr IndexType kMaxActiveDimensions = 32; // Kings count here
+
+        // Trigger for full calculation instead of difference calculation:
+        static constexpr TriggerEvent kRefreshTrigger =
+            (AssociatedKing == Side::kFriend) ?
+            TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+
+        // Append the indices of the active features for `perspective` to `active`
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
+
+        // Append the indices of recently changed features to `removed` and `added`
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+
+    private:
+        // Index of a feature for a given king position and a piece on some square
+        static IndexType make_index(Color perspective, Square s, Piece pc, Square sq_k);
+    };
+
+}  // namespace Eval::NNUE::Features
+
+#endif // #ifndef NNUE_FEATURES_HALF_KA_H_INCLUDED
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)

-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.

-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.

-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

 //Definition of input features HalfKP of NNUE evaluation function
@@ -23,50 +23,72 @@

 namespace Eval::NNUE::Features {

-  // Orient a square according to perspective (rotates by 180 for black)
-  inline Square orient(Color perspective, Square s) {
-    return Square(int(s) ^ (bool(perspective) * 63));
-  }
-
-  // Find the index of the feature quantity from the king position and PieceSquare
-  template <Side AssociatedKing>
-  inline IndexType HalfKP<AssociatedKing>::MakeIndex(
-      Color perspective, Square s, Piece pc, Square ksq) {
-
-    return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective] + PS_END * ksq);
-  }
-
-  // Get a list of indices for active features
-  template <Side AssociatedKing>
-  void HalfKP<AssociatedKing>::AppendActiveIndices(
-      const Position& pos, Color perspective, IndexList* active) {
-
-    Square ksq = orient(perspective, pos.square<KING>(perspective));
-    Bitboard bb = pos.pieces() & ~pos.pieces(KING);
-    while (bb) {
-      Square s = pop_lsb(&bb);
-      active->push_back(MakeIndex(perspective, s, pos.piece_on(s), ksq));
+    // Orient a square according to perspective (rotate the board 180° for black)
+    // this has to stay until we find a better arch that works with "flip".
+    // allows us to use current master net for gensfen (primarily needed for higher quality data)
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * 63));
    }
-  }

-  // Get a list of indices for recently changed features
-  template <Side AssociatedKing>
-  void HalfKP<AssociatedKing>::AppendChangedIndices(
-      const Position& pos, Color perspective,
-      IndexList* removed, IndexList* added) {
+    // Find the index of the feature quantity from the king position and PieceSquare
+    template <Side AssociatedKing>
+    inline IndexType HalfKP<AssociatedKing>::make_index(
+        Color perspective,
+        Square s,
+        Piece pc,
+        Square ksq) {

-    Square ksq = orient(perspective, pos.square<KING>(perspective));
-    const auto& dp = pos.state()->dirtyPiece;
-    for (int i = 0; i < dp.dirty_num; ++i) {
-      Piece pc = dp.piece[i];
-      if (type_of(pc) == KING) continue;
-      if (dp.from[i] != SQ_NONE)
-        removed->push_back(MakeIndex(perspective, dp.from[i], pc, ksq));
-      if (dp.to[i] != SQ_NONE)
-        added->push_back(MakeIndex(perspective, dp.to[i], pc, ksq));
+        return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective] + PS_END * ksq);
    }
-  }

-  template class HalfKP<Side::kFriend>;
+    // Append one index per square in pos.pieces() with the king squares masked
+    // out, each combined with the oriented square of the associated king
+    template <Side AssociatedKing>
+    void HalfKP<AssociatedKing>::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
+
+        // Friend: king of `perspective`; Enemy: king of ~perspective
+        const Color king_color =
+            (AssociatedKing == Side::kFriend) ? perspective : ~perspective;
+        const Square king_sq = orient(perspective, pos.square<KING>(king_color));
+
+        for (Bitboard non_kings = pos.pieces() & ~pos.pieces(KING); non_kings; ) {
+            const Square piece_sq = pop_lsb(&non_kings);
+            active->push_back(make_index(perspective, piece_sq, pos.piece_on(piece_sq), king_sq));
+        }
+    }
+
+    // Append the indices turned off/on by the non-king entries of the dirtyPiece record
+    template <Side AssociatedKing>
+    void HalfKP<AssociatedKing>::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
+
+        // Friend: king of `perspective`; Enemy: king of ~perspective
+        const Color king_color =
+            (AssociatedKing == Side::kFriend) ? perspective : ~perspective;
+        const Square king_sq = orient(perspective, pos.square<KING>(king_color));
+
+        const auto& dirty = pos.state()->dirtyPiece;
+        for (int k = 0; k < dirty.dirty_num; ++k) {
+            const Piece moved = dirty.piece[k];
+            if (type_of(moved) == KING)  // king entries are skipped
+                continue;
+
+            const auto from = dirty.from[k];
+            const auto to = dirty.to[k];
+            if (from != SQ_NONE)
+                removed->push_back(make_index(perspective, from, moved, king_sq));
+            if (to != SQ_NONE)
+                added->push_back(make_index(perspective, to, moved, king_sq));
+        }
+    }
+
+    template class HalfKP<Side::kFriend>;
+    template class HalfKP<Side::kEnemy>;

 }  // namespace Eval::NNUE::Features
@@ -1,62 +1,74 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)

-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.

-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.

-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

-//Definition of input features HalfKP of NNUE evaluation function
-
 #ifndef NNUE_FEATURES_HALF_KP_H_INCLUDED
 #define NNUE_FEATURES_HALF_KP_H_INCLUDED

-#include "../../evaluate.h"
 #include "features_common.h"

+#include "evaluate.h"
+
+//Definition of input features HalfKP of NNUE evaluation function
 namespace Eval::NNUE::Features {

-  // Feature HalfKP: Combination of the position of own king
-  // and the position of pieces other than kings
-  template <Side AssociatedKing>
-  class HalfKP {
+    // Feature HalfKP: Combination of the position of the associated king
+    // (own or enemy, per AssociatedKing) and the position of pieces other than kings
+    template <Side AssociatedKing>
+    class HalfKP {

-   public:
-    // Feature name
-    static constexpr const char* kName = "HalfKP(Friend)";
-    // Hash value embedded in the evaluation file
-    static constexpr std::uint32_t kHashValue =
-        0x5D69D5B9u ^ (AssociatedKing == Side::kFriend);
-    // Number of feature dimensions
-    static constexpr IndexType kDimensions =
-        static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_END);
-    // Maximum number of simultaneously active features
-    static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
-    // Trigger for full calculation instead of difference calculation
-    static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kFriendKingMoved;
+    public:
+        // Feature name
+        static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
+            "HalfKP(Friend)" : "HalfKP(Enemy)";

-    // Get a list of indices for active features
-    static void AppendActiveIndices(const Position& pos, Color perspective,
-                                    IndexList* active);
+        // Hash value embedded in the evaluation file
+        static constexpr std::uint32_t kHashValue =
+            0x5D69D5B9u ^ (AssociatedKing == Side::kFriend);

-    // Get a list of indices for recently changed features
-    static void AppendChangedIndices(const Position& pos, Color perspective,
-                                     IndexList* removed, IndexList* added);
+        // Number of feature dimensions
+        static constexpr IndexType kDimensions =
+            static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_END);

-   private:
-    // Index of a feature for a given king position and another piece on some square
-    static IndexType MakeIndex(Color perspective, Square s, Piece pc, Square sq_k);
-  };
+        // Maximum number of simultaneously active features
+        static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
+
+        // Trigger for full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger =
+            (AssociatedKing == Side::kFriend) ?
+            TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+
+        // Get a list of indices for active features
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
+
+        // Get a list of indices for recently changed features
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+
+    private:
+        // Index of a feature for a given king position and another piece on some square
+        static IndexType make_index(Color perspective, Square s, Piece pc, Square sq_k);
+    };

 }  // namespace Eval::NNUE::Features

@@ -0,0 +1,90 @@
+#include "half_relative_ka.h"
+#include "index_list.h"
+
+//Definition of input features HalfRelativeKA of NNUE evaluation function
+namespace Eval::NNUE::Features {
+
+    // Orient a square according to perspective (rotate the board 180° for black):
+    // XOR with 63 when bool(perspective) is true, otherwise leave it unchanged.
+    // Note for "halfka": this arch was designed with "flip" in mind, although it is
+    // still untested which approach is better. Rotation stays until an arch works with
+    // "flip"; it allows using the current master net for gensfen (higher quality data).
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * 63));
+    }
+
+    // Build the PieceSquare value for (s, pc) as seen from `perspective`, then
+    // delegate to the (king square, PieceSquare) overload
+    template <Side AssociatedKing>
+    inline IndexType HalfRelativeKA<AssociatedKing>::make_index(
+        Color perspective,
+        Square s,
+        Piece pc,
+        Square sq_k) {
+        return make_index(
+            sq_k, IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]));
+    }
+
+    // Index from the king square and a PieceSquare value: piece kind, then file/rank relative to the king
+    template <Side AssociatedKing>
+    inline IndexType HalfRelativeKA<AssociatedKing>::make_index(
+        Square sq_k,
+        IndexType p) {
+        constexpr IndexType width = kBoardWidth;
+        constexpr IndexType height = kBoardHeight;
+        const auto ps_offset = p - PS_W_PAWN;  // PieceSquare value relative to PS_W_PAWN
+        const IndexType kind = ps_offset / SQUARE_NB;
+        const Square piece_sq = static_cast<Square>(ps_offset % SQUARE_NB);
+        const IndexType rel_file = file_of(piece_sq) - file_of(sq_k) + (width / 2);
+        const IndexType rel_rank = rank_of(piece_sq) - rank_of(sq_k) + (height / 2);
+        return height * width * kind + height * rel_file + rel_rank;
+    }
+
+    // Append one index per square in pos.pieces() (no king mask is applied here),
+    // each combined with the oriented square of the associated king
+    template <Side AssociatedKing>
+    void HalfRelativeKA<AssociatedKing>::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
+
+        // Friend: king of `perspective`; Enemy: king of ~perspective
+        const Color king_color =
+            (AssociatedKing == Side::kFriend) ? perspective : ~perspective;
+        const Square king_sq = orient(perspective, pos.square<KING>(king_color));
+
+        for (Bitboard occupied = pos.pieces(); occupied; ) {
+            const Square piece_sq = pop_lsb(&occupied);
+            active->push_back(make_index(perspective, piece_sq, pos.piece_on(piece_sq), king_sq));
+        }
+    }
+
+    // Append the indices turned off/on by the entries of the state's dirtyPiece record
+    template <Side AssociatedKing>
+    void HalfRelativeKA<AssociatedKing>::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
+
+        // Friend: king of `perspective`; Enemy: king of ~perspective
+        const Color king_color =
+            (AssociatedKing == Side::kFriend) ? perspective : ~perspective;
+        const Square king_sq = orient(perspective, pos.square<KING>(king_color));
+
+        const auto& dirty = pos.state()->dirtyPiece;
+        for (int k = 0; k < dirty.dirty_num; ++k) {
+            const Piece moved = dirty.piece[k];
+            const auto from = dirty.from[k];
+            const auto to = dirty.to[k];
+            if (from != SQ_NONE)
+                removed->push_back(make_index(perspective, from, moved, king_sq));
+            if (to != SQ_NONE)
+                added->push_back(make_index(perspective, to, moved, king_sq));
+        }
+    }
+
+    template class HalfRelativeKA<Side::kFriend>;
+    template class HalfRelativeKA<Side::kEnemy>;
+
+}  // namespace Eval::NNUE::Features
@@ -0,0 +1,68 @@
+#ifndef NNUE_FEATURES_HALF_RELATIVE_KA_H_INCLUDED
+#define NNUE_FEATURES_HALF_RELATIVE_KA_H_INCLUDED
+
+#include "features_common.h"
+
+#include "evaluate.h"
+
+// Definition of input features HalfRelativeKA of NNUE evaluation function
+// K - King
+// A - Any piece
+// KA - product of K and A
+namespace Eval::NNUE::Features {
+
+    // Feature HalfRelativeKA: Position of each piece (kings included) relative to own king or enemy king
+    template <Side AssociatedKing>
+    class HalfRelativeKA {
+    public:
+        // Feature name ("HalfRelativeKA(Friend)" or "HalfRelativeKA(Enemy)")
+        static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
+            "HalfRelativeKA(Friend)" : "HalfRelativeKA(Enemy)";
+
+        // Hash value embedded in the evaluation function file (XORed with 1 for the Friend variant)
+        static constexpr std::uint32_t kHashValue =
+            0xA123051Fu ^ (AssociatedKing == Side::kFriend);
+
+        static constexpr IndexType kNumPieceKinds = 6 * 2;  // piece kinds, kings included
+
+        // Width of the virtual board with the king in the center
+        static constexpr IndexType kBoardWidth = FILE_NB * 2 - 1;
+
+        // Height of the virtual board with the king in the center
+        static constexpr IndexType kBoardHeight = RANK_NB * 2 - 1;
+
+        // Number of feature dimensions
+        static constexpr IndexType kDimensions =
+            kNumPieceKinds * kBoardHeight * kBoardWidth;
+
+        // Maximum number of features that can be active (value 1) at the same time
+        static constexpr IndexType kMaxActiveDimensions = 32;
+
+        // Trigger for full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger =
+            (AssociatedKing == Side::kFriend) ?
+            TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+
+        // Append the indices of the active features for `perspective` to `active`
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
+
+        // Append the indices of recently changed features to `removed` and `added`
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+
+        // Find the feature index from the king square `s` and the PieceSquare value `p`
+        static IndexType make_index(Square s, IndexType p);
+
+        // Find the feature index from the king square `sq_k` and piece `pc` on square `s`
+        static IndexType make_index(Color perspective, Square s, Piece pc, Square sq_k);
+    };
+
+}  // namespace Eval::NNUE::Features
+
+#endif // #ifndef NNUE_FEATURES_HALF_RELATIVE_KA_H_INCLUDED
@@ -1,78 +1,91 @@
-//Definition of input features HalfRelativeKP of NNUE evaluation function
-
-#if defined(EVAL_NNUE)
-
-#include "half_relative_kp.h"
+#include "half_relative_kp.h"
 #include "index_list.h"

-namespace Eval {
+//Definition of input features HalfRelativeKP of NNUE evaluation function
+namespace Eval::NNUE::Features {

-namespace NNUE {
+    // Orient a square according to perspective (rotate the board 180° for black):
+    // XOR with 63 when bool(perspective) is true. Rotation has to stay until an arch works
+    // with "flip"; it allows the current master net to be used for gensfen (higher quality data).
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * 63));
+    }

-namespace Features {
+    // Find the index of the feature quantity from the king position and PieceSquare
+    template <Side AssociatedKing>
+    inline IndexType HalfRelativeKP<AssociatedKing>::make_index(
+        Color perspective,
+        Square s,
+        Piece pc,
+        Square sq_k) {

-// Orient a square according to perspective (rotates by 180 for black)
-inline Square orient(Color perspective, Square s) {
-  return Square(int(s) ^ (bool(perspective) * 63));
-}
+        const IndexType p = IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
+        return make_index(sq_k, p);
+    }

-// Find the index of the feature quantity from the ball position and PieceSquare
-template <Side AssociatedKing>
-inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
-  Color perspective, Square s, Piece pc, Square sq_k) {
-  const IndexType p = IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
-  return MakeIndex(sq_k, p);
-}
+    // Find the index of the feature quantity from the king position and PieceSquare
+    template <Side AssociatedKing>
+    inline IndexType HalfRelativeKP<AssociatedKing>::make_index(
+        Square sq_k,
+        IndexType p) {

-// Find the index of the feature quantity from the ball position and PieceSquare
-template <Side AssociatedKing>
-inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
-    Square sq_k, IndexType p) {
-  constexpr IndexType W = kBoardWidth;
-  constexpr IndexType H = kBoardHeight;
-  const IndexType piece_index = (p - PS_W_PAWN) / SQUARE_NB;
-  const Square sq_p = static_cast<Square>((p - PS_W_PAWN) % SQUARE_NB);
-  const IndexType relative_file = file_of(sq_p) - file_of(sq_k) + (W / 2);
-  const IndexType relative_rank = rank_of(sq_p) - rank_of(sq_k) + (H / 2);
-  return H * W * piece_index + H * relative_file + relative_rank;
-}
+        constexpr IndexType W = kBoardWidth;
+        constexpr IndexType H = kBoardHeight;
+        const IndexType piece_index = (p - PS_W_PAWN) / SQUARE_NB;
+        const Square sq_p = static_cast<Square>((p - PS_W_PAWN) % SQUARE_NB);
+        const IndexType relative_file = file_of(sq_p) - file_of(sq_k) + (W / 2);
+        const IndexType relative_rank = rank_of(sq_p) - rank_of(sq_k) + (H / 2);
+        return H * W * piece_index + H * relative_file + relative_rank;
+    }

-// Get a list of indices with a value of 1 among the features
-template <Side AssociatedKing>
-void HalfRelativeKP<AssociatedKing>::AppendActiveIndices(
-    const Position& pos, Color perspective, IndexList* active) {
-  Square ksq = orient(perspective, pos.square<KING>(perspective));
-  Bitboard bb = pos.pieces() & ~pos.pieces(KING);
-  while (bb) {
-    Square s = pop_lsb(&bb);
-    active->push_back(MakeIndex(perspective, s, pos.piece_on(s), ksq));
-  }
-}
+    // Get a list of indices with a value of 1 among the features
+    template <Side AssociatedKing>
+    void HalfRelativeKP<AssociatedKing>::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {

-// Get a list of indices whose values have changed from the previous one in the feature quantity
-template <Side AssociatedKing>
-void HalfRelativeKP<AssociatedKing>::AppendChangedIndices(
-    const Position& pos, Color perspective,
-    IndexList* removed, IndexList* added) {
-  Square ksq = orient(perspective, pos.square<KING>(perspective));
-  const auto& dp = pos.state()->dirtyPiece;
-  for (int i = 0; i < dp.dirty_num; ++i) {
-    Piece pc = dp.piece[i];
-    if (type_of(pc) == KING) continue;
-    if (dp.from[i] != SQ_NONE)
-      removed->push_back(MakeIndex(perspective, dp.from[i], pc, ksq));
-    if (dp.to[i] != SQ_NONE)
-      added->push_back(MakeIndex(perspective, dp.to[i], pc, ksq));
-  }
-}
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));

-template class HalfRelativeKP<Side::kFriend>;
-template class HalfRelativeKP<Side::kEnemy>;
+        Bitboard bb = pos.pieces() & ~pos.pieces(KING);
+        while (bb) {
+            Square s = pop_lsb(&bb);
+            active->push_back(make_index(perspective, s, pos.piece_on(s), ksq));
+        }
+    }

-}  // namespace Features
+    // Get a list of indices whose values have changed from the previous one in the feature quantity
+    template <Side AssociatedKing>
+    void HalfRelativeKP<AssociatedKing>::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {

-}  // namespace NNUE
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));

-}  // namespace Eval
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];

-#endif  // defined(EVAL_NNUE)
+            if (type_of(pc) == KING)
+                continue;
+
+            if (dp.from[i] != SQ_NONE)
+                removed->push_back(make_index(perspective, dp.from[i], pc, ksq));
+
+            if (dp.to[i] != SQ_NONE)
+                added->push_back(make_index(perspective, dp.to[i], pc, ksq));
+        }
+    }
+
+    template class HalfRelativeKP<Side::kFriend>;
+    template class HalfRelativeKP<Side::kEnemy>;
+
+}  // namespace Eval::NNUE::Features
@@ -1,65 +1,66 @@
-//Definition of input features HalfRelativeKP of NNUE evaluation function
-
-#ifndef _NNUE_FEATURES_HALF_RELATIVE_KP_H_
+#ifndef _NNUE_FEATURES_HALF_RELATIVE_KP_H_
 #define _NNUE_FEATURES_HALF_RELATIVE_KP_H_

-#if defined(EVAL_NNUE)
-
-#include "../../evaluate.h"
 #include "features_common.h"

-namespace Eval {
+#include "evaluate.h"

-namespace NNUE {
+// Definition of the HalfRelativeKP input feature of the NNUE evaluation function
+namespace Eval::NNUE::Features {

-namespace Features {
+    // Feature HalfRelativeKP: position of each non-king piece relative to the associated king (own king or enemy king)
+    template <Side AssociatedKing>
+    class HalfRelativeKP {
+    public:
+        // Feature name
+        static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
+            "HalfRelativeKP(Friend)" : "HalfRelativeKP(Enemy)";

-// Feature HalfRelativeKP: Relative position of each piece other than the ball based on own ball or enemy ball
-template <Side AssociatedKing>
-class HalfRelativeKP {
- public:
-  // feature quantity name
-  static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
-      "HalfRelativeKP(Friend)" : "HalfRelativeKP(Enemy)";
-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t kHashValue =
-      0xF9180919u ^ (AssociatedKing == Side::kFriend);
-  // Piece type excluding balls
-  static constexpr IndexType kNumPieceKinds = 5 * 2;
-  // width of the virtual board with the ball in the center
-  static constexpr IndexType kBoardWidth = FILE_NB * 2 - 1;
-  // height of a virtual board with balls in the center
-  static constexpr IndexType kBoardHeight = RANK_NB * 2 - 1;
-  // number of feature dimensions
-  static constexpr IndexType kDimensions =
-      kNumPieceKinds * kBoardHeight * kBoardWidth;
-  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-  static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
-  // Timing of full calculation instead of difference calculation
-  static constexpr TriggerEvent kRefreshTrigger =
-      (AssociatedKing == Side::kFriend) ?
-      TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+        // Hash value embedded in the evaluation function file (the Friend and Enemy variants differ in the lowest bit)
+        static constexpr std::uint32_t kHashValue =
+            0xF9180919u ^ (AssociatedKing == Side::kFriend);

-  // Get a list of indices with a value of 1 among the features
-  static void AppendActiveIndices(const Position& pos, Color perspective,
-                                  IndexList* active);
+        // Number of piece kinds excluding kings (presumably 5 piece types x 2 colors - confirm)
+        static constexpr IndexType kNumPieceKinds = 5 * 2;

-  // Get a list of indices whose values have changed from the previous one in the feature quantity
-  static void AppendChangedIndices(const Position& pos, Color perspective,
-                                   IndexList* removed, IndexList* added);
+        // Width of the virtual board with the king in the center
+        static constexpr IndexType kBoardWidth = FILE_NB * 2 - 1;

-  // Find the index of the feature quantity from the ball position and PieceSquare
-  static IndexType MakeIndex(Square s, IndexType p);
-  // Find the index of the feature quantity from the ball position and PieceSquare
-  static IndexType MakeIndex(Color perspective, Square s, Piece pc, Square sq_k);
-};
+        // Height of the virtual board with the king in the center
+        static constexpr IndexType kBoardHeight = RANK_NB * 2 - 1;

-}  // namespace Features
+        // Number of feature dimensions
+        static constexpr IndexType kDimensions =
+            kNumPieceKinds * kBoardHeight * kBoardWidth;

-}  // namespace NNUE
+        // Maximum number of feature indices that can be active (value 1) at the same time
+        static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count

-}  // namespace Eval
+        // Event that triggers a full recomputation instead of an incremental update: a move of the associated king
+        static constexpr TriggerEvent kRefreshTrigger =
+            (AssociatedKing == Side::kFriend) ?
+            TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;

-#endif  // defined(EVAL_NNUE)
+        // Append to `active` the indices of the features whose value is 1
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
+
+        // Append to `removed` / `added` the indices of the features whose value changed from the previous position
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+
+        // Find the index of the feature from the king position and PieceSquare
+        static IndexType make_index(Square s, IndexType p);
+
+        // Find the index of the feature from the king position and PieceSquare
+        static IndexType make_index(Color perspective, Square s, Piece pc, Square sq_k);
+    };
+
+}  // namespace Eval::NNUE::Features

 #endif
@@ -1,58 +1,45 @@
-//Definition of input feature quantity K of NNUE evaluation function
-
-#if defined(EVAL_NNUE)
-
-#include "k.h"
+#include "k.h"
 #include "index_list.h"

-namespace Eval {
+// Definition of the K input feature of the NNUE evaluation function
+namespace Eval::NNUE::Features {

-namespace NNUE {
+    // Orient a square according to perspective: the square is XORed with 63 when bool(perspective) is true
+    // (a 180° rotation of the board, applied for black). The rotation has to stay until an architecture that works with "flip" is found;
+    // it allows the current master net to be used for gensfen (primarily needed for higher quality data).
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * 63));  // bool(perspective) is 0 or 1, so the XOR mask is 0 or 63
+    }

-namespace Features {
+    // Index of the feature for a king of `king_color` on square `s`, as seen from `perspective`.
+    IndexType K::make_index(Color perspective, Square s, Color king_color) {
+        return IndexType(orient(perspective, s) + bool(perspective ^ king_color) * 64);  // offset of 64 when king_color differs from perspective
+    }

-// Orient a square according to perspective (rotates by 180 for black)
-inline Square orient(Color perspective, Square s) {
-  return Square(int(s) ^ (bool(perspective) * 63));
-}
+    // Append to `active` the index of each king's feature, as seen from `perspective`
+    void K::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {

-// Index of a feature for a given king position.
-IndexType K::MakeIndex(Color perspective, Square s, Color king_color) {
-  return IndexType(orient(perspective, s) + bool(perspective ^ king_color) * 64);
-}
+        for (auto color : Colors) {  // one index is pushed per entry of Colors
+          active->push_back(make_index(perspective, pos.square<KING>(color), color));
+        }
+    }

-// Get a list of indices with a value of 1 among the features
-void K::AppendActiveIndices(
-    const Position& pos, Color perspective, IndexList* active) {
-  for (auto color : Colors) {
-    active->push_back(MakeIndex(perspective, pos.square<KING>(color), color));
-  }
-}
+    // Append to `removed` / `added` the king feature indices changed by the first dirty-piece entry, if that entry is a king
+    void K::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {

-// Get a list of indices whose values have changed from the previous one in the feature quantity
-void K::AppendChangedIndices(
-    const Position& pos, Color perspective,
-    IndexList* removed, IndexList* added) {
-  const auto& dp = pos.state()->dirtyPiece;
-  Color king_color;
-  if (dp.piece[0] == Piece::W_KING) {
-    king_color = WHITE;
-  }
-  else if (dp.piece[0] == Piece::B_KING) {
-    king_color = BLACK;
-  }
-  else {
-    return;
-  }
+        const auto& dp = pos.state()->dirtyPiece;
+        if (type_of(dp.piece[0]) == KING)  // only entry 0 is inspected; nothing is appended when it is not a king
+        {
+            removed->push_back(make_index(perspective, dp.from[0], color_of(dp.piece[0])));  // index for the king's `from` square
+            added->push_back(make_index(perspective, dp.to[0], color_of(dp.piece[0])));  // index for the king's `to` square
+        }
+    }

-  removed->push_back(MakeIndex(perspective, dp.from[0], king_color));
-  added->push_back(MakeIndex(perspective, dp.to[0], king_color));
-}
-
-}  // namespace Features
-
-}  // namespace NNUE
-
-}  // namespace Eval
-
-#endif  // defined(EVAL_NNUE)
+}  // namespace Eval::NNUE::Features
@@ -1,52 +1,49 @@
-//Definition of input feature quantity K of NNUE evaluation function
-
-#ifndef _NNUE_FEATURES_K_H_
+#ifndef _NNUE_FEATURES_K_H_
 #define _NNUE_FEATURES_K_H_

-#if defined(EVAL_NNUE)
-
-#include "../../evaluate.h"
 #include "features_common.h"

-namespace Eval {
+#include "evaluate.h"

-namespace NNUE {
+// Definition of the K input feature of the NNUE evaluation function
+namespace Eval::NNUE::Features {

-namespace Features {
+    // Feature K: king position
+    class K {
+    public:
+        // Feature name
+        static constexpr const char* kName = "K";

-// Feature K: Ball position
-class K {
- public:
-  // feature quantity name
-  static constexpr const char* kName = "K";
-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t kHashValue = 0xD3CEE169u;
-  // number of feature dimensions
-  static constexpr IndexType kDimensions = SQUARE_NB * 2;
-  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-  static constexpr IndexType kMaxActiveDimensions = 2;
-  // Timing of full calculation instead of difference calculation
-  static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue = 0xD3CEE169u;

-  // Get a list of indices with a value of 1 among the features
-  static void AppendActiveIndices(const Position& pos, Color perspective,
-                                  IndexList* active);
+        // Number of feature dimensions (two blocks of SQUARE_NB indices)
+        static constexpr IndexType kDimensions = SQUARE_NB * 2;

-  // Get a list of indices whose values have changed from the previous one in the feature quantity
-  static void AppendChangedIndices(const Position& pos, Color perspective,
-                                   IndexList* removed, IndexList* added);
+        // Maximum number of feature indices that can be active (value 1) at the same time
+        static constexpr IndexType kMaxActiveDimensions = 2;

-private:
-  // Index of a feature for a given king position.
-  static IndexType MakeIndex(Color perspective, Square s, Color king_color);
-};
+        // Event that triggers a full recomputation instead of an incremental update (none for this feature)
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;

-}  // namespace Features
+        // Append to `active` the indices of the features whose value is 1
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);

-}  // namespace NNUE
+        // Append to `removed` / `added` the indices of the features whose value changed from the previous position
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);

-}  // namespace Eval
+    private:
+        // Index of the feature for a king of `king_color` on square `s`, as seen from `perspective`.
+        static IndexType make_index(Color perspective, Square s, Color king_color);
+    };

-#endif  // defined(EVAL_NNUE)
+}  // namespace Eval::NNUE::Features

 #endif
@@ -1,56 +1,55 @@
-//Definition of input feature P of NNUE evaluation function
-
-#if defined(EVAL_NNUE)
-
-#include "p.h"
+#include "p.h"
 #include "index_list.h"

-namespace Eval {
+// Definition of the P input feature of the NNUE evaluation function
+namespace Eval::NNUE::Features {

-namespace NNUE {
+    // Orient a square according to perspective: the square is XORed with 63 when bool(perspective) is true
+    // (a 180° rotation of the board, applied for black). The rotation has to stay until an architecture that works with "flip" is found;
+    // it allows the current master net to be used for gensfen (primarily needed for higher quality data).
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * 63));  // bool(perspective) is 0 or 1, so the XOR mask is 0 or 63
+    }

-namespace Features {
+    // Index of the feature for piece `pc` on square `s`, as seen from `perspective` (no king square is involved)
+    inline IndexType P::make_index(
+        Color perspective, Square s, Piece pc) {
+        return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);  // oriented square plus the table offset for (pc, perspective)
+    }

-// Orient a square according to perspective (rotates by 180 for black)
-inline Square orient(Color perspective, Square s) {
-  return Square(int(s) ^ (bool(perspective) * 63));
-}
+    // Append to `active` the feature index of every non-king piece on the board, as seen from `perspective`
+    void P::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {

-// Find the index of the feature quantity from the king position and PieceSquare
-inline IndexType P::MakeIndex(
-  Color perspective, Square s, Piece pc) {
-  return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
-}
+        Bitboard bb = pos.pieces() & ~pos.pieces(KING);  // all pieces with the kings masked out
+        while (bb) {
+            Square s = pop_lsb(&bb);  // presumably extracts and clears the lowest set square - confirm against pop_lsb
+            active->push_back(make_index(perspective, s, pos.piece_on(s)));
+        }
+    }

-// Get a list of indices with a value of 1 among the features
-void P::AppendActiveIndices(
-    const Position& pos, Color perspective, IndexList* active) {
-  Bitboard bb = pos.pieces() & ~pos.pieces(KING);
-  while (bb) {
-    Square s = pop_lsb(&bb);
-    active->push_back(MakeIndex(perspective, s, pos.piece_on(s)));
-  }
-}
+    // Append to `removed` / `added` the feature indices affected by the entries of pos.state()->dirtyPiece
+    void P::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,  // receives an index for every non-king dirty entry with a valid `from` square
+        IndexList* added) {  // receives an index for every non-king dirty entry with a valid `to` square

-// Get a list of indices whose values have changed from the previous one in the feature quantity
-void P::AppendChangedIndices(
-    const Position& pos, Color perspective,
-    IndexList* removed, IndexList* added) {
-  const auto& dp = pos.state()->dirtyPiece;
-  for (int i = 0; i < dp.dirty_num; ++i) {
-    Piece pc = dp.piece[i];
-    if (type_of(pc) == KING) continue;
-    if (dp.from[i] != SQ_NONE)
-      removed->push_back(MakeIndex(perspective, dp.from[i], pc));
-    if (dp.to[i] != SQ_NONE)
-      added->push_back(MakeIndex(perspective, dp.to[i], pc));
-  }
-}
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];

-}  // namespace Features
+            if (type_of(pc) == KING)  // king entries produce no index in this feature set
+              continue;

-}  // namespace NNUE
+            if (dp.from[i] != SQ_NONE)  // entry has an origin square: its old index goes to `removed`
+              removed->push_back(make_index(perspective, dp.from[i], pc));

-}  // namespace Eval
+            if (dp.to[i] != SQ_NONE)  // entry has a destination square: its new index goes to `added`
+              added->push_back(make_index(perspective, dp.to[i], pc));
+        }
+    }

-#endif  // defined(EVAL_NNUE)
+}  // namespace Eval::NNUE::Features
@@ -1,52 +1,49 @@
-//Definition of input feature P of NNUE evaluation function
-
-#ifndef _NNUE_FEATURES_P_H_
+#ifndef _NNUE_FEATURES_P_H_
 #define _NNUE_FEATURES_P_H_

-#if defined(EVAL_NNUE)
-
-#include "../../evaluate.h"
 #include "features_common.h"

-namespace Eval {
+#include "evaluate.h"

-namespace NNUE {
+// Definition of the P input feature of the NNUE evaluation function
+namespace Eval::NNUE::Features {

-namespace Features {
+    // Feature P: PieceSquare of pieces other than kings
+    class P {
+    public:
+        // Feature name
+        static constexpr const char* kName = "P";

-// Feature P: PieceSquare of pieces other than balls
-class P {
- public:
-  // feature quantity name
-  static constexpr const char* kName = "P";
-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t kHashValue = 0x764CFB4Bu;
-  // number of feature dimensions
-  static constexpr IndexType kDimensions = PS_END;
-  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-  static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
-  // Timing of full calculation instead of difference calculation
-  static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue = 0x764CFB4Bu;

-  // Get a list of indices with a value of 1 among the features
-  static void AppendActiveIndices(const Position& pos, Color perspective,
-                                  IndexList* active);
+        // Number of feature dimensions
+        static constexpr IndexType kDimensions = PS_END;

-  // Get a list of indices whose values have changed from the previous one in the feature quantity
-  static void AppendChangedIndices(const Position& pos, Color perspective,
-                                   IndexList* removed, IndexList* added);
+        // Maximum number of feature indices that can be active (value 1) at the same time
+        static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count

- private:
-  // Index of a feature for a given piece on some square
-  static IndexType MakeIndex(Color perspective, Square s, Piece pc);
-};
+        // Event that triggers a full recomputation instead of an incremental update (none for this feature)
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;

-}  // namespace Features
+        // Append to `active` the indices of the features whose value is 1
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);

-}  // namespace NNUE
+        // Append to `removed` / `added` the indices of the features whose value changed from the previous position
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);

-}  // namespace Eval
+    private:
+        // Index of the feature for piece `pc` on square `s`, as seen from `perspective`
+        static IndexType make_index(Color perspective, Square s, Piece pc);
+    };

-#endif  // defined(EVAL_NNUE)
+}  // namespace Eval::NNUE::Features

 #endif
@@ -24,6 +24,10 @@
 #include <iostream>
 #include "../nnue_common.h"

+#include <string>
+#include <type_traits>
+#include <cstdint>
+
 namespace Eval::NNUE::Layers {

  // Affine transformation layer
@@ -50,6 +54,8 @@ namespace Eval::NNUE::Layers {
    static constexpr std::size_t kBufferSize =
        PreviousLayer::kBufferSize + kSelfBufferSize;

+    static constexpr int kLayerIndex = PreviousLayer::kLayerIndex + 1;  // one more than the previous layer's index; printed by get_layers_info()
+
    // Hash value embedded in the evaluation file
    static constexpr std::uint32_t GetHashValue() {
      std::uint32_t hash_value = 0xCC03DAE4u;
@@ -59,14 +65,27 @@ namespace Eval::NNUE::Layers {
      return hash_value;
    }

-    // A string that represents the structure from the input layer to this layer
-    static std::string GetStructureString() {
-      return "AffineTransform[" +
-        std::to_string(kOutputDimensions) + "<-" +
-        std::to_string(kInputDimensions) + "](" +
-        PreviousLayer::GetStructureString() + ")";
+    static std::string get_name() {  // name of this layer alone, formatted as AffineTransform[<outputs><-<inputs>]
+        return "AffineTransform[" +
+            std::to_string(kOutputDimensions) + "<-" +
+            std::to_string(kInputDimensions) + "]";
    }
-    
+
+    // Structure string from the input layer up to this layer: this layer's name followed by the previous layer's structure string in parentheses
+    static std::string get_structure_string() {
+        return get_name() + "(" +
+            PreviousLayer::get_structure_string() + ")";
+    }
+
+    static std::string get_layers_info() {  // listing of all layers up to this one, one "  - <index> - <name>" line per layer
+        std::string info = PreviousLayer::get_layers_info();  // start from the listing of the preceding layers
+        info += "\n  - ";
+        info += std::to_string(kLayerIndex);
+        info += " - ";
+        info += get_name();
+        return info;
+    }
+
   // Read network parameters
    bool ReadParameters(std::istream& stream) {
      if (!previous_layer_.ReadParameters(stream)) return false;
@@ -79,13 +98,17 @@ namespace Eval::NNUE::Layers {

    // write parameters
    bool WriteParameters(std::ostream& stream) const {
-      if (!previous_layer_.WriteParameters(stream)) return false;
-      stream.write(reinterpret_cast<const char*>(biases_),
-        kOutputDimensions * sizeof(BiasType));
-      stream.write(reinterpret_cast<const char*>(weights_),
-        kOutputDimensions * kPaddedInputDimensions *
-        sizeof(WeightType));
-      return !stream.fail();
+        if (!previous_layer_.WriteParameters(stream))  // the previous layer writes first; stop if it reports failure
+            return false;
+
+        stream.write(reinterpret_cast<const char*>(biases_),  // raw bytes of the biases
+            kOutputDimensions * sizeof(BiasType));
+
+        stream.write(reinterpret_cast<const char*>(weights_),  // raw bytes of the weights, rows padded to kPaddedInputDimensions
+            kOutputDimensions * kPaddedInputDimensions *
+            sizeof(WeightType));
+
+        return !stream.fail();  // true when the stream reports no failure after the writes
    }

    // Forward propagation
@@ -93,113 +116,606 @@ namespace Eval::NNUE::Layers {
        const TransformedFeatureType* transformed_features, char* buffer) const {
      const auto input = previous_layer_.Propagate(
          transformed_features, buffer + kSelfBufferSize);
+
+#if defined (USE_AVX512)
+
+      [[maybe_unused]] const __m512i kOnes512 = _mm512_set1_epi16(1);
+
+      [[maybe_unused]] auto m512_hadd = [](__m512i sum, int bias) -> int {
+        return _mm512_reduce_add_epi32(sum) + bias;
+      };
+
+      // This function takes
+      //   sum0 = [xmm0a, xmm0b, xmm0c, xmm0d]
+      //   sum1 = [xmm1a, xmm1b, xmm1c, xmm1d]
+      //   sum2 = [xmm2a, xmm2b, xmm2c, xmm2d]
+      //   sum3 = [xmm3a, xmm3b, xmm3c, xmm3d]
+      // and returns
+      //   ret = [
+      //     reduce_add_epi32(xmm0a), reduce_add_epi32(xmm1a), reduce_add_epi32(xmm2a), reduce_add_epi32(xmm3a),
+      //     reduce_add_epi32(xmm0b), reduce_add_epi32(xmm1b), reduce_add_epi32(xmm2b), reduce_add_epi32(xmm3b),
+      //     reduce_add_epi32(xmm0c), reduce_add_epi32(xmm1c), reduce_add_epi32(xmm2c), reduce_add_epi32(xmm3c),
+      //     reduce_add_epi32(xmm0d), reduce_add_epi32(xmm1d), reduce_add_epi32(xmm2d), reduce_add_epi32(xmm3d)
+      //   ]
+      [[maybe_unused]] auto m512_hadd128x16_interleave = [](
+        __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3) -> __m512i {
+
+        __m512i sum01a = _mm512_unpacklo_epi32(sum0, sum1);
+        __m512i sum01b = _mm512_unpackhi_epi32(sum0, sum1);
+
+        __m512i sum23a = _mm512_unpacklo_epi32(sum2, sum3);
+        __m512i sum23b = _mm512_unpackhi_epi32(sum2, sum3);
+
+        __m512i sum01 = _mm512_add_epi32(sum01a, sum01b);
+        __m512i sum23 = _mm512_add_epi32(sum23a, sum23b);
+
+        __m512i sum0123a = _mm512_unpacklo_epi64(sum01, sum23);
+        __m512i sum0123b = _mm512_unpackhi_epi64(sum01, sum23);
+
+        return _mm512_add_epi32(sum0123a, sum0123b);
+      };
+
+      [[maybe_unused]] auto m512_haddx4 = [m512_hadd128x16_interleave](
+        __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, __m128i bias) -> __m128i {
+
+        __m512i sum = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3);
+
+        __m256i sum256lo = _mm512_castsi512_si256(sum);
+        __m256i sum256hi = _mm512_extracti64x4_epi64(sum, 1);
+
+        sum256lo = _mm256_add_epi32(sum256lo, sum256hi);
+
+        __m128i sum128lo = _mm256_castsi256_si128(sum256lo);
+        __m128i sum128hi = _mm256_extracti128_si256(sum256lo, 1);
+
+        return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias);
+      };
+
+      [[maybe_unused]] auto m512_haddx8 = [m512_hadd128x16_interleave](
+        __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3,
+        __m512i sum4, __m512i sum5, __m512i sum6, __m512i sum7, __m256i bias) -> __m256i {
+
+        __m512i suma = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3);
+        __m512i sumb = m512_hadd128x16_interleave(sum4, sum5, sum6, sum7);
+
+        __m512i indices0 = _mm512_setr_epi64(0, 1, 8, 9, 4, 5, 12, 13);
+        __m512i indices1 = _mm512_setr_epi64(2, 3, 10, 11, 6, 7, 14, 15);
+        __m512i x = _mm512_add_epi32(
+          _mm512_permutex2var_epi64(suma, indices0, sumb),
+          _mm512_permutex2var_epi64(suma, indices1, sumb));
+
+        __m256i sum256lo = _mm512_castsi512_si256(x);
+        __m256i sum256hi = _mm512_extracti64x4_epi64(x, 1);
+
+        return _mm256_add_epi32(_mm256_add_epi32(sum256lo, sum256hi), bias);
+      };
+
+      [[maybe_unused]] auto m512_hadd256x8 =[m512_hadd128x16_interleave](
+        __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, __m256i bias) -> __m256i {
+
+        __m512i sum = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3);
+
+        __m512i indices = _mm512_setr_epi32(
+          0, 4, 8, 12, 2, 6, 10, 14,
+          1, 5, 9, 13, 3, 7, 11, 15);
+        sum = _mm512_permutexvar_epi32(indices, sum);
+
+        __m256i sum256lo = _mm512_castsi512_si256(sum);
+        __m256i sum256hi = _mm512_extracti64x4_epi64(sum, 1);
+
+        return _mm256_add_epi32(_mm256_hadd_epi32(sum256lo, sum256hi), bias);
+      };
+
+      [[maybe_unused]] auto m512_hadd256x16 = [m512_hadd128x16_interleave](
+        __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3,
+        __m512i sum4, __m512i sum5, __m512i sum6, __m512i sum7, __m512i bias) -> __m512i {
+
+        __m512i suma = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3);
+        __m512i sumb = m512_hadd128x16_interleave(sum4, sum5, sum6, sum7);
+
+        __m512i indices0 = _mm512_setr_epi64(0, 1, 8, 9, 4, 5, 12, 13);
+        __m512i indices1 = _mm512_setr_epi64(2, 3, 10, 11, 6, 7, 14, 15);
+        __m512i x = _mm512_add_epi32(
+          _mm512_permutex2var_epi64(suma, indices0, sumb),
+          _mm512_permutex2var_epi64(suma, indices1, sumb));
+
+        __m512i indices = _mm512_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15);
+        return _mm512_add_epi32(_mm512_permutexvar_epi32(indices, x), bias);
+      };
+
+#if defined (USE_VNNI)
+      [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) {
+        acc = _mm512_dpbusd_epi32(acc, a, b);
+#else
+      [[maybe_unused]] auto m512_dpbusd_epi32 = [=](__m512i a, __m512i b) -> __m512i {
+        __m512i product0 = _mm512_maddubs_epi16(a, b);
+        return _mm512_madd_epi16(product0, kOnes512);
+#endif
+      };
+
+#endif
+#if defined (USE_AVX2)
+
+      [[maybe_unused]] const __m256i kOnes256 = _mm256_set1_epi16(1);
+
+      [[maybe_unused]] auto m256_hadd = [](__m256i sum, int bias) -> int {
+        __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
+        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
+        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
+        return _mm_cvtsi128_si32(sum128) + bias;
+      };
+
+      [[maybe_unused]] auto m256_haddx4 = [](__m256i sum0, __m256i sum1, __m256i sum2, __m256i sum3, __m128i bias) -> __m128i {
+        sum0 = _mm256_hadd_epi32(sum0, sum1);
+        sum2 = _mm256_hadd_epi32(sum2, sum3);
+
+        sum0 = _mm256_hadd_epi32(sum0, sum2);
+
+        __m128i sum128lo = _mm256_castsi256_si128(sum0);
+        __m128i sum128hi = _mm256_extracti128_si256(sum0, 1);
+
+        return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias);
+      };
+#if defined (USE_VNNI)
+      [[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) {
+        acc = _mm256_dpbusd_epi32(acc, a, b);
+#else
+      [[maybe_unused]] auto m256_dpbusd_epi32 = [=](__m256i a, __m256i b) -> __m256i {
+        __m256i product0 = _mm256_maddubs_epi16(a, b);
+        return _mm256_madd_epi16(product0, kOnes256);
+#endif
+      };
+
+#endif
+
+#if defined (USE_SSSE3)
+
+      [[maybe_unused]] const __m128i kOnes128 = _mm_set1_epi16(1);
+
+      [[maybe_unused]] auto m128_hadd = [](__m128i sum, int bias) -> int {
+        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
+        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
+        return _mm_cvtsi128_si32(sum) + bias;
+      };
+
+      [[maybe_unused]] auto m128_haddx4 = [](__m128i sum0, __m128i sum1, __m128i sum2, __m128i sum3, __m128i bias) -> __m128i {
+        sum0 = _mm_hadd_epi32(sum0, sum1);
+        sum2 = _mm_hadd_epi32(sum2, sum3);
+
+        sum0 = _mm_hadd_epi32(sum0, sum2);
+
+        return _mm_add_epi32(sum0, bias);
+      };
+
+      [[maybe_unused]] auto m128_dpbusd_epi32 = [=](__m128i a, __m128i b) -> __m128i {
+        __m128i product0 = _mm_maddubs_epi16(a, b);
+        return _mm_madd_epi16(product0, kOnes128);
+      };
+
+#endif
+
+#if defined (USE_AVX512)
+
+      constexpr IndexType kNumChunks512 = kPaddedInputDimensions / (kSimdWidth * 2);
+      constexpr IndexType kNumChunks256 = kPaddedInputDimensions / kSimdWidth;
+
      const auto output = reinterpret_cast<OutputType*>(buffer);

-  #if defined(USE_AVX512)
-      constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2);
-      const auto input_vector = reinterpret_cast<const __m512i*>(input);
-  #if !defined(USE_VNNI)
-      const __m512i kOnes = _mm512_set1_epi16(1);
-  #endif
+      // Since to saturate a zmm register it takes 64 bytes we
+      // cannot use AVX512 for the smaller affine transforms.
+      // Instead we fallback to a AVX2 implementation if the
+      // kInputDimensions isn't a multiple of 64.
+      // Note that this means that for example for
+      // kInputDimensions of 96 we fallback to AVX2 even though
+      // the first 64 elements could be processed with AVX512.
+      // This is caused by mixing the __m256 and __m512 variables
+      // required to better handle that case and it would
+      // require handling more cases statically not to lose performance.
+      // This should be revisited if such input dimensions are to be considered.
+      [[maybe_unused]] const auto input_vector512 = reinterpret_cast<const __m512i*>(input);
+      [[maybe_unused]] const auto input_vector256 = reinterpret_cast<const __m256i*>(input);
+
+      // kOutputDimensions is either 1 or a multiple of kSimdWidth
+      // because then it is also an input dimension.
+      if constexpr (kOutputDimensions % 16 == 0 && kNumChunks256 == 1)
+      {
+        for (IndexType i = 0; i < kOutputDimensions; i += 16)
+        {
+          const IndexType offset01a = (i + 0) * kPaddedInputDimensions;
+          const IndexType offset23a = (i + 2) * kPaddedInputDimensions;
+          const IndexType offset45a = (i + 4) * kPaddedInputDimensions;
+          const IndexType offset67a = (i + 6) * kPaddedInputDimensions;
+          const IndexType offset01b = (i + 8) * kPaddedInputDimensions;
+          const IndexType offset23b = (i + 10) * kPaddedInputDimensions;
+          const IndexType offset45b = (i + 12) * kPaddedInputDimensions;
+          const IndexType offset67b = (i + 14) * kPaddedInputDimensions;
+
+          const __m512i bias = *reinterpret_cast<const __m512i*>(&biases_[i]);
+          __m512i* outptr = reinterpret_cast<__m512i*>(&output[i]);
+
+          const auto row01a = *reinterpret_cast<const __m512i*>(&weights_[offset01a]);
+          const auto row23a = *reinterpret_cast<const __m512i*>(&weights_[offset23a]);
+          const auto row45a = *reinterpret_cast<const __m512i*>(&weights_[offset45a]);
+          const auto row67a = *reinterpret_cast<const __m512i*>(&weights_[offset67a]);
+          const auto row01b = *reinterpret_cast<const __m512i*>(&weights_[offset01b]);
+          const auto row23b = *reinterpret_cast<const __m512i*>(&weights_[offset23b]);
+          const auto row45b = *reinterpret_cast<const __m512i*>(&weights_[offset45b]);
+          const auto row67b = *reinterpret_cast<const __m512i*>(&weights_[offset67b]);
+
+          const __m256i in256 = input_vector256[0];
+          const __m512i in = _mm512_inserti64x4(_mm512_castsi256_si512(in256), in256, 1);
+
+#if defined (USE_VNNI)
+          __m512i sum01a = _mm512_setzero_si512();
+          __m512i sum23a = _mm512_setzero_si512();
+          __m512i sum45a = _mm512_setzero_si512();
+          __m512i sum67a = _mm512_setzero_si512();
+          __m512i sum01b = _mm512_setzero_si512();
+          __m512i sum23b = _mm512_setzero_si512();
+          __m512i sum45b = _mm512_setzero_si512();
+          __m512i sum67b = _mm512_setzero_si512();
+
+          m512_add_dpbusd_epi32(sum01a, in, row01a);
+          m512_add_dpbusd_epi32(sum23a, in, row23a);
+          m512_add_dpbusd_epi32(sum45a, in, row45a);
+          m512_add_dpbusd_epi32(sum67a, in, row67a);
+          m512_add_dpbusd_epi32(sum01b, in, row01b);
+          m512_add_dpbusd_epi32(sum23b, in, row23b);
+          m512_add_dpbusd_epi32(sum45b, in, row45b);
+          m512_add_dpbusd_epi32(sum67b, in, row67b);
+#else
+          __m512i sum01a = m512_dpbusd_epi32(in, row01a);
+          __m512i sum23a = m512_dpbusd_epi32(in, row23a);
+          __m512i sum45a = m512_dpbusd_epi32(in, row45a);
+          __m512i sum67a = m512_dpbusd_epi32(in, row67a);
+          __m512i sum01b = m512_dpbusd_epi32(in, row01b);
+          __m512i sum23b = m512_dpbusd_epi32(in, row23b);
+          __m512i sum45b = m512_dpbusd_epi32(in, row45b);
+          __m512i sum67b = m512_dpbusd_epi32(in, row67b);
+#endif
+
+          *outptr = m512_hadd256x16(
+            sum01a, sum23a, sum45a, sum67a,
+            sum01b, sum23b, sum45b, sum67b, bias);
+        }
+      }
+      else if constexpr (kOutputDimensions % 4 == 0)
+      {
+        for (IndexType i = 0; i < kOutputDimensions; i += 4)
+        {
+          const IndexType offset0 = (i + 0) * kPaddedInputDimensions;
+          const IndexType offset1 = (i + 1) * kPaddedInputDimensions;
+          const IndexType offset2 = (i + 2) * kPaddedInputDimensions;
+          const IndexType offset3 = (i + 3) * kPaddedInputDimensions;
+
+          const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
+          __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
+
+          if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0)
+          {
+            const auto row0 = reinterpret_cast<const __m512i*>(&weights_[offset0]);
+            const auto row1 = reinterpret_cast<const __m512i*>(&weights_[offset1]);
+            const auto row2 = reinterpret_cast<const __m512i*>(&weights_[offset2]);
+            const auto row3 = reinterpret_cast<const __m512i*>(&weights_[offset3]);
+
+#if defined (USE_VNNI)
+            __m512i sum0 = _mm512_setzero_si512();
+            __m512i sum1 = _mm512_setzero_si512();
+            __m512i sum2 = _mm512_setzero_si512();
+            __m512i sum3 = _mm512_setzero_si512();
+            const IndexType kStart = 0;
+#else
+            __m512i sum0 = m512_dpbusd_epi32(input_vector512[0], row0[0]);
+            __m512i sum1 = m512_dpbusd_epi32(input_vector512[0], row1[0]);
+            __m512i sum2 = m512_dpbusd_epi32(input_vector512[0], row2[0]);
+            __m512i sum3 = m512_dpbusd_epi32(input_vector512[0], row3[0]);
+            const IndexType kStart = 1;
+#endif
+
+            for (IndexType j = kStart; j < kNumChunks512; ++j)
+            {
+              const __m512i in = input_vector512[j];
+
+#if defined (USE_VNNI)
+              m512_add_dpbusd_epi32(sum0, in, row0[j]);
+              m512_add_dpbusd_epi32(sum1, in, row1[j]);
+              m512_add_dpbusd_epi32(sum2, in, row2[j]);
+              m512_add_dpbusd_epi32(sum3, in, row3[j]);
+#else
+              sum0 = _mm512_add_epi32(sum0, m512_dpbusd_epi32(in, row0[j]));
+              sum1 = _mm512_add_epi32(sum1, m512_dpbusd_epi32(in, row1[j]));
+              sum2 = _mm512_add_epi32(sum2, m512_dpbusd_epi32(in, row2[j]));
+              sum3 = _mm512_add_epi32(sum3, m512_dpbusd_epi32(in, row3[j]));
+#endif
+            }
+
+            *outptr = m512_haddx4(sum0, sum1, sum2, sum3, bias);
+          }
+          else
+          {
+            const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]);
+            const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]);
+            const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]);
+            const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]);
+
+#if defined (USE_VNNI)
+            __m256i sum0 = _mm256_setzero_si256();
+            __m256i sum1 = _mm256_setzero_si256();
+            __m256i sum2 = _mm256_setzero_si256();
+            __m256i sum3 = _mm256_setzero_si256();
+            const IndexType kStart = 0;
+#else
+            __m256i sum0 = m256_dpbusd_epi32(input_vector256[0], row0[0]);
+            __m256i sum1 = m256_dpbusd_epi32(input_vector256[0], row1[0]);
+            __m256i sum2 = m256_dpbusd_epi32(input_vector256[0], row2[0]);
+            __m256i sum3 = m256_dpbusd_epi32(input_vector256[0], row3[0]);
+            const IndexType kStart = 1;
+#endif
+
+            for (IndexType j = kStart; j < kNumChunks256; ++j)
+            {
+              const __m256i in = input_vector256[j];
+
+#if defined (USE_VNNI)
+              m256_add_dpbusd_epi32(sum0, in, row0[j]);
+              m256_add_dpbusd_epi32(sum1, in, row1[j]);
+              m256_add_dpbusd_epi32(sum2, in, row2[j]);
+              m256_add_dpbusd_epi32(sum3, in, row3[j]);
+#else
+              sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
+              sum1 = _mm256_add_epi32(sum1, m256_dpbusd_epi32(in, row1[j]));
+              sum2 = _mm256_add_epi32(sum2, m256_dpbusd_epi32(in, row2[j]));
+              sum3 = _mm256_add_epi32(sum3, m256_dpbusd_epi32(in, row3[j]));
+#endif
+            }
+
+            *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias);
+          }
+        }
+      }
+      else if constexpr (kOutputDimensions == 1)
+      {
+        if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0)
+        {
+          const auto row0 = reinterpret_cast<const __m512i*>(&weights_[0]);
+
+#if defined (USE_VNNI)
+          __m512i sum0 = _mm512_setzero_si512();
+          const IndexType kStart = 0;
+#else
+          __m512i sum0 = m512_dpbusd_epi32(input_vector512[0], row0[0]);
+          const IndexType kStart = 1;
+#endif
+
+          for (IndexType j = kStart; j < kNumChunks512; ++j)
+          {
+            const __m512i in = input_vector512[j];
+
+#if defined (USE_VNNI)
+            m512_add_dpbusd_epi32(sum0, in, row0[j]);
+#else
+            sum0 = _mm512_add_epi32(sum0, m512_dpbusd_epi32(in, row0[j]));
+#endif
+          }
+
+          output[0] = m512_hadd(sum0, biases_[0]);
+        }
+        else
+        {
+          const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]);
+
+#if defined (USE_VNNI)
+          __m256i sum0 = _mm256_setzero_si256();
+          const IndexType kStart = 0;
+#else
+          __m256i sum0 = m256_dpbusd_epi32(input_vector256[0], row0[0]);
+          const IndexType kStart = 1;
+#endif
+
+          for (IndexType j = kStart; j < kNumChunks256; ++j)
+          {
+            const __m256i in = input_vector256[j];
+
+#if defined (USE_VNNI)
+            m256_add_dpbusd_epi32(sum0, in, row0[j]);
+#else
+            sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
+#endif
+          }
+
+          output[0] = m256_hadd(sum0, biases_[0]);
+        }
+      }
+      else
+      {
+        // This case can never happen because kOutputDimensions
+        // is always 1 or a multiple of kSimdWidth.
+        assert(false);
+      }
+
+#elif defined (USE_AVX2)

-  #elif defined(USE_AVX2)
      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+
+      const auto output = reinterpret_cast<OutputType*>(buffer);
      const auto input_vector = reinterpret_cast<const __m256i*>(input);
-  #if !defined(USE_VNNI)
-      const __m256i kOnes = _mm256_set1_epi16(1);
-  #endif

-  #elif defined(USE_SSE2)
+      // kOutputDimensions is either 1 or a multiple of kSimdWidth,
+      // because when it is not 1 it is also an input dimension.
+      if constexpr (kOutputDimensions % 4 == 0)
+      {
+        for (IndexType i = 0; i < kOutputDimensions; i += 4)
+        {
+          const IndexType offset0 = (i + 0) * kPaddedInputDimensions;
+          const IndexType offset1 = (i + 1) * kPaddedInputDimensions;
+          const IndexType offset2 = (i + 2) * kPaddedInputDimensions;
+          const IndexType offset3 = (i + 3) * kPaddedInputDimensions;
+
+          const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
+          __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
+
+          const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]);
+          const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]);
+          const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]);
+          const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]);
+
+#if defined (USE_VNNI)
+          __m256i sum0 = _mm256_setzero_si256();
+          __m256i sum1 = _mm256_setzero_si256();
+          __m256i sum2 = _mm256_setzero_si256();
+          __m256i sum3 = _mm256_setzero_si256();
+          const IndexType kStart = 0;
+#else
+          __m256i sum0 = m256_dpbusd_epi32(input_vector[0], row0[0]);
+          __m256i sum1 = m256_dpbusd_epi32(input_vector[0], row1[0]);
+          __m256i sum2 = m256_dpbusd_epi32(input_vector[0], row2[0]);
+          __m256i sum3 = m256_dpbusd_epi32(input_vector[0], row3[0]);
+          const IndexType kStart = 1;
+#endif
+
+          for (IndexType j = kStart; j < kNumChunks; ++j)
+          {
+            const __m256i in = input_vector[j];
+
+#if defined (USE_VNNI)
+            m256_add_dpbusd_epi32(sum0, in, row0[j]);
+            m256_add_dpbusd_epi32(sum1, in, row1[j]);
+            m256_add_dpbusd_epi32(sum2, in, row2[j]);
+            m256_add_dpbusd_epi32(sum3, in, row3[j]);
+#else
+            sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
+            sum1 = _mm256_add_epi32(sum1, m256_dpbusd_epi32(in, row1[j]));
+            sum2 = _mm256_add_epi32(sum2, m256_dpbusd_epi32(in, row2[j]));
+            sum3 = _mm256_add_epi32(sum3, m256_dpbusd_epi32(in, row3[j]));
+#endif
+          }
+
+          *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias);
+        }
+      }
+      else if constexpr (kOutputDimensions == 1)
+      {
+        const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]);
+
+#if defined (USE_VNNI)
+        __m256i sum0 = _mm256_setzero_si256();
+        const IndexType kStart = 0;
+#else
+        __m256i sum0 = m256_dpbusd_epi32(input_vector[0], row0[0]);
+        const IndexType kStart = 1;
+#endif
+
+        for (IndexType j = kStart; j < kNumChunks; ++j)
+        {
+          const __m256i in = input_vector[j];
+
+#if defined (USE_VNNI)
+          m256_add_dpbusd_epi32(sum0, in, row0[j]);
+#else
+          sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
+#endif
+        }
+
+        output[0] = m256_hadd(sum0, biases_[0]);
+      }
+      else
+      {
+        // This case can never happen because kOutputDimensions
+        // is always 1 or a multiple of kSimdWidth.
+        assert(false);
+      }
+
+#elif defined (USE_SSSE3)
+
      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
-  #ifndef USE_SSSE3
-      const __m128i kZeros = _mm_setzero_si128();
-  #else
-      const __m128i kOnes = _mm_set1_epi16(1);
-  #endif
+
+      auto output = reinterpret_cast<OutputType*>(buffer);
      const auto input_vector = reinterpret_cast<const __m128i*>(input);

-  #elif defined(USE_MMX)
+      // kOutputDimensions is either 1 or a multiple of kSimdWidth,
+      // because when it is not 1 it is also an input dimension.
+      if constexpr (kOutputDimensions % 4 == 0)
+      {
+        for (IndexType i = 0; i < kOutputDimensions; i += 4)
+        {
+          const IndexType offset0 = (i + 0) * kPaddedInputDimensions;
+          const IndexType offset1 = (i + 1) * kPaddedInputDimensions;
+          const IndexType offset2 = (i + 2) * kPaddedInputDimensions;
+          const IndexType offset3 = (i + 3) * kPaddedInputDimensions;
+
+          const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
+          __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
+
+          const auto row0 = reinterpret_cast<const __m128i*>(&weights_[offset0]);
+          const auto row1 = reinterpret_cast<const __m128i*>(&weights_[offset1]);
+          const auto row2 = reinterpret_cast<const __m128i*>(&weights_[offset2]);
+          const auto row3 = reinterpret_cast<const __m128i*>(&weights_[offset3]);
+
+          __m128i sum0 = m128_dpbusd_epi32(input_vector[0], row0[0]);
+          __m128i sum1 = m128_dpbusd_epi32(input_vector[0], row1[0]);
+          __m128i sum2 = m128_dpbusd_epi32(input_vector[0], row2[0]);
+          __m128i sum3 = m128_dpbusd_epi32(input_vector[0], row3[0]);
+
+          for (int j = 1; j < (int)kNumChunks; ++j)
+          {
+            const __m128i in = input_vector[j];
+
+            sum0 = _mm_add_epi32(sum0, m128_dpbusd_epi32(in, row0[j]));
+            sum1 = _mm_add_epi32(sum1, m128_dpbusd_epi32(in, row1[j]));
+            sum2 = _mm_add_epi32(sum2, m128_dpbusd_epi32(in, row2[j]));
+            sum3 = _mm_add_epi32(sum3, m128_dpbusd_epi32(in, row3[j]));
+          }
+
+          *outptr = m128_haddx4(sum0, sum1, sum2, sum3, bias);
+        }
+      }
+      else if constexpr (kOutputDimensions == 1)
+      {
+        const auto row0 = reinterpret_cast<const __m128i*>(&weights_[0]);
+
+        __m128i sum0 = m128_dpbusd_epi32(input_vector[0], row0[0]);
+
+        for (int j = 1; j < (int)kNumChunks; ++j)
+          sum0 = _mm_add_epi32(sum0, m128_dpbusd_epi32(input_vector[j], row0[j]));
+
+        output[0] = m128_hadd(sum0, biases_[0]);
+      }
+      else
+      {
+        // This case can never happen because kOutputDimensions
+        // is always 1 or a multiple of kSimdWidth.
+        assert(false);
+      }
+
+#else
+
+// Use old implementation for the other architectures.
+
+      auto output = reinterpret_cast<OutputType*>(buffer);
+
+#if defined(USE_SSE2)
+      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+#ifndef USE_SSSE3
+      const __m128i kZeros = _mm_setzero_si128();
+#else
+      const __m128i kOnes = _mm_set1_epi16(1);
+#endif
+      const auto input_vector = reinterpret_cast<const __m128i*>(input);
+
+#elif defined(USE_MMX)
      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
      const __m64 kZeros = _mm_setzero_si64();
      const auto input_vector = reinterpret_cast<const __m64*>(input);

-  #elif defined(USE_NEON)
+#elif defined(USE_NEON)
      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
      const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
-  #endif
+#endif

      for (IndexType i = 0; i < kOutputDimensions; ++i) {
        const IndexType offset = i * kPaddedInputDimensions;

-  #if defined(USE_AVX512)
-        __m512i sum = _mm512_setzero_si512();
-        const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-  #if defined(USE_VNNI)
-            sum = _mm512_dpbusd_epi32(sum, _mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
-  #else
-            __m512i product = _mm512_maddubs_epi16(_mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
-            product = _mm512_madd_epi16(product, kOnes);
-            sum = _mm512_add_epi32(sum, product);
-  #endif
-        }
-
-        // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks.
-        // As a result kPaddedInputDimensions may not be an even multiple of 64(512bit)
-        // and we have to do one more 256bit chunk.
-        if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2)
-        {
-            const auto iv256  = reinterpret_cast<const __m256i*>(&input_vector[kNumChunks]);
-            const auto row256 = reinterpret_cast<const __m256i*>(&row[kNumChunks]);
-  #if defined(USE_VNNI)
-            __m256i product256 = _mm256_dpbusd_epi32(
-                _mm512_castsi512_si256(sum), _mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
-            sum = _mm512_inserti32x8(sum, product256, 0);
-  #else
-            __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
-            sum = _mm512_add_epi32(sum, _mm512_cvtepi16_epi32(product256));
-  #endif
-        }
-        output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
-
-  #elif defined(USE_AVX2)
-        __m256i sum = _mm256_setzero_si256();
-        const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-  #if defined(USE_VNNI)
-          sum = _mm256_dpbusd_epi32(sum, _mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
-  #else
-          __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
-          product = _mm256_madd_epi16(product, kOnes);
-          sum = _mm256_add_epi32(sum, product);
-  #endif
-        }
-        __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
-        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
-        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
-        output[i] = _mm_cvtsi128_si32(sum128) + biases_[i];
-
-  #elif defined(USE_SSSE3)
-        __m128i sum = _mm_setzero_si128();
-        const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
-        for (int j = 0; j < (int)kNumChunks - 1; j += 2) {
-          __m128i product0 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
-          product0 = _mm_madd_epi16(product0, kOnes);
-          sum = _mm_add_epi32(sum, product0);
-          __m128i product1 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j+1]), _mm_load_si128(&row[j+1]));
-          product1 = _mm_madd_epi16(product1, kOnes);
-          sum = _mm_add_epi32(sum, product1);
-        }
-        if (kNumChunks & 0x1) {
-          __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[kNumChunks-1]), _mm_load_si128(&row[kNumChunks-1]));
-          product = _mm_madd_epi16(product, kOnes);
-          sum = _mm_add_epi32(sum, product);
-        }
-        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
-        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
-        output[i] = _mm_cvtsi128_si32(sum) + biases_[i];
-
-  #elif defined(USE_SSE2)
+#if defined(USE_SSE2)
        __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]);
        __m128i sum_hi = kZeros;
        const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
@@ -223,7 +739,7 @@ namespace Eval::NNUE::Layers {
        sum = _mm_add_epi32(sum, sum_second_32);
        output[i] = _mm_cvtsi128_si32(sum);

-  #elif defined(USE_MMX)
+#elif defined(USE_MMX)
        __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]);
        __m64 sum_hi = kZeros;
        const auto row = reinterpret_cast<const __m64*>(&weights_[offset]);
@@ -244,7 +760,7 @@ namespace Eval::NNUE::Layers {
        sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
        output[i] = _mm_cvtsi64_si32(sum);

-  #elif defined(USE_NEON)
+#elif defined(USE_NEON)
        int32x4_t sum = {biases_[i]};
        const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
        for (IndexType j = 0; j < kNumChunks; ++j) {
@@ -254,18 +770,21 @@ namespace Eval::NNUE::Layers {
        }
        output[i] = sum[0] + sum[1] + sum[2] + sum[3];

-  #else
+#else
        OutputType sum = biases_[i];
        for (IndexType j = 0; j < kInputDimensions; ++j) {
          sum += weights_[offset + j] * input[j];
        }
        output[i] = sum;
-  #endif
+#endif

      }
-  #if defined(USE_MMX)
+#if defined(USE_MMX)
      _mm_empty();
-  #endif
+#endif
+
+#endif
+
      return output;
    }

@@ -23,6 +23,10 @@

 #include "../nnue_common.h"

+#include <string>
+#include <cstdint>
+#include <type_traits>
+
 namespace Eval::NNUE::Layers {

  // Clipped ReLU
@@ -47,6 +51,8 @@ namespace Eval::NNUE::Layers {
    static constexpr std::size_t kBufferSize =
        PreviousLayer::kBufferSize + kSelfBufferSize;

+    static constexpr int kLayerIndex = PreviousLayer::kLayerIndex + 1;
+
    // Hash value embedded in the evaluation file
    static constexpr std::uint32_t GetHashValue() {
      std::uint32_t hash_value = 0x538D24C7u;
@@ -54,11 +60,24 @@ namespace Eval::NNUE::Layers {
      return hash_value;
    }

+    static std::string get_name() {
+        return "ClippedReLU[" +
+            std::to_string(kOutputDimensions) + "]";
+    }
+
    // A string that represents the structure from the input layer to this layer
-    static std::string GetStructureString() {
-      return "ClippedReLU[" +
-        std::to_string(kOutputDimensions) + "](" +
-        PreviousLayer::GetStructureString() + ")";
+    static std::string get_structure_string() {
+        return get_name() + "(" +
+            PreviousLayer::get_structure_string() + ")";
+    }
+
+    static std::string get_layers_info() {
+        std::string info = PreviousLayer::get_layers_info();
+        info += "\n  - ";
+        info += std::to_string(kLayerIndex);
+        info += " - ";
+        info += get_name();
+        return info;
    }

    // Read network parameters
@@ -68,7 +87,7 @@ namespace Eval::NNUE::Layers {

    // write parameters
    bool WriteParameters(std::ostream& stream) const {
-      return previous_layer_.WriteParameters(stream);
+        return previous_layer_.WriteParameters(stream);
    }

    // Forward propagation
@@ -86,12 +105,12 @@ namespace Eval::NNUE::Layers {
      const auto out = reinterpret_cast<__m256i*>(output);
      for (IndexType i = 0; i < kNumChunks; ++i) {
        const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
-            _mm256_loadA_si256(&in[i * 4 + 0]),
-            _mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits);
+            _mm256_load_si256(&in[i * 4 + 0]),
+            _mm256_load_si256(&in[i * 4 + 1])), kWeightScaleBits);
        const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
-            _mm256_loadA_si256(&in[i * 4 + 2]),
-            _mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits);
-        _mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+            _mm256_load_si256(&in[i * 4 + 2]),
+            _mm256_load_si256(&in[i * 4 + 3])), kWeightScaleBits);
+        _mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
            _mm256_packs_epi16(words0, words1), kZero), kOffsets));
      }
      constexpr IndexType kStart = kNumChunks * kSimdWidth;
@@ -170,9 +189,9 @@ namespace Eval::NNUE::Layers {
    }

   private:
-     // Make the learning class a friend
-     friend class Trainer<ClippedReLU>;
-     
+    // Make the learning class a friend
+    friend class Trainer<ClippedReLU>;
+
    PreviousLayer previous_layer_;
  };

@@ -41,6 +41,8 @@ class InputSlice {
  // Size of forward propagation buffer used from the input layer to this layer
  static constexpr std::size_t kBufferSize = 0;

+  static constexpr int kLayerIndex = 1;
+
  // Hash value embedded in the evaluation file
  static constexpr std::uint32_t GetHashValue() {
    std::uint32_t hash_value = 0xEC42E90Du;
@@ -48,12 +50,24 @@ class InputSlice {
    return hash_value;
  }

-  // A string that represents the structure from the input layer to this layer
-  static std::string GetStructureString() {
-    return "InputSlice[" + std::to_string(kOutputDimensions) + "(" +
-      std::to_string(Offset) + ":" +
-      std::to_string(Offset + kOutputDimensions) + ")]";
-  }
+    static std::string get_name() {
+        return "InputSlice[" + std::to_string(kOutputDimensions) + "(" +
+            std::to_string(Offset) + ":" +
+            std::to_string(Offset + kOutputDimensions) + ")]";
+    }
+
+    // A string that represents the structure from the input layer to this layer
+    static std::string get_structure_string() {
+        return get_name();
+    }
+
+    static std::string get_layers_info() {
+        std::string info = "  - ";
+        info += std::to_string(kLayerIndex);
+        info += " - ";
+        info += get_name();
+        return info;
+    }

  // Read network parameters
  bool ReadParameters(std::istream& /*stream*/) {
@@ -62,7 +76,7 @@ class InputSlice {

  // write parameters
  bool WriteParameters(std::ostream& /*stream*/) const {
-    return true;
+      return true;
  }

  // Forward propagation
@@ -1,163 +1,196 @@
-// Definition of layer Sum of NNUE evaluation function
-
-#ifndef _NNUE_LAYERS_SUM_H_
+#ifndef _NNUE_LAYERS_SUM_H_
 #define _NNUE_LAYERS_SUM_H_

-#if defined(EVAL_NNUE)
+#include "nnue/nnue_common.h"

-#include "../nnue_common.h"
+// Definition of layer Sum of NNUE evaluation function
+namespace Eval::NNUE::Layers {

-namespace Eval {
+    // Layer that sums the output of multiple layers
+    template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
+    class Sum : public Sum<RemainingPreviousLayers...> {
+    private:
+        using Head = FirstPreviousLayer;
+        using Tail = Sum<RemainingPreviousLayers...>;

-namespace NNUE {
+     public:
+        // Input/output type
+        using InputType = typename Head::OutputType;

-namespace Layers {
+        using OutputType = InputType;

-// Layer that sums the output of multiple layers
-template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
-class Sum : public Sum<RemainingPreviousLayers...> {
- private:
-  using Head = FirstPreviousLayer;
-  using Tail = Sum<RemainingPreviousLayers...>;
+        static_assert(std::is_same<InputType, typename Tail::InputType>::value, "");

- public:
-  // Input/output type
-  using InputType = typename Head::OutputType;
-  using OutputType = InputType;
-  static_assert(std::is_same<InputType, typename Tail::InputType>::value, "");
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions = Head::kOutputDimensions;

-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions = Head::kOutputDimensions;
-  static constexpr IndexType kOutputDimensions = kInputDimensions;
-  static_assert(kInputDimensions == Tail::kInputDimensions ,"");
+        static constexpr IndexType kOutputDimensions = kInputDimensions;

-  // Size of forward propagation buffer used in this layer
-  static constexpr std::size_t kSelfBufferSize =
-      CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+        static_assert(kInputDimensions == Tail::kInputDimensions ,"");

-  // Size of the forward propagation buffer used from the input layer to this layer
-  static constexpr std::size_t kBufferSize =
-      std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize);
+        // Size of forward propagation buffer used in this layer
+        static constexpr std::size_t kSelfBufferSize =
+            CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);

-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t GetHashValue() {
-    std::uint32_t hash_value = 0xBCE400B4u;
-    hash_value ^= Head::GetHashValue() >> 1;
-    hash_value ^= Head::GetHashValue() << 31;
-    hash_value ^= Tail::GetHashValue() >> 2;
-    hash_value ^= Tail::GetHashValue() << 30;
-    return hash_value;
-  }
+        // Size of the forward propagation buffer used from the input layer to this layer
+        static constexpr std::size_t kBufferSize =
+            std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize);

-  // A string that represents the structure from the input layer to this layer
-  static std::string GetStructureString() {
-    return "Sum[" +
-        std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
-  }
+        static constexpr int kLayerIndex = Tail::kLayerIndex + 1;

-  // read parameters
-  bool ReadParameters(std::istream& stream) {
-    if (!Tail::ReadParameters(stream)) return false;
-    return previous_layer_.ReadParameters(stream);
-  }
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t GetHashValue() {
+            std::uint32_t hash_value = 0xBCE400B4u;
+            hash_value ^= Head::GetHashValue() >> 1;
+            hash_value ^= Head::GetHashValue() << 31;
+            hash_value ^= Tail::GetHashValue() >> 2;
+            hash_value ^= Tail::GetHashValue() << 30;
+            return hash_value;
+        }

-  // write parameters
-  bool WriteParameters(std::ostream& stream) const {
-    if (!Tail::WriteParameters(stream)) return false;
-    return previous_layer_.WriteParameters(stream);
-  }
+        static std::string get_name() {
+             return "Sum[" +
+                std::to_string(kOutputDimensions) + "]";
+        }

-  // forward propagation
-  const OutputType* Propagate(
-      const TransformedFeatureType* transformed_features, char* buffer) const {
-    Tail::Propagate(transformed_features, buffer);
-    const auto head_output = previous_layer_.Propagate(
-        transformed_features, buffer + kSelfBufferSize);
-    const auto output = reinterpret_cast<OutputType*>(buffer);
-    for (IndexType i = 0; i <kOutputDimensions; ++i) {
-      output[i] += head_output[i];
-    }
-    return output;
-  }
+        // A string that represents the structure from the input layer to this layer
+        static std::string get_structure_string() {
+            return get_name() + "(" + get_summands_string() + ")";
+        }

- protected:
-  // A string that represents the list of layers to be summed
-  static std::string GetSummandsString() {
-    return Head::GetStructureString() + "," + Tail::GetSummandsString();
-  }
+        static std::string get_layers_info() {
+            std::string info = Tail::get_layers_info();
+            info += "\n  - ";
+            info += std::to_string(kLayerIndex);
+            info += " - ";
+            info += get_name();
+            return info;
+        }

-  // Make the learning class a friend
-  friend class Trainer<Sum>;
+        // read parameters
+        bool ReadParameters(std::istream& stream) {
+            if (!Tail::ReadParameters(stream))
+                return false;

-  // the layer immediately before this layer
-  FirstPreviousLayer previous_layer_;
-};
+            return previous_layer_.ReadParameters(stream);
+        }

-// Layer that sums the output of multiple layers (when there is one template argument)
-template <typename PreviousLayer>
-class Sum<PreviousLayer> {
- public:
-  // Input/output type
-  using InputType = typename PreviousLayer::OutputType;
-  using OutputType = InputType;
+        // write parameters
+        bool WriteParameters(std::ostream& stream) const {
+            if (!Tail::WriteParameters(stream))
+                return false;

-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions =
-      PreviousLayer::kOutputDimensions;
-  static constexpr IndexType kOutputDimensions = kInputDimensions;
+            return previous_layer_.WriteParameters(stream);
+        }

-  // Size of the forward propagation buffer used from the input layer to this layer
-  static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize;
+        // forward propagation: adds this head layer's output onto the tail's sum
+        const OutputType* Propagate(
+            const TransformedFeatureType* transformed_features, char* buffer) const {

-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t GetHashValue() {
-    std::uint32_t hash_value = 0xBCE400B4u;
-    hash_value ^= PreviousLayer::GetHashValue() >> 1;
-    hash_value ^= PreviousLayer::GetHashValue() << 31;
-    return hash_value;
-  }
+            Tail::Propagate(transformed_features, buffer);  // return value ignored; `buffer` is re-read as the running sum below

-  // A string that represents the structure from the input layer to this layer
-  static std::string GetStructureString() {
-    return "Sum[" +
-        std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
-  }
+            const auto head_output = previous_layer_.Propagate(
+                transformed_features, buffer + kSelfBufferSize);  // head works past the first kSelfBufferSize bytes

-  // read parameters
-  bool ReadParameters(std::istream& stream) {
-    return previous_layer_.ReadParameters(stream);
-  }
+            const auto output = reinterpret_cast<OutputType*>(buffer);  // NOTE(review): assumes the tail left its result at the start of `buffer` - confirm

-  // write parameters
-  bool WriteParameters(std::ostream& stream) const {
-    return previous_layer_.WriteParameters(stream);
-  }
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                output[i] += head_output[i];
+            }

-  // forward propagation
-  const OutputType* Propagate(
-      const TransformedFeatureType* transformed_features, char* buffer) const {
-    return previous_layer_.Propagate(transformed_features, buffer);
-  }
+            return output;
+        }

- protected:
-  // A string that represents the list of layers to be summed
-  static std::string GetSummandsString() {
-    return PreviousLayer::GetStructureString();
-  }
+    protected:
+        // A string that represents the list of layers to be summed
+        static std::string get_summands_string() {
+            return Head::get_structure_string() + "," + Tail::get_summands_string();
+        }

-  // Make the learning class a friend
-  friend class Trainer<Sum>;
+        // Make the learning class a friend
+        friend class Trainer<Sum>;

-  // the layer immediately before this layer
-  PreviousLayer previous_layer_;
-};
+        // the layer immediately before this layer
+        FirstPreviousLayer previous_layer_;
+    };

-}  // namespace Layers
+    // Layer that sums the output of multiple layers (when there is one template argument)
+    template <typename PreviousLayer>
+    class Sum<PreviousLayer> {
+    public:
+        // Input/output type
+        using InputType = typename PreviousLayer::OutputType;

-}  // namespace NNUE
+        using OutputType = InputType;

-}  // namespace Eval
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions =
+            PreviousLayer::kOutputDimensions;

-#endif  // defined(EVAL_NNUE)
+        static constexpr IndexType kOutputDimensions = kInputDimensions;
+
+        // Size of the forward propagation buffer used from the input layer to this layer
+        static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize;
+
+        static constexpr int kLayerIndex = PreviousLayer::kLayerIndex + 1;
+
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t GetHashValue() {
+            std::uint32_t hash_value = 0xBCE400B4u;
+            hash_value ^= PreviousLayer::GetHashValue() >> 1;
+            hash_value ^= PreviousLayer::GetHashValue() << 31;
+            return hash_value;
+        }
+
+        static std::string get_name() {
+             return "Sum[" +
+                std::to_string(kOutputDimensions) + "]";
+        }
+
+        // A string that represents the structure from the input layer to this layer
+        static std::string get_structure_string() {
+            return get_name() + "(" + get_summands_string() + ")";
+        }
+
+        static std::string get_layers_info() {  // previous layer's listing plus one "  - <index> - <name>" line for this layer
+            std::string info = PreviousLayer::get_layers_info();
+            info += "\n  - ";  // same bullet prefix the variadic Sum and FeatureTransformer emit
+            info += std::to_string(kLayerIndex);
+            info += " - ";
+            info += get_name();
+            return info;
+        }
+
+        // read parameters
+        bool ReadParameters(std::istream& stream) {
+            return previous_layer_.ReadParameters(stream);
+        }
+
+        // write parameters
+        bool WriteParameters(std::ostream& stream) const {
+            return previous_layer_.WriteParameters(stream);
+        }
+
+        // forward propagation
+        const OutputType* Propagate(
+            const TransformedFeatureType* transformed_features, char* buffer) const {
+
+            return previous_layer_.Propagate(transformed_features, buffer);
+        }
+
+    protected:
+        // A string that represents the list of layers to be summed
+        static std::string get_summands_string() {
+            return PreviousLayer::get_structure_string();
+        }
+
+        // Make the learning class a friend
+        friend class Trainer<Sum>;
+
+        // the layer immediately before this layer
+        PreviousLayer previous_layer_;
+    };
+
+}  // namespace Eval::NNUE::Layers

 #endif
@@ -27,11 +27,8 @@ namespace Eval::NNUE {

  // Class that holds the result of affine transformation of input features
  struct alignas(kCacheLineSize) Accumulator {
-    std::int16_t
-        accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
-    Value score;
-    bool computed_accumulation;
-    bool computed_score;
+      std::int16_t accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];  // indexed [perspective][refresh trigger][dimension]
+      bool computed_accumulation;  // set to true by the feature transformer once `accumulation` has been refreshed or updated
  };

 }  // namespace Eval::NNUE
@@ -21,6 +21,8 @@
 #ifndef NNUE_COMMON_H_INCLUDED
 #define NNUE_COMMON_H_INCLUDED

+#include "types.h"
+
 #include <cstring>
 #include <iostream>

@@ -43,29 +45,6 @@
 #include <arm_neon.h>
 #endif

-// HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Otherwise a binary
-//       compiled with older g++ crashes because the output memory is not aligned
-//       even though alignas is specified.
-#if defined(USE_AVX2)
-#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__)
-#define _mm256_loadA_si256  _mm256_loadu_si256
-#define _mm256_storeA_si256 _mm256_storeu_si256
-#else
-#define _mm256_loadA_si256  _mm256_load_si256
-#define _mm256_storeA_si256 _mm256_store_si256
-#endif
-#endif
-
-#if defined(USE_AVX512)
-#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__)
-#define _mm512_loadA_si512   _mm512_loadu_si512
-#define _mm512_storeA_si512  _mm512_storeu_si512
-#else
-#define _mm512_loadA_si512   _mm512_load_si512
-#define _mm512_storeA_si512  _mm512_store_si512
-#endif
-#endif
-
 namespace Eval::NNUE {

  // Version of the evaluation file
@@ -113,7 +92,7 @@ namespace Eval::NNUE {
    PS_END2     = 12 * SQUARE_NB + 1
  };

-  extern uint32_t kpp_board_index[PIECE_NB][COLOR_NB];
+  extern const uint32_t kpp_board_index[PIECE_NB][COLOR_NB];

  // Type of input feature after conversion
  using TransformedFeatureType = std::uint8_t;
@@ -25,10 +25,66 @@
 #include "nnue_architecture.h"
 #include "features/index_list.h"

-#include <cstring> // std::memset()
+#include <cstring>
+#include <string>

 namespace Eval::NNUE {

+  // When a SIMD instruction set is selected, VECTOR stays defined and the
+  // accumulator is refreshed/updated tile by tile, each tile sized so that
+  // it fits in kNumRegs registers of type vec_t (see kTileHeight).
+  #define VECTOR
+
+  #if defined(USE_AVX512)
+  typedef __m512i vec_t;
+  #define vec_load(a) _mm512_load_si512(a)
+  #define vec_store(a,b) _mm512_store_si512(a,b)
+  #define vec_add_16(a,b) _mm512_add_epi16(a,b)
+  #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
+  #define vec_zero _mm512_setzero_si512()
+  static constexpr IndexType kNumRegs = 8; // only 8 are needed
+
+  #elif defined(USE_AVX2)
+  typedef __m256i vec_t;
+  #define vec_load(a) _mm256_load_si256(a)
+  #define vec_store(a,b) _mm256_store_si256(a,b)
+  #define vec_add_16(a,b) _mm256_add_epi16(a,b)
+  #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
+  #define vec_zero _mm256_setzero_si256()
+  static constexpr IndexType kNumRegs = 16;
+
+  #elif defined(USE_SSE2)
+  typedef __m128i vec_t;
+  #define vec_load(a) (*(a))
+  #define vec_store(a,b) *(a)=(b)
+  #define vec_add_16(a,b) _mm_add_epi16(a,b)
+  #define vec_sub_16(a,b) _mm_sub_epi16(a,b)
+  #define vec_zero _mm_setzero_si128()
+  static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;
+
+  #elif defined(USE_MMX)
+  typedef __m64 vec_t;
+  #define vec_load(a) (*(a))
+  #define vec_store(a,b) *(a)=(b)
+  #define vec_add_16(a,b) _mm_add_pi16(a,b)
+  #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
+  #define vec_zero _mm_setzero_si64()
+  static constexpr IndexType kNumRegs = 8;
+
+  #elif defined(USE_NEON)
+  typedef int16x8_t vec_t;
+  #define vec_load(a) (*(a))
+  #define vec_store(a,b) *(a)=(b)
+  #define vec_add_16(a,b) vaddq_s16(a,b)
+  #define vec_sub_16(a,b) vsubq_s16(a,b)
+  #define vec_zero {0}
+  static constexpr IndexType kNumRegs = 16;
+
+  #else // no SIMD macro set: code guarded by #ifdef VECTOR takes its scalar #else branch
+  #undef VECTOR
+
+  #endif
+
  // Input feature converter
  class FeatureTransformer {

@@ -36,6 +92,11 @@ namespace Eval::NNUE {
    // Number of output dimensions for one side
    static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;

+    #ifdef VECTOR
+    static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;
+    static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
+    #endif
+
   public:
    // Output type
    using OutputType = TransformedFeatureType;
@@ -48,20 +109,36 @@ namespace Eval::NNUE {
    static constexpr std::size_t kBufferSize =
        kOutputDimensions * sizeof(OutputType);

+    static constexpr int kLayerIndex = 0;
+
    // Hash value embedded in the evaluation file
    static constexpr std::uint32_t GetHashValue() {
+
      return RawFeatures::kHashValue ^ kOutputDimensions;
    }

+    static std::string get_name() {
+      return RawFeatures::get_name() + "[" +
+          std::to_string(kInputDimensions) + "->" +
+          std::to_string(kHalfDimensions) + "x2]";
+    }
+
    // a string representing the structure
-    static std::string GetStructureString() {
-      return RawFeatures::GetName() + "[" +
-        std::to_string(kInputDimensions) + "->" +
-        std::to_string(kHalfDimensions) + "x2]";
+    static std::string get_structure_string() {
+      return get_name();
+    }
+
+    static std::string get_layers_info() {
+      std::string info = "  - ";
+      info += std::to_string(kLayerIndex);
+      info += " - ";
+      info += get_name();
+      return info;
    }

    // Read network parameters
    bool ReadParameters(std::istream& stream) {
+
      for (std::size_t i = 0; i < kHalfDimensions; ++i)
        biases_[i] = read_little_endian<BiasType>(stream);
      for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
@@ -72,34 +149,45 @@ namespace Eval::NNUE {
    // write parameters
    bool WriteParameters(std::ostream& stream) const {
      stream.write(reinterpret_cast<const char*>(biases_),
-        kHalfDimensions * sizeof(BiasType));
+          kHalfDimensions * sizeof(BiasType));
+
      stream.write(reinterpret_cast<const char*>(weights_),
-        kHalfDimensions * kInputDimensions * sizeof(WeightType));
+          kHalfDimensions * kInputDimensions * sizeof(WeightType));
+
      return !stream.fail();
    }

    // Proceed with the difference calculation if possible
-    bool UpdateAccumulatorIfPossible(const Position& pos) const {
+    bool update_accumulator_if_possible(const Position& pos) const {
+
      const auto now = pos.state();
-      if (now->accumulator.computed_accumulation) {
+      if (now->accumulator.computed_accumulation)
        return true;
-      }
+
      const auto prev = now->previous;
      if (prev && prev->accumulator.computed_accumulation) {
-        UpdateAccumulator(pos);
+        update_accumulator(pos);
        return true;
      }
+
      return false;
    }

    // Convert input features
-    void Transform(const Position& pos, OutputType* output, bool refresh) const {
-      if (refresh || !UpdateAccumulatorIfPossible(pos)) {
-        RefreshAccumulator(pos);
-      }
+    void Transform(const Position& pos, OutputType* output) const {
+
+      if (!update_accumulator_if_possible(pos))
+        refresh_accumulator(pos);
+
      const auto& accumulation = pos.state()->accumulator.accumulation;

-  #if defined(USE_AVX2)
+  #if defined(USE_AVX512)
+      constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth * 2);
+      static_assert(kHalfDimensions % (kSimdWidth * 2) == 0);
+      const __m512i kControl = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
+      const __m512i kZero = _mm512_setzero_si512();
+
+  #elif defined(USE_AVX2)
      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
      constexpr int kControl = 0b11011000;
      const __m256i kZero = _mm256_setzero_si256();
@@ -126,14 +214,39 @@ namespace Eval::NNUE {
      for (IndexType p = 0; p < 2; ++p) {
        const IndexType offset = kHalfDimensions * p;

-  #if defined(USE_AVX2)
+  #if defined(USE_AVX512)
+        auto out = reinterpret_cast<__m512i*>(&output[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m512i sum0 = _mm512_load_si512(
+              &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
+          __m512i sum1 = _mm512_load_si512(
+              &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+              sum0 = _mm512_add_epi16(sum0, reinterpret_cast<const __m512i*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 0]);
+              sum1 = _mm512_add_epi16(sum1, reinterpret_cast<const __m512i*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 1]);
+          }
+
+          _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(kControl,
+              _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), kZero)));
+        }
+
+  #elif defined(USE_AVX2)
        auto out = reinterpret_cast<__m256i*>(&output[offset]);
        for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m256i sum0 = _mm256_loadA_si256(
+          __m256i sum0 = _mm256_load_si256(
              &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
-          __m256i sum1 = _mm256_loadA_si256(
-            &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
-          _mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
+          __m256i sum1 = _mm256_load_si256(
+              &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+              sum0 = _mm256_add_epi16(sum0, reinterpret_cast<const __m256i*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 0]);
+              sum1 = _mm256_add_epi16(sum1, reinterpret_cast<const __m256i*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 1]);
+          }
+
+          _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
              _mm256_packs_epi16(sum0, sum1), kZero), kControl));
        }

@@ -144,14 +257,21 @@ namespace Eval::NNUE {
              accumulation[perspectives[p]][0])[j * 2 + 0]);
          __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
              accumulation[perspectives[p]][0])[j * 2 + 1]);
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+            sum0 = _mm_add_epi16(sum0, reinterpret_cast<const __m128i*>(
+                accumulation[perspectives[p]][i])[j * 2 + 0]);
+            sum1 = _mm_add_epi16(sum1, reinterpret_cast<const __m128i*>(
+                accumulation[perspectives[p]][i])[j * 2 + 1]);
+          }
+
      const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);

          _mm_store_si128(&out[j],

  #ifdef USE_SSE41
-            _mm_max_epi8(packedbytes, kZero)
+              _mm_max_epi8(packedbytes, kZero)
  #else
-            _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
+              _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
  #endif

          );
@@ -164,6 +284,13 @@ namespace Eval::NNUE {
              accumulation[perspectives[p]][0])[j * 2 + 0]);
          __m64 sum1 = *(&reinterpret_cast<const __m64*>(
              accumulation[perspectives[p]][0])[j * 2 + 1]);
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+              sum0 = _mm_add_pi16(sum0, reinterpret_cast<const __m64*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 0]);
+              sum1 = _mm_add_pi16(sum1, reinterpret_cast<const __m64*>(
+                  accumulation[perspectives[p]][i])[j * 2 + 1]);
+          }
+
          const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
          out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
        }
@@ -173,12 +300,22 @@ namespace Eval::NNUE {
        for (IndexType j = 0; j < kNumChunks; ++j) {
          int16x8_t sum = reinterpret_cast<const int16x8_t*>(
              accumulation[perspectives[p]][0])[j];
+
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+              sum = vaddq_s16(sum, reinterpret_cast<const int16x8_t*>(
+                  accumulation[perspectives[p]][i])[j]);
+          }
+
          out[j] = vmax_s8(vqmovn_s16(sum), kZero);
        }

  #else
        for (IndexType j = 0; j < kHalfDimensions; ++j) {
          BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
+          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+              sum += accumulation[static_cast<int>(perspectives[p])][i][j];
+          }
+
          output[offset + j] = static_cast<OutputType>(
              std::max<int>(0, std::min<int>(127, sum)));
        }
@@ -192,108 +329,150 @@ namespace Eval::NNUE {

   private:
    // Calculate cumulative value without using difference calculation
-    void RefreshAccumulator(const Position& pos) const {
+    void refresh_accumulator(const Position& pos) const {
+
+  #ifdef VECTOR
+      // Gcc-10.2 unnecessarily spills AVX2 registers if this array
+      // is defined in the VECTOR code below, once in each branch
+      vec_t acc[kNumRegs];
+  #endif
      auto& accumulator = pos.state()->accumulator;
-      IndexType i = 0;
-      Features::IndexList active_indices[2];
-      RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
-                                       active_indices);
-      for (Color perspective : { WHITE, BLACK }) {
-        std::memcpy(accumulator.accumulation[perspective][i], biases_,
-                   kHalfDimensions * sizeof(BiasType));
-        for (const auto index : active_indices[perspective]) {
-          const IndexType offset = kHalfDimensions * index;
-  #if defined(USE_AVX512)
-          auto accumulation = reinterpret_cast<__m512i*>(
-              &accumulator.accumulation[perspective][i][0]);
-          auto column = reinterpret_cast<const __m512i*>(&weights_[offset]);
-          constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
-          for (IndexType j = 0; j < kNumChunks; ++j)
-            _mm512_storeA_si512(&accumulation[j], _mm512_add_epi16(_mm512_loadA_si512(&accumulation[j]), column[j]));
+      for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+        Features::IndexList active_indices[2];
+        RawFeatures::append_active_indices(pos, kRefreshTriggers[i],
+                                           active_indices);
+          for (Color perspective : { WHITE, BLACK }) {
+#ifdef VECTOR
+            for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
+              auto accTile = reinterpret_cast<vec_t*>(
+                  &accumulator.accumulation[perspective][i][j * kTileHeight]);

-  #elif defined(USE_AVX2)
-          auto accumulation = reinterpret_cast<__m256i*>(
-              &accumulator.accumulation[perspective][i][0]);
-          auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j)
-            _mm256_storeA_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadA_si256(&accumulation[j]), column[j]));
+              if (i == 0) {
+                auto biasesTile = reinterpret_cast<const vec_t*>(
+                    &biases_[j * kTileHeight]);
+                for (IndexType k = 0; k < kNumRegs; ++k)
+                  acc[k] = biasesTile[k];
+              } else {
+                for (IndexType k = 0; k < kNumRegs; ++k)
+                  acc[k] = vec_zero;
+              }

-  #elif defined(USE_SSE2)
-          auto accumulation = reinterpret_cast<__m128i*>(
-              &accumulator.accumulation[perspective][i][0]);
-          auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j)
-            accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
+              for (const auto index : active_indices[perspective]) {
+                const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+                auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);

-  #elif defined(USE_MMX)
-          auto accumulation = reinterpret_cast<__m64*>(
-              &accumulator.accumulation[perspective][i][0]);
-          auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j) {
-            accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
+                for (IndexType k = 0; k < kNumRegs; ++k)
+                  acc[k] = vec_add_16(acc[k], column[k]);
+              }
+
+              for (IndexType k = 0; k < kNumRegs; k++)
+                vec_store(&accTile[k], acc[k]);
+            }
+#else
+            if (i == 0) {
+              std::memcpy(accumulator.accumulation[perspective][i], biases_,
+                          kHalfDimensions * sizeof(BiasType));
+            } else {
+              std::memset(accumulator.accumulation[perspective][i], 0,
+                          kHalfDimensions * sizeof(BiasType));
+            }
+
+            for (const auto index : active_indices[perspective]) {
+              const IndexType offset = kHalfDimensions * index;
+
+              for (IndexType j = 0; j < kHalfDimensions; ++j)
+                accumulator.accumulation[perspective][i][j] += weights_[offset + j];
+            }
+#endif
          }

-  #elif defined(USE_NEON)
-          auto accumulation = reinterpret_cast<int16x8_t*>(
-              &accumulator.accumulation[perspective][i][0]);
-          auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j)
-            accumulation[j] = vaddq_s16(accumulation[j], column[j]);
-
-  #else
-          for (IndexType j = 0; j < kHalfDimensions; ++j)
-            accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-  #endif
-
        }
-      }
-  #if defined(USE_MMX)
-      _mm_empty();
-  #endif

-      accumulator.computed_accumulation = true;
-      accumulator.computed_score = false;
+#if defined(USE_MMX)
+        _mm_empty();
+#endif
+
+        accumulator.computed_accumulation = true;
    }

    // Calculate cumulative value using difference calculation
-    void UpdateAccumulator(const Position& pos) const {
-      const auto prev_accumulator = pos.state()->previous->accumulator;
-      auto& accumulator = pos.state()->accumulator;
-      IndexType i = 0;
+    void update_accumulator(const Position& pos) const {
+
+  #ifdef VECTOR
+      // Gcc-10.2 unnecessarily spills AVX2 registers if this array
+      // is defined in the VECTOR code below, once in each branch
+      vec_t acc[kNumRegs];
+  #endif
+    const auto& prev_accumulator = pos.state()->previous->accumulator;
+    auto& accumulator = pos.state()->accumulator;
+    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
      Features::IndexList removed_indices[2], added_indices[2];
-      bool reset[2];
-      RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
-                                        removed_indices, added_indices, reset);
+      bool reset[2] = { false, false };
+      RawFeatures::append_changed_indices(pos, kRefreshTriggers[i],
+                                          removed_indices, added_indices, reset);
+
+#ifdef VECTOR
+      for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
+        for (Color perspective : { WHITE, BLACK }) {
+          auto accTile = reinterpret_cast<vec_t*>(
+              &accumulator.accumulation[perspective][i][j * kTileHeight]);
+
+          if (reset[perspective]) {
+            if (i == 0) {
+              auto biasesTile = reinterpret_cast<const vec_t*>(
+                  &biases_[j * kTileHeight]);
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = biasesTile[k];
+            } else {
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = vec_zero;
+            }
+          } else {
+            auto prevAccTile = reinterpret_cast<const vec_t*>(
+                &prev_accumulator.accumulation[perspective][i][j * kTileHeight]);
+
+            for (IndexType k = 0; k < kNumRegs; ++k)
+              acc[k] = vec_load(&prevAccTile[k]);
+
+            // Difference calculation for the deactivated features
+            for (const auto index : removed_indices[perspective]) {
+              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = vec_sub_16(acc[k], column[k]);
+            }
+          }
+
+          { // Difference calculation for the activated features
+            for (const auto index : added_indices[perspective]) {
+              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = vec_add_16(acc[k], column[k]);
+            }
+          }
+
+          for (IndexType k = 0; k < kNumRegs; ++k)
+            vec_store(&accTile[k], acc[k]);
+        }
+      }
+#if defined(USE_MMX)
+      _mm_empty();
+#endif
+
+#else
      for (Color perspective : { WHITE, BLACK }) {

-  #if defined(USE_AVX2)
-        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-        auto accumulation = reinterpret_cast<__m256i*>(
-            &accumulator.accumulation[perspective][i][0]);
-
-  #elif defined(USE_SSE2)
-        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-        auto accumulation = reinterpret_cast<__m128i*>(
-            &accumulator.accumulation[perspective][i][0]);
-
-  #elif defined(USE_MMX)
-        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-        auto accumulation = reinterpret_cast<__m64*>(
-            &accumulator.accumulation[perspective][i][0]);
-
-  #elif defined(USE_NEON)
-        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-        auto accumulation = reinterpret_cast<int16x8_t*>(
-            &accumulator.accumulation[perspective][i][0]);
-  #endif
-
        if (reset[perspective]) {
-          std::memcpy(accumulator.accumulation[perspective][i], biases_,
-                      kHalfDimensions * sizeof(BiasType));
+          if (i == 0) {
+            std::memcpy(accumulator.accumulation[perspective][i], biases_,
+                        kHalfDimensions * sizeof(BiasType));
+          } else {
+            std::memset(accumulator.accumulation[perspective][i], 0,
+                        kHalfDimensions * sizeof(BiasType));
+          }
        } else {
          std::memcpy(accumulator.accumulation[perspective][i],
                      prev_accumulator.accumulation[perspective][i],
@@ -302,83 +481,22 @@ namespace Eval::NNUE {
          for (const auto index : removed_indices[perspective]) {
            const IndexType offset = kHalfDimensions * index;

-  #if defined(USE_AVX2)
-            auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
-              accumulation[j] = _mm256_sub_epi16(accumulation[j], column[j]);
-            }
-
-  #elif defined(USE_SSE2)
-            auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
-              accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
-            }
-
-  #elif defined(USE_MMX)
-            auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
-              accumulation[j] = _mm_sub_pi16(accumulation[j], column[j]);
-            }
-
-  #elif defined(USE_NEON)
-            auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
-              accumulation[j] = vsubq_s16(accumulation[j], column[j]);
-            }
-
-  #else
-            for (IndexType j = 0; j < kHalfDimensions; ++j) {
-              accumulator.accumulation[perspective][i][j] -=
-                  weights_[offset + j];
-            }
-  #endif
-
+            for (IndexType j = 0; j < kHalfDimensions; ++j)
+              accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
          }
        }
        { // Difference calculation for the activated features
          for (const auto index : added_indices[perspective]) {
            const IndexType offset = kHalfDimensions * index;

-  #if defined(USE_AVX2)
-            auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
-              accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
-            }
-
-  #elif defined(USE_SSE2)
-            auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
-              accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
-            }
-
-  #elif defined(USE_MMX)
-            auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
-              accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
-            }
-
-  #elif defined(USE_NEON)
-            auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
-              accumulation[j] = vaddq_s16(accumulation[j], column[j]);
-            }
-
-  #else
-            for (IndexType j = 0; j < kHalfDimensions; ++j) {
-              accumulator.accumulation[perspective][i][j] +=
-                  weights_[offset + j];
-            }
-  #endif
-
+            for (IndexType j = 0; j < kHalfDimensions; ++j)
+              accumulator.accumulation[perspective][i][j] += weights_[offset + j];
          }
        }
      }
-  #if defined(USE_MMX)
-      _mm_empty();
-  #endif
-
+#endif
+      }
      accumulator.computed_accumulation = true;
-      accumulator.computed_score = false;
    }

    using BiasType = std::int16_t;
@@ -1,201 +1,215 @@
-// USI extended command for NNUE evaluation function
-
-#if defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
-
-#include "../thread.h"
-#include "../uci.h"
-#include "evaluate_nnue.h"
+#include "evaluate_nnue.h"
 #include "nnue_test_command.h"

+#include "thread.h"
+#include "uci.h"
+
 #include <set>
 #include <fstream>

-#define ASSERT(X) { if (!(X)) { std::cout << "\nError : ASSERT(" << #X << "), " << __FILE__ << "(" << __LINE__ << "): " << __func__ << std::endl; \
- std::this_thread::sleep_for(std::chrono::microseconds(3000)); *(int*)1 =0;} }
-
-namespace Eval {
-
-namespace NNUE {
-
-namespace {
-
-// Testing RawFeatures mainly for difference calculation
-void TestFeatures(Position& pos) {
-  const std::uint64_t num_games = 1000;
-  StateInfo si;
-  pos.set(StartFEN, false, &si, Threads.main());
-  const int MAX_PLY = 256; // test up to 256 hands
-
-  StateInfo state[MAX_PLY]; // StateInfo only for the maximum number of steps
-  int ply; // Trouble from the initial phase
-
-  PRNG prng(20171128);
-
-  std::uint64_t num_moves = 0;
-  std::vector<std::uint64_t> num_updates(kRefreshTriggers.size() + 1);
-  std::vector<std::uint64_t> num_resets(kRefreshTriggers.size());
-  constexpr IndexType kUnknown = -1;
-  std::vector<IndexType> trigger_map(RawFeatures::kDimensions, kUnknown);
-  auto make_index_sets = [&](const Position& pos) {
-    std::vector<std::vector<std::set<IndexType>>> index_sets(
-        kRefreshTriggers.size(), std::vector<std::set<IndexType>>(2));
-    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-      Features::IndexList active_indices[2];
-      RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
-                                       active_indices);
-      for (const auto perspective : Colors) {
-        for (const auto index : active_indices[perspective]) {
-          ASSERT(index < RawFeatures::kDimensions);
-          ASSERT(index_sets[i][perspective].count(index) == 0);
-          ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
-          index_sets[i][perspective].insert(index);
-          trigger_map[index] = i;
-        }
-      }
-    }
-    return index_sets;
-  };
-  auto update_index_sets = [&](const Position& pos, auto* index_sets) {
-    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-      Features::IndexList removed_indices[2], added_indices[2];
-      bool reset[2];
-      RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
-                                        removed_indices, added_indices, reset);
-      for (const auto perspective : Colors) {
-        if (reset[perspective]) {
-          (*index_sets)[i][perspective].clear();
-          ++num_resets[i];
-        } else {
-          for (const auto index : removed_indices[perspective]) {
-            ASSERT(index < RawFeatures::kDimensions);
-            ASSERT((*index_sets)[i][perspective].count(index) == 1);
-            ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
-            (*index_sets)[i][perspective].erase(index);
-            ++num_updates.back();
-            ++num_updates[i];
-            trigger_map[index] = i;
-          }
-        }
-        for (const auto index : added_indices[perspective]) {
-          ASSERT(index < RawFeatures::kDimensions);
-          ASSERT((*index_sets)[i][perspective].count(index) == 0);
-          ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
-          (*index_sets)[i][perspective].insert(index);
-          ++num_updates.back();
-          ++num_updates[i];
-          trigger_map[index] = i;
-        }
-      }
-    }
-  };
-
-  std::cout << "feature set: " << RawFeatures::GetName()
-            << "[" << RawFeatures::kDimensions << "]" << std::endl;
-  std::cout << "start testing with random games";
-
-  for (std::uint64_t i = 0; i < num_games; ++i) {
-    auto index_sets = make_index_sets(pos);
-    for (ply = 0; ply < MAX_PLY; ++ply) {
-      MoveList<LEGAL> mg(pos); // Generate all legal hands
-
-      // There was no legal move == Clog
-      if (mg.size() == 0)
-        break;
-
-      // Randomly choose from the generated moves and advance the phase with the moves.
-      Move m = mg.begin()[prng.rand(mg.size())];
-      pos.do_move(m, state[ply]);
-
-      ++num_moves;
-      update_index_sets(pos, &index_sets);
-      ASSERT(index_sets == make_index_sets(pos));
-    }
-
-    pos.set(StartFEN, false, &si, Threads.main());
-
-    // Output'.' every 100 times (so you can see that it's progressing)
-    if ((i % 100) == 0)
-      std::cout << "." << std::flush;
-  }
-  std::cout << "passed." << std::endl;
-  std::cout << num_games << " games, " << num_moves << " moves, "
-            << num_updates.back() << " updates, "
-            << (1.0 * num_updates.back() / num_moves)
-            << " updates per move" << std::endl;
-  std::size_t num_observed_indices = 0;
-  for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-    const auto count = std::count(trigger_map.begin(), trigger_map.end(), i);
-    num_observed_indices += count;
-    std::cout << "TriggerEvent(" << static_cast<int>(kRefreshTriggers[i])
-              << "): " << count << " features ("
-              << (100.0 * count / RawFeatures::kDimensions) << "%), "
-              << num_updates[i] << " updates ("
-              << (1.0 * num_updates[i] / num_moves) << " per move), "
-              << num_resets[i] << " resets ("
-              << (100.0 * num_resets[i] / num_moves) << "%)"
-              << std::endl;
-  }
-  std::cout << "observed " << num_observed_indices << " ("
-            << (100.0 * num_observed_indices / RawFeatures::kDimensions)
-            << "% of " << RawFeatures::kDimensions
-            << ") features" << std::endl;
+#define ASSERT(X) { /* test-only check: when X is false, print the expression and its location, then fault */ \
+    if (!(X)) { \
+        std::cout \
+            << "\nError : ASSERT(" << #X << "), " \
+            << __FILE__ << "(" << __LINE__ << "): " \
+            << __func__ << std::endl; \
+            std::this_thread::sleep_for(std::chrono::microseconds(3000)); /* brief pause after the flushed message */ \
+            *(int*)1 =0; /* write through invalid address 1 - presumably meant to crash the process right here */ \
+    } \
 }

-// Output a string that represents the structure of the evaluation function
-void PrintInfo(std::istream& stream) {
-  std::cout << "network architecture: " << GetArchitectureString() << std::endl;
-
-  while (true) {
-    std::string file_name;
-    stream >> file_name;
-    if (file_name.empty()) break;
-
-    std::uint32_t hash_value;
-    std::string architecture;
-    const bool success = [&]() {
-      std::ifstream file_stream(file_name, std::ios::binary);
-      if (!file_stream) return false;
-      if (!ReadHeader(file_stream, &hash_value, &architecture)) return false;
-      return true;
-    }();
-
-    std::cout << file_name << ": ";
-    if (success) {
-      if (hash_value == kHashValue) {
-        std::cout << "matches with this binary";
-        if (architecture != GetArchitectureString()) {
-          std::cout << ", but architecture string differs: " << architecture;
-        }
-        std::cout << std::endl;
-      } else {
-        std::cout << architecture << std::endl;
-      }
-    } else {
-      std::cout << "failed to read header" << std::endl;
-    }
-  }
-}
-
-}  // namespace
-
 // USI extended command for NNUE evaluation function
-void TestCommand(Position& pos, std::istream& stream) {
-  std::string sub_command;
-  stream >> sub_command;
+namespace Eval::NNUE {

-  if (sub_command == "test_features") {
-    TestFeatures(pos);
-  } else if (sub_command == "info") {
-    PrintInfo(stream);
-  } else {
-    std::cout << "usage:" << std::endl;
-    std::cout << " test nnue test_features" << std::endl;
-    std::cout << " test nnue info [path/to/" << fileName << "...]" << std::endl;
-  }
-}
+    namespace {

-}  // namespace NNUE
+        // Checks RawFeatures' incremental index updates against a from-scratch rebuild over random games, then prints statistics
+        void test_features(Position& pos) {
+            const std::uint64_t num_games = 1000; // number of random games to play
+            StateInfo si;
+            pos.set(StartFEN, false, &si, Threads.main()); // begin from StartFEN
+            const int MAX_PLY = 256; // each random game is cut off after 256 plies

-}  // namespace Eval
+            StateInfo state[MAX_PLY]; // one StateInfo per ply, handed to do_move()
+            int ply; // ply counter within the current game

-#endif  // defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+            PRNG prng(20171128); // fixed seed
+
+            std::uint64_t num_moves = 0;
+            std::vector<std::uint64_t> num_updates(kRefreshTriggers.size() + 1); // per trigger; the extra last slot counts all updates
+            std::vector<std::uint64_t> num_resets(kRefreshTriggers.size()); // per trigger
+            constexpr IndexType kUnknown = -1; // marks a feature index that has not been observed yet
+            std::vector<IndexType> trigger_map(RawFeatures::kDimensions, kUnknown); // feature index -> index of the trigger it was seen under
+
+            auto make_index_sets = [&](const Position& position) { // builds the active index sets from scratch: [trigger][perspective]
+                std::vector<std::vector<std::set<IndexType>>> index_sets(
+                    kRefreshTriggers.size(), std::vector<std::set<IndexType>>(2));
+
+                for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+                    Features::IndexList active_indices[2];
+                    RawFeatures::append_active_indices(position, kRefreshTriggers[i],
+                                                     active_indices);
+
+                    for (const auto perspective : Colors) {
+                        for (const auto index : active_indices[perspective]) {
+                            ASSERT(index < RawFeatures::kDimensions); // index is in range
+                            ASSERT(index_sets[i][perspective].count(index) == 0); // no index is reported twice
+                            ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i); // an index is only ever seen under one trigger
+                            index_sets[i][perspective].insert(index);
+                            trigger_map[index] = i;
+                        }
+                    }
+                }
+
+                return index_sets;
+            };
+
+            auto update_index_sets = [&](const Position& position, auto* index_sets) { // applies the reported changes to *index_sets in place
+                for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+                    Features::IndexList removed_indices[2], added_indices[2];
+                    bool reset[2] = { false, false };
+                    RawFeatures::append_changed_indices(position, kRefreshTriggers[i],
+                                                      removed_indices, added_indices, reset);
+                    for (const auto perspective : Colors) {
+                        if (reset[perspective]) { // reset: drop the whole set; removed_indices are not consulted
+                            (*index_sets)[i][perspective].clear();
+                            ++num_resets[i];
+                        } else {
+                            for (const auto index : removed_indices[perspective]) {
+                                ASSERT(index < RawFeatures::kDimensions);
+                                ASSERT((*index_sets)[i][perspective].count(index) == 1); // a removed index must currently be present
+                                ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
+                                (*index_sets)[i][perspective].erase(index);
+                                ++num_updates.back();
+                                ++num_updates[i];
+                                trigger_map[index] = i;
+                            }
+                        }
+
+                        for (const auto index : added_indices[perspective]) { // added indices are applied in both the reset and the non-reset case
+                            ASSERT(index < RawFeatures::kDimensions);
+                            ASSERT((*index_sets)[i][perspective].count(index) == 0); // an added index must not be present yet
+                            ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
+                            (*index_sets)[i][perspective].insert(index);
+                            ++num_updates.back();
+                            ++num_updates[i];
+                            trigger_map[index] = i;
+                        }
+                    }
+                }
+            };
+
+            std::cout << "feature set: " << RawFeatures::get_name()
+                      << "[" << RawFeatures::kDimensions << "]" << std::endl;
+            std::cout << "start testing with random games";
+
+            for (std::uint64_t i = 0; i < num_games; ++i) {
+                auto index_sets = make_index_sets(pos);
+                for (ply = 0; ply < MAX_PLY; ++ply) {
+                    MoveList<LEGAL> mg(pos); // Generate all legal moves
+
+                    // No legal move is left, so this game ends here
+                    if (mg.size() == 0)
+                        break;
+
+                    // Pick one of the generated moves at random and play it on the board.
+                    Move m = mg.begin()[prng.rand(mg.size())];
+                    pos.do_move(m, state[ply]);
+
+                    ++num_moves;
+                    update_index_sets(pos, &index_sets);
+                    ASSERT(index_sets == make_index_sets(pos)); // incremental result must equal a full rebuild
+                }
+
+                pos.set(StartFEN, false, &si, Threads.main()); // back to StartFEN for the next game
+
+                // Output '.' every 100 games (so you can see that it's progressing)
+                if ((i % 100) == 0)
+                    std::cout << "." << std::flush;
+            }
+
+            std::cout << "passed." << std::endl;
+            std::cout << num_games << " games, " << num_moves << " moves, "
+                      << num_updates.back() << " updates, "
+                      << (1.0 * num_updates.back() / num_moves)
+                      << " updates per move" << std::endl;
+            std::size_t num_observed_indices = 0;
+
+            for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+                const auto count = std::count(trigger_map.begin(), trigger_map.end(), i); // feature indices observed under trigger i
+                num_observed_indices += count;
+                std::cout << "TriggerEvent(" << static_cast<int>(kRefreshTriggers[i])
+                          << "): " << count << " features ("
+                          << (100.0 * count / RawFeatures::kDimensions) << "%), "
+                          << num_updates[i] << " updates ("
+                          << (1.0 * num_updates[i] / num_moves) << " per move), "
+                          << num_resets[i] << " resets ("
+                          << (100.0 * num_resets[i] / num_moves) << "%)"
+                          << std::endl;
+            }
+            std::cout << "observed " << num_observed_indices << " ("
+                      << (100.0 * num_observed_indices / RawFeatures::kDimensions)
+                      << "% of " << RawFeatures::kDimensions
+                      << ") features" << std::endl;
+        }
+
+        // Print this binary's network architecture, then describe the header of every net file named on `stream`
+        void print_info(std::istream& stream) {
+            std::cout << "network architecture: " << get_architecture_string() << std::endl;
+
+            for (;;) {
+                std::string file_name;
+                stream >> file_name;
+                if (file_name.empty())
+                    break;
+
+                std::uint32_t hash_value;
+                std::string architecture;
+                bool success = false;
+                {
+                    // Scoped so the file is closed again before anything is reported
+                    std::ifstream file_stream(file_name, std::ios::binary);
+                    if (file_stream && read_header(file_stream, &hash_value, &architecture))
+                        success = true;
+                }
+
+                std::cout << file_name << ": ";
+                if (!success) {
+                    std::cout << "failed to read header" << std::endl;
+                    continue;
+                }
+
+                if (hash_value != kHashValue) {
+                    // Hash differs from this binary's: just show the architecture stored in the file
+                    std::cout << architecture << std::endl;
+                    continue;
+                }
+
+                std::cout << "matches with this binary";
+                if (architecture != get_architecture_string())
+                    std::cout << ", but architecture string differs: " << architecture;
+
+                std::cout << std::endl;
+            }
+        }
+
+
+    }  // namespace
+
+    // Entry point of the "test nnue" extended command: reads the sub-command word and dispatches on it
+    void test_command(Position& pos, std::istream& stream) {
+        std::string verb;
+        stream >> verb;
+
+        if (verb == "info") {
+            print_info(stream);
+        } else if (verb == "test_features") {
+            test_features(pos);
+        } else {
+            std::cout << "usage:" << std::endl
+                      << " test nnue test_features" << std::endl
+                      << " test nnue info [path/to/" << fileName << "...]" << std::endl;
+        }
+    }
+
+}  // namespace Eval::NNUE
@@ -1,21 +1,12 @@
-// USI extended command interface for NNUE evaluation function
-
-#ifndef _NNUE_TEST_COMMAND_H_
+#ifndef _NNUE_TEST_COMMAND_H_
 #define _NNUE_TEST_COMMAND_H_

-#if defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+// USI extended command interface for NNUE evaluation function
+namespace Eval::NNUE {

-namespace Eval {
+    // Extended test command for the NNUE evaluation function: takes the position and the stream holding the remaining command text
+    void test_command(Position& pos, std::istream& stream);

-namespace NNUE {
-
-// USI extended command for NNUE evaluation function
-void TestCommand(Position& pos, std::istream& stream);
-
-}  // namespace NNUE
-
-}  // namespace Eval
-
-#endif  // defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+}  // namespace Eval::NNUE

 #endif
@@ -0,0 +1,10 @@
+#ifndef _NNUE_TRAINER_FEATURES_ALL_FACTORIZERS_H_
+#define _NNUE_TRAINER_FEATURES_ALL_FACTORIZERS_H_
+
+#include "factorizer.h"
+#include "factorizer_feature_set.h"
+
+#include "factorizer_half_kp.h"
+#include "factorizer_half_ka.h"
+
+#endif
@@ -1,110 +1,117 @@
-// NNUE evaluation function feature conversion class template
-
-#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_H_
+#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_H_

-#if defined(EVAL_NNUE)
+#include "nnue/nnue_common.h"

-#include "../../nnue_common.h"
-#include "../trainer.h"
+#include "nnue/trainer/trainer.h"

-namespace Eval {
+// NNUE evaluation function feature conversion class template
+namespace Eval::NNUE::Features {

-namespace NNUE {
+    // Primary class template that converts an input feature into its learning (training) features
+    // Default behaviour: each input feature index is its own single learning feature; specialize per feature type to add more
+    template <typename FeatureType>
+    class Factorizer {
+    public:
+        static constexpr std::string get_name() { // description text; NOTE(review): a std::string return cannot be constant-evaluated before C++20 - confirm constexpr is intended
+            return "Factorizer<" + FeatureType::get_name() + "> -> " + std::string("No factorizer");
+        }

-namespace Features {
+        static constexpr std::string get_factorizers_string() { // the name as one bullet line: "  - <name>"
+            return "  - " + get_name();
+        }

-// Class template that converts input features into learning features
-// By default, the learning feature is the same as the original input feature, and specialized as necessary
-template <typename FeatureType>
-class Factorizer {
- public:
-  // Get the dimensionality of the learning feature
-  static constexpr IndexType GetDimensions() {
-    return FeatureType::kDimensions;
-  }
+        // Dimensionality of the learning feature; identical to the input feature's here
+        static constexpr IndexType get_dimensions() {
+            return FeatureType::kDimensions;
+        }

-  // Get index of learning feature and scale of learning rate
-  static void AppendTrainingFeatures(
-      IndexType base_index, std::vector<TrainingFeature>* training_features) {
-    assert(base_index <FeatureType::kDimensions);
-    training_features->emplace_back(base_index);
-  }
-};
+        // Append the learning feature(s) for base_index to *training_features (here: one entry built from base_index)
+        static void append_training_features(
+            IndexType base_index, std::vector<TrainingFeature>* training_features) {

-// Learning feature information
-struct FeatureProperties {
-  bool active;
-  IndexType dimensions;
-};
+            assert(base_index <FeatureType::kDimensions); // base_index must be a valid index of FeatureType
+            training_features->emplace_back(base_index);
+        }
+    };

-// Add the original input features to the learning features
-template <typename FeatureType>
-IndexType AppendBaseFeature(
-    FeatureProperties properties, IndexType base_index,
-    std::vector<TrainingFeature>* training_features) {
-  assert(properties.dimensions == FeatureType::kDimensions);
-  assert(base_index < FeatureType::kDimensions);
-  training_features->emplace_back(base_index);
-  return properties.dimensions;
-}
+    // Learning feature information: one entry per learning feature type of a factorizer
+    struct FeatureProperties {
+        bool active;            // false: the helpers below return 0 for this entry and add nothing
+        IndexType dimensions;   // number of dimensions (size of the index range) of this learning feature
+    };

-// If the learning rate scale is not 0, inherit other types of learning features
-template <typename FeatureType>
-IndexType InheritFeaturesIfRequired(
-    IndexType index_offset, FeatureProperties properties, IndexType base_index,
-    std::vector<TrainingFeature>* training_features) {
-  if (!properties.active) {
-    return 0;
-  }
-  assert(properties.dimensions == Factorizer<FeatureType>::GetDimensions());
-  assert(base_index < FeatureType::kDimensions);
-  const auto start = training_features->size();
-  Factorizer<FeatureType>::AppendTrainingFeatures(
-      base_index, training_features);
-  for (auto i = start; i < training_features->size(); ++i) {
-    auto& feature = (*training_features)[i];
-    assert(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
-    feature.ShiftIndex(index_offset);
-  }
-  return properties.dimensions;
-}
+    // Append the original input feature index itself as a learning feature; returns properties.dimensions
+    template <typename FeatureType>
+    IndexType append_base_feature(
+        FeatureProperties properties, IndexType base_index,
+        std::vector<TrainingFeature>* training_features) {

-// Return the index difference as needed, without adding learning features
-// Call instead of InheritFeaturesIfRequired() if there are no corresponding features
-IndexType SkipFeatures(FeatureProperties properties) {
-  if (!properties.active) {
-    return 0;
-  }
-  return properties.dimensions;
-}
-
-// Get the dimensionality of the learning feature
-template <std::size_t N>
-constexpr IndexType GetActiveDimensions(
-    const FeatureProperties (&properties)[N]) {
-  static_assert(N > 0, "");
-  IndexType dimensions = properties[0].dimensions;
-  for (std::size_t i = 1; i < N; ++i) {
-    if (properties[i].active) {
-      dimensions += properties[i].dimensions;
+        assert(properties.dimensions == FeatureType::kDimensions); // the entry must describe FeatureType itself
+        assert(base_index < FeatureType::kDimensions);
+        training_features->emplace_back(base_index); // properties.active is not consulted here
+        return properties.dimensions;
    }
-  }
-  return dimensions;
-}

-// get the number of elements in the array
-template <typename T, std::size_t N>
-constexpr std::size_t GetArrayLength(const T (&/*array*/)[N]) {
-  return N;
-}
+    // If properties.active, append Factorizer<FeatureType>'s learning features for base_index, shifted by index_offset; returns the dimensions used (0 if inactive)
+    template <typename FeatureType>
+    IndexType inherit_features_if_required(
+        IndexType index_offset, FeatureProperties properties, IndexType base_index,
+        std::vector<TrainingFeature>* training_features) {

-}  // namespace Features
+        if (!properties.active) { // inactive: nothing is appended and no index range is consumed
+            return 0;
+        }

-}  // namespace NNUE
+        assert(properties.dimensions == Factorizer<FeatureType>::get_dimensions());
+        assert(base_index < FeatureType::kDimensions);

-}  // namespace Eval
+        const auto start = training_features->size(); // position of the first entry appended below
+        Factorizer<FeatureType>::append_training_features(
+            base_index, training_features);

-#endif  // defined(EVAL_NNUE)
+        for (auto i = start; i < training_features->size(); ++i) { // only the newly appended entries
+            auto& feature = (*training_features)[i];
+            assert(feature.get_index() < Factorizer<FeatureType>::get_dimensions());
+            feature.shift_index(index_offset); // relocate by index_offset (shift_index is defined elsewhere)
+        }
+
+        return properties.dimensions;
+    }
+
+    // Index range to skip for a learning feature that adds no entries: its dimensions when active, otherwise 0
+    // Call instead of inherit_features_if_required(); `inline` because this non-template function is defined in a header
+    inline IndexType skip_features(FeatureProperties properties) {
+        if (!properties.active)
+            return 0;
+
+        return properties.dimensions;
+    }
+
+    // Sum the dimensions of the first entry (always counted) and of every later entry that is active
+    template <std::size_t N>
+    constexpr IndexType get_active_dimensions(
+        const FeatureProperties (&properties)[N]) {
+
+        static_assert(N > 0, "");
+
+        IndexType total = properties[0].dimensions;
+
+        for (std::size_t k = 1; k != N; ++k) {
+            const FeatureProperties& entry = properties[k];
+            // Inactive entries contribute nothing
+            total += entry.active ? entry.dimensions : 0;
+        }
+
+        return total;
+    }
+
+    // Number of elements N of a built-in array, deduced from the array's type (the argument itself is unused)
+    template <typename T, std::size_t N>
+    constexpr std::size_t get_array_length(const T (&/*array*/)[N]) {
+        return N;
+    }
+
+}  // namespace Eval::NNUE::Features

 #endif
@@ -1,104 +1,121 @@
-// Specialization for feature set of feature conversion class template of NNUE evaluation function
-
-#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
+#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_

-#if defined(EVAL_NNUE)
-
-#include "../../features/feature_set.h"
 #include "factorizer.h"

-namespace Eval {
+#include "nnue/features/feature_set.h"

-namespace NNUE {
+// Specialization for feature set of feature conversion class template of NNUE evaluation function
+namespace Eval::NNUE::Features {

-namespace Features {
+    // Factorizer specialization for a FeatureSet made of several feature types:
+    // the first type is handled through Head, all remaining types are delegated to Tail
+    template <typename FirstFeatureType, typename... RemainingFeatureTypes>
+    class Factorizer<FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
+    private:
+        using Head = Factorizer<FeatureSet<FirstFeatureType>>; // factorizer for the first feature type alone
+        using Tail = Factorizer<FeatureSet<RemainingFeatureTypes...>>; // factorizer for the remaining feature types

-// Class template that converts input features into learning features
-// Specialization for FeatureSet
-template <typename FirstFeatureType, typename... RemainingFeatureTypes>
-class Factorizer<FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
- private:
-  using Head = Factorizer<FeatureSet<FirstFeatureType>>;
-  using Tail = Factorizer<FeatureSet<RemainingFeatureTypes...>>;
+    public:
+        // number of dimensions of the original input features (the whole feature set)
+        static constexpr IndexType kBaseDimensions =
+            FeatureSet<FirstFeatureType, RemainingFeatureTypes...>::kDimensions;

- public:
-  // number of dimensions of original input features
-  static constexpr IndexType kBaseDimensions =
-      FeatureSet<FirstFeatureType, RemainingFeatureTypes...>::kDimensions;
-
-  // Get the dimensionality of the learning feature
-  static constexpr IndexType GetDimensions() {
-    return Head::GetDimensions() + Tail::GetDimensions();
-  }
-
-  // Get index of learning feature and scale of learning rate
-  static void AppendTrainingFeatures(
-      IndexType base_index, std::vector<TrainingFeature>* training_features,
-      IndexType base_dimensions = kBaseDimensions) {
-    assert(base_index < kBaseDimensions);
-    constexpr auto boundary = FeatureSet<RemainingFeatureTypes...>::kDimensions;
-    if (base_index < boundary) {
-      Tail::AppendTrainingFeatures(
-          base_index, training_features, base_dimensions);
-    } else {
-      const auto start = training_features->size();
-      Head::AppendTrainingFeatures(
-          base_index - boundary, training_features, base_dimensions);
-      for (auto i = start; i < training_features->size(); ++i) {
-        auto& feature = (*training_features)[i];
-        const auto index = feature.GetIndex();
-        assert(index < Head::GetDimensions() ||
-                   (index >= base_dimensions &&
-                    index < base_dimensions +
-                            Head::GetDimensions() - Head::kBaseDimensions));
-        if (index < Head::kBaseDimensions) {
-          feature.ShiftIndex(Tail::kBaseDimensions);
-        } else {
-          feature.ShiftIndex(Tail::GetDimensions() - Tail::kBaseDimensions);
+        static constexpr std::string get_factorizers_string() { // a "  - <name>" line for Head, a newline, then Tail's lines
+            std::string str = "  - ";
+            str += Head::get_name();
+            str += '\n';
+            str += Tail::get_factorizers_string();
+            return str;
        }
-      }
-    }
-  }
-};

-// Class template that converts input features into learning features
-// Specialization when FeatureSet has one template argument
-template <typename FeatureType>
-class Factorizer<FeatureSet<FeatureType>> {
-public:
-  // number of dimensions of original input features
-  static constexpr IndexType kBaseDimensions = FeatureType::kDimensions;
+        // Dimensionality of the learning features: Head's plus Tail's
+        static constexpr IndexType get_dimensions() {
+            return Head::get_dimensions() + Tail::get_dimensions();
+        }

-  // Get the dimensionality of the learning feature
-  static constexpr IndexType GetDimensions() {
-    return Factorizer<FeatureType>::GetDimensions();
-  }
+        // Append the learning features for base_index, dispatching to Tail or Head according to the index range
+        static void append_training_features(
+            IndexType base_index, std::vector<TrainingFeature>* training_features,
+            IndexType base_dimensions = kBaseDimensions) {

-  // Get index of learning feature and scale of learning rate
-  static void AppendTrainingFeatures(
-      IndexType base_index, std::vector<TrainingFeature>* training_features,
-      IndexType base_dimensions = kBaseDimensions) {
-    assert(base_index < kBaseDimensions);
-    const auto start = training_features->size();
-    Factorizer<FeatureType>::AppendTrainingFeatures(
-        base_index, training_features);
-    for (auto i = start; i < training_features->size(); ++i) {
-      auto& feature = (*training_features)[i];
-      assert(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
-      if (feature.GetIndex() >= kBaseDimensions) {
-        feature.ShiftIndex(base_dimensions - kBaseDimensions);
-      }
-    }
-  }
-};
+            assert(base_index < kBaseDimensions);

-}  // namespace Features
+            constexpr auto boundary = FeatureSet<RemainingFeatureTypes...>::kDimensions; // base indices below this belong to the remaining types

-}  // namespace NNUE
+            if (base_index < boundary) { // handled entirely by Tail, no adjustment made here
+                Tail::append_training_features(
+                    base_index, training_features, base_dimensions);
+            }
+            else {
+                const auto start = training_features->size(); // first entry that Head appends below

-}  // namespace Eval
+                Head::append_training_features(
+                    base_index - boundary, training_features, base_dimensions); // index made relative to the first feature type

-#endif  // defined(EVAL_NNUE)
+                for (auto i = start; i < training_features->size(); ++i) { // relocate only what Head just appended
+                    auto& feature = (*training_features)[i];
+                    const auto index = feature.get_index();
+
+                    assert(index < Head::get_dimensions() ||
+                               (index >= base_dimensions &&
+                                index < base_dimensions +
+                                        Head::get_dimensions() - Head::kBaseDimensions));
+
+                    if (index < Head::kBaseDimensions) { // one of Head's base indices: shift by the size of Tail's base range
+                        feature.shift_index(Tail::kBaseDimensions);
+                    }
+                    else { // any other index Head produced: shift by the count of Tail's non-base dimensions
+                        feature.shift_index(Tail::get_dimensions() - Tail::kBaseDimensions);
+                    }
+                }
+            }
+        }
+    };
+
+    // Factorizer specialization for a FeatureSet that holds exactly one feature type
+    // Forwards to Factorizer<FeatureType>; only append_training_features adjusts the result
+    template <typename FeatureType>
+    class Factorizer<FeatureSet<FeatureType>> {
+    public:
+        // number of dimensions of original input features
+        static constexpr IndexType kBaseDimensions = FeatureType::kDimensions;
+
+        static constexpr std::string get_name() { // same name as the wrapped feature type's factorizer
+            return Factorizer<FeatureType>::get_name();
+        }
+
+        static constexpr std::string get_factorizers_string() { // the name as one bullet line: "  - <name>"
+            return "  - " + get_name();
+        }
+
+        // Dimensionality of the learning feature, as reported by Factorizer<FeatureType>
+        static constexpr IndexType get_dimensions() {
+            return Factorizer<FeatureType>::get_dimensions();
+        }
+
+        // Append Factorizer<FeatureType>'s learning features for base_index, then relocate the non-base ones
+        static void append_training_features(
+            IndexType base_index, std::vector<TrainingFeature>* training_features,
+            IndexType base_dimensions = kBaseDimensions) {
+
+            assert(base_index < kBaseDimensions);
+
+            const auto start = training_features->size(); // first entry appended by the call below
+
+            Factorizer<FeatureType>::append_training_features(
+                base_index, training_features);
+
+            for (auto i = start; i < training_features->size(); ++i) {
+                auto& feature = (*training_features)[i];
+                assert(feature.get_index() < Factorizer<FeatureType>::get_dimensions());
+                if (feature.get_index() >= kBaseDimensions) { // indices below kBaseDimensions are left where they are
+                    feature.shift_index(base_dimensions - kBaseDimensions); // zero shift when base_dimensions is left at its default
+                }
+            }
+        }
+    };
+
+}  // namespace Eval::NNUE::Features

 #endif
@@ -0,0 +1,93 @@
+#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_
+#define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_
+
+#include "factorizer.h"
+
+#include "nnue/features/half_ka.h"
+#include "nnue/features/a.h"
+#include "nnue/features/half_relative_ka.h"
+
+// Specialization of NNUE evaluation function feature conversion class template for HalfKA
+namespace Eval::NNUE::Features {
+
+    // Factorizer specialization for HalfKA: every HalfKA feature is additionally
+    // expressed through the A and HalfRelativeKA learning features.
+    template <Side AssociatedKing>
+    class Factorizer<HalfKA<AssociatedKing>> {
+    private:
+        using FeatureType = HalfKA<AssociatedKing>;
+
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions =
+            FeatureType::kMaxActiveDimensions;
+
+        // Learning feature groups, in the order they occupy the index space
+        enum TrainingFeatureType {
+            kFeaturesHalfKA,
+            kFeaturesA,
+            kFeaturesHalfRelativeKA,
+            kNumTrainingFeatureTypes,
+        };
+
+        // Per-group {active, dimensions}, indexed by TrainingFeatureType
+        static constexpr FeatureProperties kProperties[] = {
+            // kFeaturesHalfKA
+            {true, FeatureType::kDimensions},
+            // kFeaturesA
+            {true, Factorizer<A>::get_dimensions()},
+            // kFeaturesHalfRelativeKA
+            {true, Factorizer<HalfRelativeKA<AssociatedKing>>::get_dimensions()},
+        };
+
+        static_assert(get_array_length(kProperties) == kNumTrainingFeatureTypes, "");
+
+    public:
+        static std::string get_name() {  // not constexpr: std::string is not a literal type in C++17
+            return std::string("Factorizer<") + FeatureType::kName + "> -> " + "A, HalfRelativeKA";
+        }
+
+        static std::string get_factorizers_string() {  // not constexpr, same reason as get_name()
+            return "  - " + get_name();
+        }
+
+        // Total number of learning-feature dimensions, derived from kProperties
+        static constexpr IndexType get_dimensions() {
+            return get_active_dimensions(kProperties);
+        }
+
+        // Append to *training_features the learning features that correspond to base_index
+        static void append_training_features(
+            IndexType base_index, std::vector<TrainingFeature>* training_features) {
+
+            // kFeaturesHalfKA
+            IndexType index_offset = append_base_feature<FeatureType>(
+                kProperties[kFeaturesHalfKA], base_index, training_features);
+
+            const auto sq_k = static_cast<Square>(base_index / PS_END2);  // base_index == sq_k * PS_END2 + a
+            const auto a = static_cast<IndexType>(base_index % PS_END2);
+
+            // kFeaturesA
+            index_offset += inherit_features_if_required<A>(
+                index_offset, kProperties[kFeaturesA], a, training_features);
+
+            // kFeaturesHalfRelativeKA: values of a below PS_W_PAWN add no feature, but the group is still skipped over
+            if (a >= PS_W_PAWN) {
+                index_offset += inherit_features_if_required<HalfRelativeKA<AssociatedKing>>(
+                    index_offset, kProperties[kFeaturesHalfRelativeKA],
+                    HalfRelativeKA<AssociatedKing>::make_index(sq_k, a),
+                    training_features);
+            }
+            else {
+                index_offset += skip_features(kProperties[kFeaturesHalfRelativeKA]);
+            }
+
+            assert(index_offset == get_dimensions());  // every group must have been accounted for
+        }
+    };
+
+    template <Side AssociatedKing>
+    constexpr FeatureProperties Factorizer<HalfKA<AssociatedKing>>::kProperties[];
+
+}  // namespace Eval::NNUE::Features
+
+#endif // #ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_
@@ -1,103 +1,104 @@
-// Specialization of NNUE evaluation function feature conversion class template for HalfKP
-
-#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
+#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_

-#if defined(EVAL_NNUE)
-
-#include "../../features/half_kp.h"
-#include "../../features/p.h"
-#include "../../features/half_relative_kp.h"
 #include "factorizer.h"

-namespace Eval {
+#include "nnue/features/half_kp.h"
+#include "nnue/features/p.h"
+#include "nnue/features/half_relative_kp.h"

-namespace NNUE {
+// Specialization of NNUE evaluation function feature conversion class template for HalfKP
+namespace Eval::NNUE::Features {

-namespace Features {
+    // Class template that converts input features into learning features
+    // Specialization for HalfKP
+    template <Side AssociatedKing>
+    class Factorizer<HalfKP<AssociatedKing>> {
+    private:
+        using FeatureType = HalfKP<AssociatedKing>;

-// Class template that converts input features into learning features
-// Specialization for HalfKP
-template <Side AssociatedKing>
-class Factorizer<HalfKP<AssociatedKing>> {
- private:
-  using FeatureType = HalfKP<AssociatedKing>;
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions =
+            FeatureType::kMaxActiveDimensions;

-  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-  static constexpr IndexType kMaxActiveDimensions =
-      FeatureType::kMaxActiveDimensions;
+        // Type of learning feature
+        enum TrainingFeatureType {
+            kFeaturesHalfKP,
+            kFeaturesHalfK,
+            kFeaturesP,
+            kFeaturesHalfRelativeKP,
+            kNumTrainingFeatureTypes,
+        };

-  // Type of learning feature
-  enum TrainingFeatureType {
-    kFeaturesHalfKP,
-    kFeaturesHalfK,
-    kFeaturesP,
-    kFeaturesHalfRelativeKP,
-    kNumTrainingFeatureTypes,
-  };
+        // Learning feature information
+        static constexpr FeatureProperties kProperties[] = {
+            // kFeaturesHalfKP
+            {true, FeatureType::kDimensions},
+            // kFeaturesHalfK
+            {true, SQUARE_NB},
+            // kFeaturesP
+            {true, Factorizer<P>::get_dimensions()},
+            // kFeaturesHalfRelativeKP
+            {true, Factorizer<HalfRelativeKP<AssociatedKing>>::get_dimensions()},
+        };

-  // Learning feature information
-  static constexpr FeatureProperties kProperties[] = {
-    // kFeaturesHalfKP
-    {true, FeatureType::kDimensions},
-    // kFeaturesHalfK
-    {true, SQUARE_NB},
-    // kFeaturesP
-    {true, Factorizer<P>::GetDimensions()},
-    // kFeaturesHalfRelativeKP
-    {true, Factorizer<HalfRelativeKP<AssociatedKing>>::GetDimensions()},
-  };
-  static_assert(GetArrayLength(kProperties) == kNumTrainingFeatureTypes, "");
+        static_assert(get_array_length(kProperties) == kNumTrainingFeatureTypes, "");

- public:
-  // Get the dimensionality of the learning feature
-  static constexpr IndexType GetDimensions() {
-    return GetActiveDimensions(kProperties);
-  }
+    public:
+        static std::string get_name() {  // not constexpr: std::string is not a literal type in C++17
+            return std::string("Factorizer<") + FeatureType::kName + "> -> " + "HalfK, P, HalfRelativeKP";
+        }

-  // Get index of learning feature and scale of learning rate
-  static void AppendTrainingFeatures(
-      IndexType base_index, std::vector<TrainingFeature>* training_features) {
-    // kFeaturesHalfKP
-    IndexType index_offset = AppendBaseFeature<FeatureType>(
-        kProperties[kFeaturesHalfKP], base_index, training_features);
+        static std::string get_factorizers_string() {  // not constexpr: std::string is not a literal type in C++17
+            return "  - " + get_name();
+        }

-    const auto sq_k = static_cast<Square>(base_index / PS_END);
-    const auto p = static_cast<IndexType>(base_index % PS_END);
-    // kFeaturesHalfK
-    {
-      const auto& properties = kProperties[kFeaturesHalfK];
-      if (properties.active) {
-        training_features->emplace_back(index_offset + sq_k);
-        index_offset += properties.dimensions;
-      }
-    }
-    // kFeaturesP
-    index_offset += InheritFeaturesIfRequired<P>(
-        index_offset, kProperties[kFeaturesP], p, training_features);
-    // kFeaturesHalfRelativeKP
-    if (p >= PS_W_PAWN) {
-      index_offset += InheritFeaturesIfRequired<HalfRelativeKP<AssociatedKing>>(
-          index_offset, kProperties[kFeaturesHalfRelativeKP],
-          HalfRelativeKP<AssociatedKing>::MakeIndex(sq_k, p),
-          training_features);
-    } else {
-      index_offset += SkipFeatures(kProperties[kFeaturesHalfRelativeKP]);
-    }
+        // Total number of learning-feature dimensions, as computed by get_active_dimensions from kProperties
+        static constexpr IndexType get_dimensions() {
+            return get_active_dimensions(kProperties);
+        }

-    assert(index_offset == GetDimensions());
-  }
-};
+        // Append to *training_features the learning features that correspond to base_index
+        static void append_training_features(
+            IndexType base_index, std::vector<TrainingFeature>* training_features) {
 
-template <Side AssociatedKing>
-constexpr FeatureProperties Factorizer<HalfKP<AssociatedKing>>::kProperties[];
+            // kFeaturesHalfKP: index_offset tracks where the next feature group starts
+            IndexType index_offset = append_base_feature<FeatureType>(
+                kProperties[kFeaturesHalfKP], base_index, training_features);
 
-}  // namespace Features
+            const auto sq_k = static_cast<Square>(base_index / PS_END);  // base_index == sq_k * PS_END + p
+            const auto p = static_cast<IndexType>(base_index % PS_END);
 
-}  // namespace NNUE
+            // kFeaturesHalfK: one feature selected by sq_k alone
+            {
+                const auto& properties = kProperties[kFeaturesHalfK];
+                if (properties.active) {
+                    training_features->emplace_back(index_offset + sq_k);
+                    index_offset += properties.dimensions;
+                }
+            }
 
-}  // namespace Eval
+            // kFeaturesP: features selected by p alone
+            index_offset += inherit_features_if_required<P>(
+                index_offset, kProperties[kFeaturesP], p, training_features);
+            // kFeaturesHalfRelativeKP: values of p below PS_W_PAWN add no feature, but the group is still skipped over
+            if (p >= PS_W_PAWN) {
+                index_offset += inherit_features_if_required<HalfRelativeKP<AssociatedKing>>(
+                    index_offset, kProperties[kFeaturesHalfRelativeKP],
+                    HalfRelativeKP<AssociatedKing>::make_index(sq_k, p),
+                    training_features);
+            }
+            else {
+                index_offset += skip_features(kProperties[kFeaturesHalfRelativeKP]);
+            }
 
-#endif  // defined(EVAL_NNUE)
+            assert(index_offset == get_dimensions());  // every group must have been accounted for
+        }
+    };
+
+    template <Side AssociatedKing>
+    constexpr FeatureProperties Factorizer<HalfKP<AssociatedKing>>::kProperties[];
+
+}  // namespace Eval::NNUE::Features

 #endif
@@ -1,125 +1,122 @@
-// Common header of class template for learning NNUE evaluation function
-
-#ifndef _NNUE_TRAINER_H_
+#ifndef _NNUE_TRAINER_H_
 #define _NNUE_TRAINER_H_

-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
-
-#include "../nnue_common.h"
-#include "../features/index_list.h"
+#include "nnue/nnue_common.h"
+#include "nnue/features/index_list.h"

 #include <sstream>
+
 #if defined(USE_BLAS)
 static_assert(std::is_same<LearnFloatType, float>::value, "");
 #include <cblas.h>
 #endif

-namespace Eval {
+// Common header of class template for learning NNUE evaluation function
+namespace Eval::NNUE {

-namespace NNUE {
+    // Ponanza constant used in the relation between evaluation value and winning percentage
+    constexpr double kPonanzaConstant = 600.0;

-// Ponanza constant used in the relation between evaluation value and winning percentage
-constexpr double kPonanzaConstant = 600.0;
+    // Class that represents one index of learning feature
+    class TrainingFeature {
+        using StorageType = std::uint32_t;
+        static_assert(std::is_unsigned<StorageType>::value, "");

-// Class that represents one index of learning feature
-class TrainingFeature {
-  using StorageType = std::uint32_t;
-  static_assert(std::is_unsigned<StorageType>::value, "");
+    public:
+        static constexpr std::uint32_t kIndexBits = 24;

- public:
-  static constexpr std::uint32_t kIndexBits = 24;
-  static_assert(kIndexBits < std::numeric_limits<StorageType>::digits, "");
-  static constexpr std::uint32_t kCountBits =
-      std::numeric_limits<StorageType>::digits - kIndexBits;
+        static_assert(kIndexBits < std::numeric_limits<StorageType>::digits, "");

-  explicit TrainingFeature(IndexType index) :
-      index_and_count_((index << kCountBits) | 1) {
-    assert(index < (1 << kIndexBits));
-  }
-  TrainingFeature& operator+=(const TrainingFeature& other) {
-    assert(other.GetIndex() == GetIndex());
-    assert(other.GetCount() + GetCount() < (1 << kCountBits));
-    index_and_count_ += other.GetCount();
-    return *this;
-  }
-  IndexType GetIndex() const {
-    return static_cast<IndexType>(index_and_count_ >> kCountBits);
-  }
-  void ShiftIndex(IndexType offset) {
-    assert(GetIndex() + offset < (1 << kIndexBits));
-    index_and_count_ += offset << kCountBits;
-  }
-  IndexType GetCount() const {
-    return static_cast<IndexType>(index_and_count_ & ((1 << kCountBits) - 1));
-  }
-  bool operator<(const TrainingFeature& other) const {
-    return index_and_count_ < other.index_and_count_;
-  }
+        static constexpr std::uint32_t kCountBits =
+            std::numeric_limits<StorageType>::digits - kIndexBits;

- private:
-  StorageType index_and_count_;
-};
+        explicit TrainingFeature(IndexType index) :  // packs index above the count field
+            index_and_count_((index << kCountBits) | 1) {  // the count starts at 1
 
-// Structure that represents one sample of training data
-struct Example {
-  std::vector<TrainingFeature> training_features[2];
-  Learner::PackedSfenValue psv;
-  int sign;
-  double weight;
-};
+            assert(index < (1 << kIndexBits));  // index must be representable in kIndexBits bits
+        }

-// Message used for setting hyperparameters
-struct Message {
-  Message(const std::string& name, const std::string& value = ""):
-      name(name), value(value), num_peekers(0), num_receivers(0) {}
-  const std::string name;
-  const std::string value;
-  std::uint32_t num_peekers;
-  std::uint32_t num_receivers;
-};
+        TrainingFeature& operator+=(const TrainingFeature& other) {  // merges the count of `other` into this entry
+            assert(other.get_index() == get_index());  // only entries with the same index may be merged
+            assert(other.get_count() + get_count() < (1 << kCountBits));  // the sum must fit in the count field
+            index_and_count_ += other.get_count();  // the count sits in the low bits, so a plain add updates it
+            return *this;
+        }

-// determine whether to accept the message
-bool ReceiveMessage(const std::string& name, Message* message) {
-  const auto subscript = "[" + std::to_string(message->num_peekers) + "]";
-  if (message->name.substr(0, name.size() + 1) == name + "[") {
-    ++message->num_peekers;
-  }
-  if (message->name == name || message->name == name + subscript) {
-    ++message->num_receivers;
-    return true;
-  }
-  return false;
-}
+        IndexType get_index() const {  // feature index: the bits stored above the count field
+            return static_cast<IndexType>(index_and_count_ >> kCountBits);
+        }

-// split the string
-std::vector<std::string> Split(const std::string& input, char delimiter) {
-  std::istringstream stream(input);
-  std::string field;
-  std::vector<std::string> fields;
-  while (std::getline(stream, field, delimiter)) {
-    fields.push_back(field);
-  }
-  return fields;
-}
+        void shift_index(IndexType offset) {  // adds offset to the stored index
+            assert(get_index() + offset < (1 << kIndexBits));  // the shifted index must still fit in kIndexBits bits
+            index_and_count_ += offset << kCountBits;  // offset is moved past the count bits before being added
+        }

-// round a floating point number to an integer
-template <typename IntType>
-IntType Round(double value) {
-  return static_cast<IntType>(std::floor(value + 0.5));
-}
+        IndexType get_count() const {  // occurrence count: the low kCountBits bits
+            return static_cast<IndexType>(index_and_count_ & ((1 << kCountBits) - 1));
+        }

-// make_shared with alignment
-template <typename T, typename... ArgumentTypes>
-std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments) {
-  const auto ptr = new(std_aligned_alloc(alignof(T), sizeof(T)))
-      T(std::forward<ArgumentTypes>(arguments)...);
-  return std::shared_ptr<T>(ptr, AlignedDeleter<T>());
-}
+        bool operator<(const TrainingFeature& other) const {  // compares the packed value: index first, then count
+            return index_and_count_ < other.index_and_count_;
+        }

-}  // namespace NNUE
+    private:
+        StorageType index_and_count_;
+    };

-}  // namespace Eval
+    // One sample of training data
+    struct Example {
+        std::vector<TrainingFeature> training_features[2];  // two feature lists; presumably one per perspective - TODO confirm
+        Learner::PackedSfenValue psv;  // packed training record; presumably the position this sample was built from - TODO confirm
+        Value discrete_nn_eval;  // NOTE(review): looks like the quantized network's evaluation of the sample - confirm
+        int sign;  // NOTE(review): meaning not visible here; presumably +1/-1 for the side to move - confirm
+        double weight;  // NOTE(review): presumably the sample's weight in the loss - confirm
+    };

-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+    // A named option or command (with an optional textual value) that is passed along the trainers
+    struct Message {
+        Message(const std::string& message_name, const std::string& message_value = "") :
+            name(message_name),
+            value(message_value)
+        {}
+
+        const std::string name;            // option/command name, optionally carrying a "[k]" subscript
+        const std::string value;           // textual payload; empty when none was given
+        std::uint32_t num_peekers = 0;     // bumped by receive_message for every "name[" prefix match
+        std::uint32_t num_receivers = 0;   // bumped by receive_message every time the message is accepted
+    };
+
+    // True when message is addressed to name: its name is exactly `name`, or `name[k]` with k == peeks seen so far
+    inline bool receive_message(const std::string& name, Message* message) {  // inline: defined in a header
+        // The subscript is built from the peek count as it was before this call
+        const auto subscript = "[" + std::to_string(message->num_peekers) + "]";
+        if (message->name.substr(0, name.size() + 1) == name + "[") {
+            ++message->num_peekers;  // each call that matches the "name[" prefix advances the expected subscript
+        }
+
+        if (message->name == name || message->name == name + subscript) {
+            ++message->num_receivers;
+            return true;
+        }
+
+        return false;
+    }
+
+    // Round to the nearest integer as floor(value + 0.5), i.e. exact halves go towards +infinity
+    template <typename IntType> IntType round(double value) {
+        const double shifted = value + 0.5;
+        return static_cast<IntType>(std::floor(shifted));
+    }
+
+    // Like make_shared, but T is built in storage from std_aligned_alloc and owned through AlignedDeleter<T>
+    template <typename T, typename... ArgumentTypes>
+    std::shared_ptr<T> make_aligned_shared_ptr(ArgumentTypes&&... arguments) {
+        void* const storage = std_aligned_alloc(alignof(T), sizeof(T));
+        T* const object = new(storage) T(std::forward<ArgumentTypes>(arguments)...);
+
+        return std::shared_ptr<T>(object, AlignedDeleter<T>());
+    }
+
+}  // namespace Eval::NNUE

 #endif
@@ -1,301 +1,476 @@
-// Specialization of NNUE evaluation function learning class template for AffineTransform
-
-#ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_
+#ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_
 #define _NNUE_TRAINER_AFFINE_TRANSFORM_H_

-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
-
-#include "../../learn/learn.h"
-#include "../layers/affine_transform.h"
 #include "trainer.h"

+#include "extra/stockfish_blas.h"
+
+#include "learn/learn.h"
+
+#include "nnue/layers/affine_transform.h"
+
+#include "thread.h"
+
 #include <random>

-namespace Eval {
+// Specialization of NNUE evaluation function learning class template for AffineTransform
+namespace Eval::NNUE {

-namespace NNUE {
+    // Learning: Affine transformation layer
+    template <typename PreviousLayer, IndexType OutputDimensions>
+    class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
+    private:
+        // Type of layer to learn
+        using LayerType = Layers::AffineTransform<PreviousLayer, OutputDimensions>;

-// Learning: Affine transformation layer
-template <typename PreviousLayer, IndexType OutputDimensions>
-class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
- private:
-  // Type of layer to learn
-  using LayerType = Layers::AffineTransform<PreviousLayer, OutputDimensions>;
+    public:
+        // Factory: builds a Trainer for target_layer (ft is handed to the constructor) and returns it in a shared_ptr
+        static std::shared_ptr<Trainer> create(
+            LayerType* target_layer, FeatureTransformer* ft) {
 
- public:
-  // factory function
-  static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* feature_transformer) {
-    return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, feature_transformer));
-  }
-
-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    previous_layer_trainer_->SendMessage(message);
-    if (ReceiveMessage("momentum", message)) {
-      momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
-    }
-    if (ReceiveMessage("learning_rate_scale", message)) {
-      learning_rate_scale_ =
-          static_cast<LearnFloatType>(std::stod(message->value));
-    }
-    if (ReceiveMessage("reset", message)) {
-      DequantizeParameters();
-    }
-    if (ReceiveMessage("quantize_parameters", message)) {
-      QuantizeParameters();
-    }
-  }
-
-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    previous_layer_trainer_->Initialize(rng);
-    if (kIsOutputLayer) {
-      // Initialize output layer with 0
-      std::fill(std::begin(biases_), std::end(biases_),
-                static_cast<LearnFloatType>(0.0));
-      std::fill(std::begin(weights_), std::end(weights_),
-                static_cast<LearnFloatType>(0.0));
-    } else {
-      // Assuming that the input distribution is unit-mean 0.5, equal variance,
-      // Initialize the output distribution so that each unit has a mean of 0.5 and the same variance as the input
-      const double kSigma = 1.0 / std::sqrt(kInputDimensions);
-      auto distribution = std::normal_distribution<double>(0.0, kSigma);
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        double sum = 0.0;
-        for (IndexType j = 0; j < kInputDimensions; ++j) {
-          const auto weight = static_cast<LearnFloatType>(distribution(rng));
-          weights_[kInputDimensions * i + j] = weight;
-          sum += weight;
+            return std::shared_ptr<Trainer>(  // NOTE(review): plain `new`, presumably because the constructor is private - confirm
+                new Trainer(target_layer, ft));
         }
-        biases_[i] = static_cast<LearnFloatType>(0.5 - 0.5 * sum);
-      }
-    }
-    QuantizeParameters();
-  }

-  // forward propagation
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    if (output_.size() < kOutputDimensions * batch.size()) {
-      output_.resize(kOutputDimensions * batch.size());
-      gradients_.resize(kInputDimensions * batch.size());
-    }
-    batch_size_ = static_cast<IndexType>(batch.size());
-    batch_input_ = previous_layer_trainer_->Propagate(batch);
+        // Pass the message to the previous layer's trainer, then react to the options this layer understands
+        void send_message(Message* message) {
+            // Converts the message value to LearnFloatType; only evaluated for the numeric options
+            const auto value_as_float = [message]() {
+                return static_cast<LearnFloatType>(std::stod(message->value));
+            };
+
+            previous_layer_trainer_->send_message(message);
+
+            if (receive_message("momentum", message))
+                momentum_ = value_as_float();
+
+            if (receive_message("learning_rate_scale", message))
+                learning_rate_scale_ = value_as_float();
+
+            // The remaining messages are commands that carry no value
+            if (receive_message("reset", message))
+                dequantize_parameters();
+
+            if (receive_message("quantize_parameters", message))
+                quantize_parameters();
+
+            if (receive_message("check_health", message))
+                check_health();
+        }
+
+        // Initialize the previous layer, then this layer's parameters, and finally quantize them
+        template <typename RNG>
+        void initialize(RNG& rng) {
+            previous_layer_trainer_->initialize(rng);
+
+            if (!kIsOutputLayer) {
+                // Weights are drawn from a normal distribution with mean 0 and standard deviation 1/sqrt(kInputDimensions).
+                // With bias = 0.5 - 0.5 * sum(row), a unit whose inputs all have mean 0.5 also has a mean output of 0.5.
+                const double stddev = 1.0 / std::sqrt(kInputDimensions);
+                std::normal_distribution<double> gaussian(0.0, stddev);
+
+                for (IndexType out = 0; out < kOutputDimensions; ++out) {
+                    const IndexType row_begin = kInputDimensions * out;
+                    double row_sum = 0.0;
+                    for (IndexType in = 0; in < kInputDimensions; ++in) {
+                        const auto sample = static_cast<LearnFloatType>(gaussian(rng));
+                        weights_[row_begin + in] = sample;
+                        row_sum += sample;
+                    }
+
+                    biases_[out] = static_cast<LearnFloatType>(0.5 - 0.5 * row_sum);
+                }
+            }
+            else {
+                // The output layer starts with all biases and weights at zero
+                const auto zero = static_cast<LearnFloatType>(0.0);
+                std::fill(std::begin(biases_), std::end(biases_), zero);
+                std::fill(std::begin(weights_), std::end(weights_), zero);
+            }
+
+            quantize_parameters();
+        }
+
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
+        {   // Per-batch setup for [batch_begin, batch_end): sizes buffers and thread states, returns this layer's output buffer
+            const auto size = batch_end - batch_begin;
+
+            if ((long)output_.size() < (long)kOutputDimensions * size) {  // buffers are only ever grown, never shrunk
+                output_.resize(kOutputDimensions * size);
+                gradients_.resize(kInputDimensions * size);
+            }
+
+            if (thread_states_.size() < thread_pool.size())  // at least one state per thread of the pool
+            {
+                thread_states_.resize(thread_pool.size());
+            }
+
+            combined_batch_size_ = size;
+            combined_batch_input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);
+
+            auto& main_thread_state = thread_states_[0];  // state at index 0 keeps its biases_diff_ across batches
+
 #if defined(USE_BLAS)
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      cblas_scopy(kOutputDimensions, biases_, 1, &output_[batch_offset], 1);
-    }
-    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-                kOutputDimensions, batch_size_, kInputDimensions, 1.0,
-                weights_, kInputDimensions,
-                batch_input_, kInputDimensions,
-                1.0, &output_[0], kOutputDimensions);
-#else
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType input_batch_offset = kInputDimensions * b;
-      const IndexType output_batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        double sum = biases_[i];
-        for (IndexType j = 0; j < kInputDimensions; ++j) {
-          const IndexType index = kInputDimensions * i + j;
-          sum += weights_[index] * batch_input_[input_batch_offset + j];
-        }
-        output_[output_batch_offset + i] = static_cast<LearnFloatType>(sum);
-      }
-    }
-#endif
-    return output_.data();
-  }
 
-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    const LearnFloatType local_learning_rate =
-        learning_rate * learning_rate_scale_;
+            // Scale main_thread_state.biases_diff_ by momentum_ (same operation in both build variants)
+            cblas_sscal(
+                kOutputDimensions, momentum_, main_thread_state.biases_diff_, 1
+            );
+
+#else
+
+            Blas::sscal(
+                kOutputDimensions, momentum_, main_thread_state.biases_diff_, 1
+            );
+
+#endif
+
+            for (IndexType i = 1; i < thread_states_.size(); ++i)  // all states except index 0
+                thread_states_[i].reset_biases();
+
+            return output_.data();
+        }
+
+        // forward propagation
+        void propagate(Thread& th, const uint64_t offset, const uint64_t count) {
+
+            previous_layer_trainer_->propagate(th, offset, count);
+
 #if defined(USE_BLAS)
-    // backpropagate
-    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
-                kInputDimensions, batch_size_, kOutputDimensions, 1.0,
+
+            for (IndexType b = offset; b < offset + count; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                cblas_scopy(
+                    kOutputDimensions, biases_, 1, &output_[batch_offset], 1
+                );
+            }
+
+            cblas_sgemm(
+                CblasColMajor, CblasTrans, CblasNoTrans,
+                kOutputDimensions, count, kInputDimensions,
+                1.0,
                weights_, kInputDimensions,
-                gradients, kOutputDimensions,
-                0.0, &gradients_[0], kInputDimensions);
-    // update
-    cblas_sscal(kOutputDimensions, momentum_, biases_diff_, 1);
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      cblas_saxpy(kOutputDimensions, 1.0,
-                  &gradients[batch_offset], 1, biases_diff_, 1);
-    }
-    cblas_saxpy(kOutputDimensions, -local_learning_rate,
-                biases_diff_, 1, biases_, 1);
-    cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
-                kOutputDimensions, kInputDimensions, batch_size_, 1.0,
-                gradients, kOutputDimensions,
-                batch_input_, kInputDimensions,
-                momentum_, weights_diff_, kInputDimensions);
-    cblas_saxpy(kOutputDimensions * kInputDimensions, -local_learning_rate,
-                weights_diff_, 1, weights_, 1);
+                combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
+                1.0,
+                &output_[offset * kOutputDimensions], kOutputDimensions
+            );
 #else
-    // backpropagate
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType input_batch_offset = kInputDimensions * b;
-      const IndexType output_batch_offset = kOutputDimensions * b;
-      for (IndexType j = 0; j < kInputDimensions; ++j) {
-        double sum = 0.0;
-        for (IndexType i = 0; i < kOutputDimensions; ++i) {
-          const IndexType index = kInputDimensions * i + j;
-          sum += weights_[index] * gradients[output_batch_offset + i];
-        }
-        gradients_[input_batch_offset + j] = static_cast<LearnFloatType>(sum);
-      }
-    }
-    // update
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      biases_diff_[i] *= momentum_;
-    }
-    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-      weights_diff_[i] *= momentum_;
-    }
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType input_batch_offset = kInputDimensions * b;
-      const IndexType output_batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        biases_diff_[i] += gradients[output_batch_offset + i];
-      }
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        for (IndexType j = 0; j < kInputDimensions; ++j) {
-          const IndexType index = kInputDimensions * i + j;
-          weights_diff_[index] += gradients[output_batch_offset + i] *
-              batch_input_[input_batch_offset + j];
-        }
-      }
-    }
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      biases_[i] -= local_learning_rate * biases_diff_[i];
-    }
-    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-      weights_[i] -= local_learning_rate * weights_diff_[i];
-    }
-#endif
-    previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
-  }

- private:
-  // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
-      batch_size_(0),
-      batch_input_(nullptr),
-      previous_layer_trainer_(Trainer<PreviousLayer>::Create(
-          &target_layer->previous_layer_, feature_transformer)),
-      target_layer_(target_layer),
-      biases_(),
-      weights_(),
-      biases_diff_(),
-      weights_diff_(),
-      momentum_(0.0),
-      learning_rate_scale_(1.0) {
-    DequantizeParameters();
-  }
+            for (IndexType b = offset; b < offset + count; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                Blas::scopy(
+                    kOutputDimensions, biases_, 1, &output_[batch_offset], 1
+                );
+            }

-  // Weight saturation and parameterization
-  void QuantizeParameters() {
-    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-      weights_[i] = std::max(-kMaxWeightMagnitude,
-                             std::min(+kMaxWeightMagnitude, weights_[i]));
-    }
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      target_layer_->biases_[i] =
-          Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
-    }
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      const auto offset = kInputDimensions * i;
-      const auto padded_offset = LayerType::kPaddedInputDimensions * i;
-      for (IndexType j = 0; j < kInputDimensions; ++j) {
-        target_layer_->weights_[padded_offset + j] =
-            Round<typename LayerType::WeightType>(
-                weights_[offset + j] * kWeightScale);
-      }
-    }
-  }
-
-  // read parameterized integer
-  void DequantizeParameters() {
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      biases_[i] = static_cast<LearnFloatType>(
-          target_layer_->biases_[i] / kBiasScale);
-    }
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      const auto offset = kInputDimensions * i;
-      const auto padded_offset = LayerType::kPaddedInputDimensions * i;
-      for (IndexType j = 0; j < kInputDimensions; ++j) {
-        weights_[offset + j] = static_cast<LearnFloatType>(
-            target_layer_->weights_[padded_offset + j] / kWeightScale);
-      }
-    }
-    std::fill(std::begin(biases_diff_), std::end(biases_diff_),
-              static_cast<LearnFloatType>(0.0));
-    std::fill(std::begin(weights_diff_), std::end(weights_diff_),
-              static_cast<LearnFloatType>(0.0));
-  }
-
-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions = LayerType::kInputDimensions;
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
-
-  // If the output dimensionality is 1, the output layer
-  static constexpr bool kIsOutputLayer = kOutputDimensions == 1;
-
-  // Coefficient used for parameterization
-  static constexpr LearnFloatType kActivationScale =
-      std::numeric_limits<std::int8_t>::max();
-  static constexpr LearnFloatType kBiasScale = kIsOutputLayer ?
-      (kPonanzaConstant * FV_SCALE) :
-      ((1 << kWeightScaleBits) * kActivationScale);
-  static constexpr LearnFloatType kWeightScale = kBiasScale / kActivationScale;
-
-  // Upper limit of absolute value of weight used to prevent overflow when parameterizing integers
-  static constexpr LearnFloatType kMaxWeightMagnitude =
-      std::numeric_limits<typename LayerType::WeightType>::max() / kWeightScale;
-
-  // number of samples in mini-batch
-  IndexType batch_size_;
-
-  // Input mini batch
-  const LearnFloatType* batch_input_;
-
-  // Trainer of the previous layer
-  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
-
-  // layer to learn
-  LayerType* const target_layer_;
-
-  // parameter
-  LearnFloatType biases_[kOutputDimensions];
-  LearnFloatType weights_[kOutputDimensions * kInputDimensions];
-
-  // Buffer used for updating parameters
-  LearnFloatType biases_diff_[kOutputDimensions];
-  LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
-
-  // Forward propagation buffer
-  std::vector<LearnFloatType> output_;
-
-  // buffer for back propagation
-  std::vector<LearnFloatType> gradients_;
-
-  // hyper parameter
-  LearnFloatType momentum_;
-  LearnFloatType learning_rate_scale_;
-};
-
-}  // namespace NNUE
-
-}  // namespace Eval
-
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+            Blas::sgemm(
+                Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
+                kOutputDimensions, count, kInputDimensions,
+                1.0,
+                weights_, kInputDimensions,
+                combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
+                1.0,
+                &output_[offset * kOutputDimensions], kOutputDimensions
+            );
+
+#endif
+        }
+
+        // Backpropagation for samples [offset, offset + count) of the current batch.
+        void backpropagate(Thread& th,
+                           const LearnFloatType* gradients,
+                           uint64_t offset,
+                           uint64_t count) {
+            // Every thread accumulates into the diff buffers stored at its own index.
+            auto& thread_state = thread_states_[th.thread_idx()];
+            const auto momentum = th.thread_idx() == 0 ? momentum_ : 0.0f;
+#if defined(USE_BLAS)
+            // Input gradients: gradients_ = weights_ * gradients (beta 0, so overwritten).
+            cblas_sgemm(
+                CblasColMajor, CblasNoTrans, CblasNoTrans,
+                kInputDimensions, count, kOutputDimensions,
+                1.0,
+                weights_, kInputDimensions,
+                gradients + offset * kOutputDimensions, kOutputDimensions,
+                0.0,
+                &gradients_[offset * kInputDimensions], kInputDimensions
+            );
+            // Bias gradient: add each sample's output gradient to biases_diff_.
+            for (IndexType b = offset; b < offset + count; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                cblas_saxpy(
+                    kOutputDimensions, 1.0,
+                    &gradients[batch_offset], 1, thread_state.biases_diff_, 1
+                );
+            }
+            // weights_diff_ = gradients^T * inputs + momentum * weights_diff_ (momentum is 0 off thread 0).
+            cblas_sgemm(
+                CblasRowMajor, CblasTrans, CblasNoTrans,
+                kOutputDimensions, kInputDimensions, count,
+                1.0,
+                gradients + offset * kOutputDimensions, kOutputDimensions,
+                combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
+                momentum,
+                thread_state.weights_diff_, kInputDimensions
+            );
+
+#else
+            // Same three steps as the USE_BLAS branch, using the project's Blas:: routines.
+            // Input gradients: gradients_ = weights_ * gradients (beta 0, so overwritten).
+            Blas::sgemm(
+                Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::NoTrans, Blas::MatrixTranspose::NoTrans,
+                kInputDimensions, count, kOutputDimensions,
+                1.0,
+                weights_, kInputDimensions,
+                gradients + offset * kOutputDimensions, kOutputDimensions,
+                0.0,
+                &gradients_[offset * kInputDimensions], kInputDimensions
+            );
+            // Bias gradient: add each sample's output gradient to biases_diff_.
+            for (IndexType b = offset; b < offset + count; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                Blas::saxpy(kOutputDimensions, 1.0,
+                          &gradients[batch_offset], 1, thread_state.biases_diff_, 1);
+            }
+            // weights_diff_ = gradients^T * inputs + momentum * weights_diff_ (momentum is 0 off thread 0).
+            Blas::sgemm(
+                Blas::MatrixLayout::RowMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
+                kOutputDimensions, kInputDimensions, count,
+                1.0,
+                gradients + offset * kOutputDimensions, kOutputDimensions,
+                combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
+                momentum,
+                thread_state.weights_diff_, kInputDimensions
+            );
+
+#endif
+            // Hand the input gradients of this slice to the previous layer's trainer.
+            previous_layer_trainer_->backpropagate(th, gradients_.data(), offset, count);
+        }
+
+        void reduce_thread_state()
+        {
+            // Fold the diff buffers of slots 1..N-1 into slot 0.
+            const auto num_states = thread_states_.size();
+            for (IndexType idx = 1; idx < num_states; ++idx)
+                thread_states_.front() += thread_states_[idx];
+        }
+
+        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate)
+        {
+            // Merge all per-thread diffs into slot 0 before applying them.
+            reduce_thread_state();
+            const auto& merged = thread_states_[0];
+            const LearnFloatType step_size = learning_rate * learning_rate_scale_;
+            // Biases: apply the step and record its magnitude for check_health().
+            for (IndexType o = 0; o < kOutputDimensions; ++o) {
+                const double delta = step_size * merged.biases_diff_[o];
+                biases_[o] -= delta;
+                abs_biases_diff_sum_ += std::abs(delta);
+            }
+            num_biases_diffs_ += kOutputDimensions;
+
+            // Weights: the same update over the flattened weight array.
+            constexpr IndexType kNumWeights = kOutputDimensions * kInputDimensions;
+            for (IndexType w = 0; w < kNumWeights; ++w) {
+                const double delta = step_size * merged.weights_diff_[w];
+                weights_[w] -= delta;
+                abs_weights_diff_sum_ += std::abs(delta);
+            }
+            num_weights_diffs_ += kNumWeights;
+
+            previous_layer_trainer_->step_end(thread_pool, learning_rate);
+        }
+
+    private:
+        // Constructor: creates the previous layer's trainer and sets the default hyperparameters
+        Trainer(LayerType* target_layer, FeatureTransformer* ft) :
+            combined_batch_size_(0),
+            combined_batch_input_(nullptr),
+            previous_layer_trainer_(Trainer<PreviousLayer>::create(
+                &target_layer->previous_layer_, ft)),
+            target_layer_(target_layer),
+            biases_(),   // value-initialized here, then overwritten by dequantize_parameters()
+            weights_(),
+            momentum_(0.2),
+            learning_rate_scale_(1.0) {
+            // Load the float parameters from target_layer and reset the statistics.
+            dequantize_parameters();
+        }
+
+        void reset_stats() {
+            // Start a new reporting window: zero the accumulated update
+            // magnitudes and the number of updates behind them.
+            abs_biases_diff_sum_ = abs_weights_diff_sum_ = 0.0;
+            num_biases_diffs_ = num_weights_diffs_ = 0;
+        }
+
+        void check_health() {
+            // Sum of |x| over a parameter array, accumulated in double.
+            const auto abs_sum = [](const auto& values) {
+                double total = 0.0;
+                for (const auto v : values)
+                    total += std::abs(v);
+                return total;
+            };
+
+            const double bias_abs_total = abs_sum(biases_);
+            const double weight_abs_total = abs_sum(weights_);
+
+            auto out = sync_region_cout.new_region();
+
+            out << "INFO (check_health):"
+                << " layer " << LayerType::kLayerIndex
+                << " - " << LayerType::get_name()
+                << std::endl;
+
+            out << "  - avg_abs_bias        = " << bias_abs_total / std::size(biases_) << std::endl;
+            out << "  - avg_abs_bias_diff   = " << abs_biases_diff_sum_ / num_biases_diffs_ << std::endl;
+            out << "  - avg_abs_weight      = " << weight_abs_total / std::size(weights_) << std::endl;
+            out << "  - avg_abs_weight_diff = " << abs_weights_diff_sum_ / num_weights_diffs_ << std::endl;
+
+            out.unlock();
+            reset_stats();
+        }
+
+        // Saturate the float weights, then store scaled and rounded parameters
+        // into the target layer's integer arrays.
+        void quantize_parameters() {
+            for (IndexType out = 0; out < kOutputDimensions; ++out) {
+                target_layer_->biases_[out] =
+                    round<typename LayerType::BiasType>(biases_[out] * kBiasScale);
+
+                // Float rows are stored densely; the layer's rows use the
+                // padded input width.
+                LearnFloatType* const row = &weights_[kInputDimensions * out];
+                auto* const quantized_row =
+                    &target_layer_->weights_[LayerType::kPaddedInputDimensions * out];
+
+                for (IndexType in = 0; in < kInputDimensions; ++in) {
+                    // The clamped value is kept in weights_ as well.
+                    row[in] = std::max(-kMaxWeightMagnitude,
+                                       std::min(+kMaxWeightMagnitude, row[in]));
+                    quantized_row[in] =
+                        round<typename LayerType::WeightType>(row[in] * kWeightScale);
+                }
+            }
+        }
+
+        // Load the float parameters from the target layer's integer arrays,
+        // then clear every per-thread diff buffer and the statistics.
+        void dequantize_parameters() {
+            for (IndexType out = 0; out < kOutputDimensions; ++out) {
+                biases_[out] = static_cast<LearnFloatType>(
+                    target_layer_->biases_[out] / kBiasScale);
+
+                // Dense float row vs. padded row in the layer.
+                const auto dense_base = kInputDimensions * out;
+                const auto padded_base = LayerType::kPaddedInputDimensions * out;
+
+                for (IndexType in = 0; in < kInputDimensions; ++in) {
+                    weights_[dense_base + in] = static_cast<LearnFloatType>(
+                        target_layer_->weights_[padded_base + in] / kWeightScale);
+                }
+            }
+
+            // Start from zeroed gradient accumulators.
+            for (auto& per_thread : thread_states_) {
+                per_thread.reset_weights();
+                per_thread.reset_biases();
+            }
+
+            reset_stats();
+        }
+
+        // Number of input/output dimensions, taken from the layer being trained
+        static constexpr IndexType kInputDimensions = LayerType::kInputDimensions;
+        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+
+        // A layer with a single output is treated as the output layer
+        static constexpr bool kIsOutputLayer = kOutputDimensions == 1;
+
+        // Coefficients used when converting between float and integer parameters
+        static constexpr LearnFloatType kActivationScale =
+            std::numeric_limits<std::int8_t>::max();
+        // Integer bias = float bias * kBiasScale (see quantize_parameters)
+        static constexpr LearnFloatType kBiasScale = kIsOutputLayer ?
+            (kPonanzaConstant * FV_SCALE) :
+            ((1 << kWeightScaleBits) * kActivationScale);
+        // Integer weight = float weight * kWeightScale (see quantize_parameters)
+        static constexpr LearnFloatType kWeightScale = kBiasScale / kActivationScale;
+
+        // Largest float weight magnitude whose scaled value still fits LayerType::WeightType
+        static constexpr LearnFloatType kMaxWeightMagnitude =
+            std::numeric_limits<typename LayerType::WeightType>::max() / kWeightScale;
+
+        // number of samples in mini-batch
+        IndexType combined_batch_size_;
+        // Statistics for check_health(): summed |applied update| and update counts (see step_end)
+        double abs_biases_diff_sum_;
+        double abs_weights_diff_sum_;
+        uint64_t num_biases_diffs_;
+        uint64_t num_weights_diffs_;
+
+        // Input mini batch (kInputDimensions floats per sample, as indexed in backpropagate)
+        const LearnFloatType* combined_batch_input_;
+
+        // Trainer of the previous layer
+        const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+
+        // Layer whose integer parameters are read and written by this trainer
+        LayerType* const target_layer_;
+
+        // Gradient accumulators; one instance per thread index.
+        struct alignas(kCacheLineSize) ThreadState
+        {
+            // Accumulated parameter gradients (diffs)
+            alignas(kCacheLineSize) LearnFloatType biases_diff_[kOutputDimensions];
+            alignas(kCacheLineSize) LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
+
+            // Both buffers start out zeroed.
+            ThreadState() { reset_weights(); reset_biases(); }
+
+            // Element-wise sum of another instance's buffers into this one.
+            ThreadState& operator+=(const ThreadState& other)
+            {
+                const LearnFloatType* src = other.biases_diff_;
+                for (auto& bias_diff : biases_diff_)
+                    bias_diff += *src++;
+
+                src = other.weights_diff_;
+                for (auto& weight_diff : weights_diff_)
+                    weight_diff += *src++;
+
+                return *this;
+            }
+
+            void reset_weights()
+            {
+                for (auto& weight_diff : weights_diff_) weight_diff = 0.0f;
+            }
+
+            void reset_biases()
+            {
+                for (auto& bias_diff : biases_diff_) bias_diff = 0.0f;
+            }
+        };
+
+        alignas(kCacheLineSize) LearnFloatType biases_[kOutputDimensions];  // float copies of the layer parameters
+        alignas(kCacheLineSize) LearnFloatType weights_[kOutputDimensions * kInputDimensions];
+        // Per-thread gradient accumulators, indexed by th.thread_idx()
+        std::vector<ThreadState, CacheLineAlignedAllocator<ThreadState>> thread_states_;
+
+        // Forward propagation buffer (kOutputDimensions floats per sample)
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
+
+        // Back propagation buffer: gradients passed to the previous layer (kInputDimensions floats per sample)
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
+
+        // Hyperparameters: momentum applied to thread 0's weights_diff_, and a multiplier on the learning rate
+        LearnFloatType momentum_;
+        LearnFloatType learning_rate_scale_;
+    };
+
+}  // namespace Eval::NNUE

 #endif
@@ -1,142 +1,356 @@
-// Specialization of NNUE evaluation function learning class template for ClippedReLU
-
-#ifndef _NNUE_TRAINER_CLIPPED_RELU_H_
+#ifndef _NNUE_TRAINER_CLIPPED_RELU_H_
 #define _NNUE_TRAINER_CLIPPED_RELU_H_

-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
-
-#include "../../learn/learn.h"
-#include "../layers/clipped_relu.h"
 #include "trainer.h"

-namespace Eval {
+#include "learn/learn.h"

-namespace NNUE {
+#include "nnue/layers/clipped_relu.h"

-// Learning: Affine transformation layer
-template <typename PreviousLayer>
-class Trainer<Layers::ClippedReLU<PreviousLayer>> {
- private:
-  // Type of layer to learn
-  using LayerType = Layers::ClippedReLU<PreviousLayer>;
+#include "thread.h"

- public:
-  // factory function
-  static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* feature_transformer) {
-    return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, feature_transformer));
-  }
+// Specialization of NNUE evaluation function learning class template for ClippedReLU
+namespace Eval::NNUE {

-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    previous_layer_trainer_->SendMessage(message);
-    if (ReceiveMessage("check_health", message)) {
-      CheckHealth();
-    }
-  }
+    // Learning: ClippedReLU activation layer
+    template <typename PreviousLayer>
+    class Trainer<Layers::ClippedReLU<PreviousLayer>> {
+    private:
+        // Type of layer to learn
+        using LayerType = Layers::ClippedReLU<PreviousLayer>;

-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    previous_layer_trainer_->Initialize(rng);
-  }
+    public:
+        // Factory function (the constructor is private); returns a shared_ptr owning the new trainer
+        static std::shared_ptr<Trainer> create(
+            LayerType* target_layer, FeatureTransformer* ft) {

-  // forward propagation
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    if (output_.size() < kOutputDimensions * batch.size()) {
-      output_.resize(kOutputDimensions * batch.size());
-      gradients_.resize(kInputDimensions * batch.size());
-    }
-    const auto input = previous_layer_trainer_->Propagate(batch);
-    batch_size_ = static_cast<IndexType>(batch.size());
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        const IndexType index = batch_offset + i;
-        output_[index] = std::max(+kZero, std::min(+kOne, input[index]));
-        min_activations_[i] = std::min(min_activations_[i], output_[index]);
-        max_activations_[i] = std::max(max_activations_[i], output_[index]);
-      }
-    }
-    return output_.data();
-  }
+            return std::shared_ptr<Trainer>(
+                new Trainer(target_layer, ft));
+        }

-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        const IndexType index = batch_offset + i;
-        gradients_[index] = gradients[index] *
-            (output_[index] > kZero) * (output_[index] < kOne);
-      }
-    }
-    previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
-  }
+        // Relay the message to the previous layer, then react to "check_health" here
+        void send_message(Message* message) {
+            previous_layer_trainer_->send_message(message);
+            const bool health_requested = receive_message("check_health", message);
+            if (health_requested)
+                check_health();
+        }

- private:
-  // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
-      batch_size_(0),
-      previous_layer_trainer_(Trainer<PreviousLayer>::Create(
-          &target_layer->previous_layer_, feature_transformer)),
-      target_layer_(target_layer) {
-    std::fill(std::begin(min_activations_), std::end(min_activations_),
-              std::numeric_limits<LearnFloatType>::max());
-    std::fill(std::begin(max_activations_), std::end(max_activations_),
-              std::numeric_limits<LearnFloatType>::lowest());
-  }
+        // Nothing is initialized in this trainer itself; the call is only forwarded to the previous layer
+        template <typename RNG>
+        void initialize(RNG& rng) {
+            previous_layer_trainer_->initialize(rng);
+        }

-  // Check if there are any problems with learning
-  void CheckHealth() {
-    const auto largest_min_activation = *std::max_element(
-        std::begin(min_activations_), std::end(min_activations_));
-    const auto smallest_max_activation = *std::min_element(
-        std::begin(max_activations_), std::end(max_activations_));
-    std::cout << "INFO: largest min activation = " << largest_min_activation
-              << ", smallest max activation = " << smallest_max_activation
-              << std::endl;
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
+        {   // Per-batch setup; returns the buffer that propagate() fills with this layer's output.
+            const auto size = static_cast<std::size_t>(batch_end - batch_begin);

-    std::fill(std::begin(min_activations_), std::end(min_activations_),
-              std::numeric_limits<LearnFloatType>::max());
-    std::fill(std::begin(max_activations_), std::end(max_activations_),
-              std::numeric_limits<LearnFloatType>::lowest());
-  }
+            if (output_.size() < kOutputDimensions * size) {  // size_t compare: no (long) truncation where long is 32-bit
+                output_.resize(kOutputDimensions * size);
+                gradients_.resize(kInputDimensions * size);
+            }

-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions = LayerType::kOutputDimensions;
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+            if (thread_states_.size() < thread_pool.size())
+            {
+                thread_states_.resize(thread_pool.size());  // one statistics slot per thread
+            }

-  // LearnFloatType constant
-  static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
-  static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
+            input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);  // previous layer's output buffer

-  // number of samples in mini-batch
-  IndexType batch_size_;
+            batch_size_ = static_cast<IndexType>(size);

-  // Trainer of the previous layer
-  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+            return output_.data();
+        }

-  // layer to learn
-  LayerType* const target_layer_;
+        // Forward propagation for samples [offset, offset + count): clamps the previous layer's output to [kZero, kOne]
+        void propagate(Thread& th, const uint64_t offset, const uint64_t count) {

-  // Forward propagation buffer
-  std::vector<LearnFloatType> output_;
+            auto& thread_state = thread_states_[th.thread_idx()];  // this thread's statistics slot

-  // buffer for back propagation
-  std::vector<LearnFloatType> gradients_;
+            previous_layer_trainer_->propagate(th, offset, count);  // run the previous layer on the same slice first

-  // Health check statistics
-  LearnFloatType min_activations_[kOutputDimensions];
-  LearnFloatType max_activations_[kOutputDimensions];
-};
+#if defined (USE_SSE2)

-}  // namespace NNUE
+            {
+                static_assert(kOutputDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");

-}  // namespace Eval
+                const __m128 kZero4 = _mm_set1_ps(+kZero);
+                const __m128 kOne4 = _mm_set1_ps(+kOne);

-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+                for (IndexType b = offset; b < offset + count; ++b)
+                {
+                    const IndexType batch_offset = kOutputDimensions * b;
+                    // 16 floats (four 128-bit registers) per iteration
+                    for (IndexType i = 0; i < kOutputDimensions; i += 16)
+                    {
+                        __m128 out0 = _mm_loadu_ps(&input_[i + 0 + batch_offset]);
+                        __m128 out1 = _mm_loadu_ps(&input_[i + 4 + batch_offset]);
+                        __m128 out2 = _mm_loadu_ps(&input_[i + 8 + batch_offset]);
+                        __m128 out3 = _mm_loadu_ps(&input_[i + 12 + batch_offset]);
+                        // Clamp: max(0, min(1, x))
+                        out0 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out0));
+                        out1 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out1));
+                        out2 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out2));
+                        out3 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out3));
+
+                        _mm_storeu_ps(&output_[i + 0 + batch_offset], out0);
+                        _mm_storeu_ps(&output_[i + 4 + batch_offset], out1);
+                        _mm_storeu_ps(&output_[i + 8 + batch_offset], out2);
+                        _mm_storeu_ps(&output_[i + 12 + batch_offset], out3);
+                        // Update this thread's per-output min/max activation statistics
+                        __m128 minact0 = _mm_loadu_ps(&thread_state.min_activations_[i + 0]);
+                        __m128 minact1 = _mm_loadu_ps(&thread_state.min_activations_[i + 4]);
+                        __m128 minact2 = _mm_loadu_ps(&thread_state.min_activations_[i + 8]);
+                        __m128 minact3 = _mm_loadu_ps(&thread_state.min_activations_[i + 12]);
+
+                        __m128 maxact0 = _mm_loadu_ps(&thread_state.max_activations_[i + 0]);
+                        __m128 maxact1 = _mm_loadu_ps(&thread_state.max_activations_[i + 4]);
+                        __m128 maxact2 = _mm_loadu_ps(&thread_state.max_activations_[i + 8]);
+                        __m128 maxact3 = _mm_loadu_ps(&thread_state.max_activations_[i + 12]);
+
+                        minact0 = _mm_min_ps(out0, minact0);
+                        minact1 = _mm_min_ps(out1, minact1);
+                        minact2 = _mm_min_ps(out2, minact2);
+                        minact3 = _mm_min_ps(out3, minact3);
+
+                        maxact0 = _mm_max_ps(out0, maxact0);
+                        maxact1 = _mm_max_ps(out1, maxact1);
+                        maxact2 = _mm_max_ps(out2, maxact2);
+                        maxact3 = _mm_max_ps(out3, maxact3);
+
+                        _mm_storeu_ps(&thread_state.min_activations_[i + 0], minact0);
+                        _mm_storeu_ps(&thread_state.min_activations_[i + 4], minact1);
+                        _mm_storeu_ps(&thread_state.min_activations_[i + 8], minact2);
+                        _mm_storeu_ps(&thread_state.min_activations_[i + 12], minact3);
+
+                        _mm_storeu_ps(&thread_state.max_activations_[i + 0], maxact0);
+                        _mm_storeu_ps(&thread_state.max_activations_[i + 4], maxact1);
+                        _mm_storeu_ps(&thread_state.max_activations_[i + 8], maxact2);
+                        _mm_storeu_ps(&thread_state.max_activations_[i + 12], maxact3);
+                    }
+                }
+            }
+
+#else
+            // Scalar fallback: the same clamp and statistics, one element at a time
+            for (IndexType b = offset; b < offset + count; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    const IndexType index = batch_offset + i;
+                    output_[index] = std::max(+kZero, std::min(+kOne, input_[index]));
+                    thread_state.min_activations_[i] = std::min(thread_state.min_activations_[i], output_[index]);
+                    thread_state.max_activations_[i] = std::max(thread_state.max_activations_[i], output_[index]);
+                }
+            }
+
+#endif
+        }
+
+        // Backpropagation for samples [offset, offset + count): passes a gradient through only where the output was not clipped
+        void backpropagate(Thread& th,
+                           const LearnFloatType* gradients,
+                           const uint64_t offset,
+                           const uint64_t count) {
+            // Clipping counters are kept per thread and merged in check_health()
+            auto& thread_state = thread_states_[th.thread_idx()];
+
+#if defined (USE_SSE2)
+
+            {
+                static_assert(kOutputDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
+
+                const __m128 kZero4 = _mm_set1_ps(+kZero);
+                const __m128 kOne4 = _mm_set1_ps(+kOne);
+
+                for (IndexType b = offset; b < offset + count; ++b)
+                {
+                    const IndexType batch_offset = kOutputDimensions * b;
+                    // 16 floats (four 128-bit registers) per iteration
+                    for (IndexType i = 0; i < kOutputDimensions; i += 16)
+                    {
+                        __m128 out0 = _mm_loadu_ps(&output_[batch_offset + i + 0]);
+                        __m128 out1 = _mm_loadu_ps(&output_[batch_offset + i + 4]);
+                        __m128 out2 = _mm_loadu_ps(&output_[batch_offset + i + 8]);
+                        __m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]);
+                        // All-ones lanes where the stored output is <= 0 or >= 1
+                        __m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4));
+                        __m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4));
+                        __m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4));
+                        __m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), _mm_cmpge_ps(out3, kOne4));
+
+                        __m128 grad0 = _mm_loadu_ps(&gradients[batch_offset + i + 0]);
+                        __m128 grad1 = _mm_loadu_ps(&gradients[batch_offset + i + 4]);
+                        __m128 grad2 = _mm_loadu_ps(&gradients[batch_offset + i + 8]);
+                        __m128 grad3 = _mm_loadu_ps(&gradients[batch_offset + i + 12]);
+                        // (~clipped) & grad: zero the gradient in clipped lanes
+                        grad0 = _mm_andnot_ps(clipped0, grad0);
+                        grad1 = _mm_andnot_ps(clipped1, grad1);
+                        grad2 = _mm_andnot_ps(clipped2, grad2);
+                        grad3 = _mm_andnot_ps(clipped3, grad3);
+
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 0], grad0);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 4], grad1);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 8], grad2);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 12], grad3);
+                        // Pack the four 4-bit lane masks into 16 bits and count the clipped lanes
+                        const int clipped_mask =
+                            (_mm_movemask_ps(clipped0) << 0)
+                            | (_mm_movemask_ps(clipped1) << 4)
+                            | (_mm_movemask_ps(clipped2) << 8)
+                            | (_mm_movemask_ps(clipped3) << 12);
+
+                        thread_state.num_clipped_ += popcount(clipped_mask);
+                    }
+                }
+            }
+
+#else
+            // Scalar fallback: the same masking and counting, one element at a time
+            for (IndexType b = offset; b < offset + count; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    const IndexType index = batch_offset + i;
+                    const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne);
+                    gradients_[index] = gradients[index] * !clipped;
+                    thread_state.num_clipped_ += clipped;
+                }
+            }
+
+#endif
+
+            thread_state.num_total_ += count * kOutputDimensions;
+            // Hand the masked gradients of this slice to the previous layer's trainer
+            previous_layer_trainer_->backpropagate(th, gradients_.data(), offset, count);
+        }
+
+        void reduce_thread_state()
+        {
+            // Merge the statistics of slots 1..N-1 into slot 0.
+            IndexType slot = 1;
+            while (slot < thread_states_.size())
+                thread_states_[0] += thread_states_[slot++];
+        }
+
+        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate)
+        {   // Nothing is updated in this trainer itself; the call is only forwarded to the previous layer.
+            previous_layer_trainer_->step_end(thread_pool, learning_rate);
+        }
+
+    private:
+        // Constructor: zero/null every scalar member and create the previous layer's trainer
+        Trainer(LayerType* target_layer, FeatureTransformer* ft) :
+            batch_size_(0), num_total_(0), input_(nullptr),
+            previous_layer_trainer_(Trainer<PreviousLayer>::create(
+                &target_layer->previous_layer_, ft)),
+            target_layer_(target_layer) {
+            // thread_states_ is empty at this point, so this currently resets nothing
+            reset_stats();
+        }
+
+        void reset_stats() {
+            for (auto it = thread_states_.begin(); it != thread_states_.end(); ++it)
+                it->reset();   // every slot back to its freshly constructed state
+        }
+
+        // Check if there are any problems with learning: merge the per-thread
+        // statistics into slot 0, print them and start a new reporting window.
+        void check_health() {
+            // thread_states_ is only populated by step_start(); before the first
+            // step there is nothing to report and indexing slot 0 would be invalid.
+            if (thread_states_.empty())
+                return;
+
+            reduce_thread_state();
+            auto& main_thread_state = thread_states_[0];
+
+            const auto largest_min_activation = *std::max_element(
+                std::begin(main_thread_state.min_activations_), std::end(main_thread_state.min_activations_));
+            const auto smallest_max_activation = *std::min_element(
+                std::begin(main_thread_state.max_activations_), std::end(main_thread_state.max_activations_));
+
+            auto out = sync_region_cout.new_region();
+            out << "INFO (check_health):"
+                << " layer " << LayerType::kLayerIndex
+                << " - " << LayerType::get_name()
+                << std::endl;
+            out << "  - largest min activation = " << largest_min_activation
+                << " , smallest max activation = " << smallest_max_activation
+                << std::endl;
+            out << "  - clipped " << static_cast<double>(main_thread_state.num_clipped_) / main_thread_state.num_total_ * 100.0 << "% of outputs"
+                << std::endl;
+            out.unlock();
+
+            reset_stats();
+        }
+
+        // Number of input/output dimensions; both are LayerType::kOutputDimensions (element-wise layer)
+        static constexpr IndexType kInputDimensions = LayerType::kOutputDimensions;
+        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+
+        // Clipping bounds as LearnFloatType constants
+        static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
+        static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
+
+        // number of samples in mini-batch
+        IndexType batch_size_;
+        // NOTE(review): never read by the methods of this class; ThreadState keeps its own num_total_
+        IndexType num_total_;
+        // Output buffer of the previous layer for the current batch; set in step_start()
+        const LearnFloatType* input_;
+
+        // Trainer of the previous layer
+        const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+
+        // layer to learn
+        LayerType* const target_layer_;
+
+        // Forward propagation buffer (clamped activations)
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
+
+        // Back propagation buffer: gradients passed to the previous layer
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
+
+        struct alignas(kCacheLineSize) ThreadState
+        {
+            // Health check statistics: per-output activation extremes
+            LearnFloatType min_activations_[kOutputDimensions];
+            LearnFloatType max_activations_[kOutputDimensions];
+            // 64-bit counters: backpropagate() adds a uint64_t element count per
+            // call, which IndexType may be too narrow to hold without truncation.
+            uint64_t num_clipped_;
+            uint64_t num_total_;
+
+            ThreadState() { reset(); }
+
+            // Merge another instance's statistics into this one.
+            ThreadState& operator+=(const ThreadState& other)
+            {
+                for (IndexType i = 0; i < kOutputDimensions; ++i)
+                {
+                    min_activations_[i] = std::min(min_activations_[i], other.min_activations_[i]);
+                    max_activations_[i] = std::max(max_activations_[i], other.max_activations_[i]);
+                }
+
+                num_clipped_ += other.num_clipped_;
+                num_total_ += other.num_total_;
+
+                return *this;
+            }
+
+            // Restore the "nothing observed yet" state.
+            void reset()
+            {
+                std::fill(std::begin(min_activations_), std::end(min_activations_), std::numeric_limits<float>::max());
+                std::fill(std::begin(max_activations_), std::end(max_activations_), std::numeric_limits<float>::lowest());
+                num_clipped_ = 0;
+                num_total_ = 0;
+            }
+        };
+
+        std::vector<ThreadState, CacheLineAlignedAllocator<ThreadState>> thread_states_;
+    };
+
+}  // namespace Eval::NNUE

 #endif
@@ -1,251 +1,377 @@
-// Specialization of NNUE evaluation function learning class template for InputSlice
-
-#ifndef _NNUE_TRAINER_INPUT_SLICE_H_
+#ifndef _NNUE_TRAINER_INPUT_SLICE_H_
 #define _NNUE_TRAINER_INPUT_SLICE_H_

-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
-
-#include "../../learn/learn.h"
-#include "../layers/input_slice.h"
 #include "trainer.h"

-namespace Eval {
+#include "extra/stockfish_blas.h"

-namespace NNUE {
+#include "learn/learn.h"

-// Learning: Input layer
-class SharedInputTrainer {
- public:
-  // factory function
-  static std::shared_ptr<SharedInputTrainer> Create(
-      FeatureTransformer* feature_transformer) {
-    static std::shared_ptr<SharedInputTrainer> instance;
-    if (!instance) {
-      instance.reset(new SharedInputTrainer(feature_transformer));
-    }
-    ++instance->num_referrers_;
-    return instance;
-  }
+#include "nnue/layers/input_slice.h"

-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    if (num_calls_ == 0) {
-      current_operation_ = Operation::kSendMessage;
-      feature_transformer_trainer_->SendMessage(message);
-    }
-    assert(current_operation_ == Operation::kSendMessage);
-    if (++num_calls_ == num_referrers_) {
-      num_calls_ = 0;
-      current_operation_ = Operation::kNone;
-    }
-  }
+#include "thread.h"

-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    if (num_calls_ == 0) {
-      current_operation_ = Operation::kInitialize;
-      feature_transformer_trainer_->Initialize(rng);
-    }
-    assert(current_operation_ == Operation::kInitialize);
-    if (++num_calls_ == num_referrers_) {
-      num_calls_ = 0;
-      current_operation_ = Operation::kNone;
-    }
-  }
+// Specialization of NNUE evaluation function learning class template for InputSlice
+namespace Eval::NNUE {

-  // forward propagation
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    if (gradients_.size() < kInputDimensions * batch.size()) {
-      gradients_.resize(kInputDimensions * batch.size());
-    }
-    batch_size_ = static_cast<IndexType>(batch.size());
-    if (num_calls_ == 0) {
-      current_operation_ = Operation::kPropagate;
-      output_ = feature_transformer_trainer_->Propagate(batch);
-    }
-    assert(current_operation_ == Operation::kPropagate);
-    if (++num_calls_ == num_referrers_) {
-      num_calls_ = 0;
-      current_operation_ = Operation::kNone;
-    }
-    return output_;
-  }
+    // Learning: Input layer
+    // This is tricky. It exists because when there's more than one trainer
+    // on top of a single feature transformer we want to only call propagate/backpropagate
+    // on the feature transformer once. This is straightforward in the old
+    // multithreading case, because propagate/backpropagate is called just once from the
+    // main thread. But with the current implementation of coarser multithreading
+    // we end up calling each method from each thread. Therefore we have to keep
+    // the num_calls and current_operation on a per-thread basis, each thread must work
+    // on its designated batch slice, and the only synchronization points are
+    // step_start and step_end - for which we use state of the first thread.
+    // Each thread requires their own bookkeeping because it's possible that
+    // one thread is still in propagate of some batch slice while the other thread
+    // is doing backpropagate of some other slice. We also ensure the thread state
+    // isn't susceptible to false sharing by using a full cache line for the state.
+    class SharedInputTrainer {
+    public:
+        // factory function
+        static std::shared_ptr<SharedInputTrainer> create(
+            FeatureTransformer* ft) {

-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    if (num_referrers_ == 1) {
-      feature_transformer_trainer_->Backpropagate(gradients, learning_rate);
-      return;
-    }
-    if (num_calls_ == 0) {
-      current_operation_ = Operation::kBackPropagate;
-      for (IndexType b = 0; b < batch_size_; ++b) {
-        const IndexType batch_offset = kInputDimensions * b;
-        for (IndexType i = 0; i < kInputDimensions; ++i) {
-          gradients_[batch_offset + i] = static_cast<LearnFloatType>(0.0);
+            static std::shared_ptr<SharedInputTrainer> instance;
+
+            if (!instance) {
+                instance.reset(new SharedInputTrainer(ft));
+            }
+
+            ++instance->num_referrers_;
+
+            return instance;
        }
-      }
-    }
-    assert(current_operation_ == Operation::kBackPropagate);
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kInputDimensions * b;
-      for (IndexType i = 0; i < kInputDimensions; ++i) {
-        gradients_[batch_offset + i] += gradients[batch_offset + i];
-      }
-    }
-    if (++num_calls_ == num_referrers_) {
-      feature_transformer_trainer_->Backpropagate(
-          gradients_.data(), learning_rate);
-      num_calls_ = 0;
-      current_operation_ = Operation::kNone;
-    }
-  }

- private:
-  // constructor
-  SharedInputTrainer(FeatureTransformer* feature_transformer) :
-      batch_size_(0),
-      num_referrers_(0),
-      num_calls_(0),
-      current_operation_(Operation::kNone),
-      feature_transformer_trainer_(Trainer<FeatureTransformer>::Create(
-          feature_transformer)),
-      output_(nullptr) {
-  }
+        // Set options such as hyperparameters
+        void send_message(Message* message) {
+            auto& thread_state = thread_states_[0];

-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions =
-      FeatureTransformer::kOutputDimensions;
+            if (thread_state.num_calls == 0) {
+                thread_state.current_operation = Operation::kSendMessage;
+                feature_transformer_trainer_->send_message(message);
+            }

-  // type of processing
-  enum class Operation {
-    kNone,
-    kSendMessage,
-    kInitialize,
-    kPropagate,
-    kBackPropagate,
-  };
+            assert(thread_state.current_operation == Operation::kSendMessage);

-  // number of samples in mini-batch
-  IndexType batch_size_;
+            if (++thread_state.num_calls == num_referrers_) {
+                thread_state.num_calls = 0;
+                thread_state.current_operation = Operation::kNone;
+            }
+        }

-  // number of layers sharing this layer as input
-  std::uint32_t num_referrers_;
+        // Initialize the parameters with random numbers
+        template <typename RNG>
+        void initialize(RNG& rng) {
+            auto& thread_state = thread_states_[0];

-  // Number of times the current process has been called
-  std::uint32_t num_calls_;
+            if (thread_state.num_calls == 0) {
+                thread_state.current_operation = Operation::kInitialize;
+                feature_transformer_trainer_->initialize(rng);
+            }

-  // current processing type
-  Operation current_operation_;
+            assert(thread_state.current_operation == Operation::kInitialize);

-  // Trainer of input feature converter
-  const std::shared_ptr<Trainer<FeatureTransformer>>
-      feature_transformer_trainer_;
+            if (++thread_state.num_calls == num_referrers_) {
+                thread_state.num_calls = 0;
+                thread_state.current_operation = Operation::kNone;
+            }
+        }

-  // pointer to output shared for forward propagation
-  const LearnFloatType* output_;
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
+        {
+            const auto size = batch_end - batch_begin;

-  // buffer for back propagation
-  std::vector<LearnFloatType> gradients_;
-};
+            if ((long)gradients_.size() < (long)kInputDimensions * size) {
+                gradients_.resize(kInputDimensions * size);
+            }

-// Learning: Input layer
-template <IndexType OutputDimensions, IndexType Offset>
-class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
- private:
-  // Type of layer to learn
-  using LayerType = Layers::InputSlice<OutputDimensions, Offset>;
+            if (thread_states_.size() < thread_pool.size())
+            {
+                thread_states_.resize(thread_pool.size());
+            }

- public:
-  // factory function
-  static std::shared_ptr<Trainer> Create(
-      LayerType* /*target_layer*/, FeatureTransformer* feature_transformer) {
-    return std::shared_ptr<Trainer>(new Trainer(feature_transformer));
-  }
+            batch_size_ = size;

-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    shared_input_trainer_->SendMessage(message);
-  }
+            auto& thread_state = thread_states_[0];

-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    shared_input_trainer_->Initialize(rng);
-  }
+            if (thread_state.num_calls == 0) {
+                thread_state.current_operation = Operation::kStepStart;
+                output_ = feature_transformer_trainer_->step_start(thread_pool, batch_begin, batch_end);
+            }
+
+            assert(thread_state.current_operation == Operation::kStepStart);
+
+            if (++thread_state.num_calls == num_referrers_) {
+                thread_state.num_calls = 0;
+                thread_state.current_operation = Operation::kNone;
+            }
+
+            return output_;
+        }
+
+        // forward propagation
+        void propagate(Thread& th, uint64_t offset, uint64_t count) {
+            const auto thread_id = th.thread_idx();
+            // Bookkeeping is per thread: each thread counts its own referrer calls.
+            auto& thread_state = thread_states_[thread_id];
+
+            if (thread_state.num_calls == 0) {  // first call of the round: do the real work once
+                thread_state.current_operation = Operation::kPropagate;
+                feature_transformer_trainer_->propagate(th, offset, count);
+            }
+
+            assert(thread_state.current_operation == Operation::kPropagate);  // no other operation may be in flight on this thread
+
+            if (++thread_state.num_calls == num_referrers_) {  // last referrer: reset for the next operation
+                thread_state.num_calls = 0;
+                thread_state.current_operation = Operation::kNone;
+            }
+        }
+
+        // backpropagation
+        void backpropagate(Thread& th,
+                           const LearnFloatType* gradients,
+                           uint64_t offset,
+                           uint64_t count) {
+            // Sums the gradients passed by each referrer for samples [offset, offset + count), then forwards the sum.
+            const auto thread_id = th.thread_idx();
+
+            auto& thread_state = thread_states_[thread_id];
+
+            if (num_referrers_ == 1) {  // single referrer: nothing to sum, pass the gradients straight through
+                feature_transformer_trainer_->backpropagate(th, gradients, offset, count);
+                return;
+            }
+
+            if (thread_state.num_calls == 0) {  // first referrer of the round: clear this slice of the accumulator
+                thread_state.current_operation = Operation::kBackPropagate;
+                for (IndexType b = offset; b < offset + count; ++b) {
+                    const IndexType batch_offset = kInputDimensions * b;
+                    for (IndexType i = 0; i < kInputDimensions; ++i) {
+                        gradients_[batch_offset + i] = static_cast<LearnFloatType>(0.0);
+                    }
+                }
+            }
+
+            assert(thread_state.current_operation == Operation::kBackPropagate);
+            // Add this referrer's gradients into the accumulator (same slice only).
+            for (IndexType b = offset; b < offset + count; ++b) {
+                const IndexType batch_offset = kInputDimensions * b;
+                for (IndexType i = 0; i < kInputDimensions; ++i) {
+                    gradients_[batch_offset + i] += gradients[batch_offset + i];
+                }
+            }
+
+            if (++thread_state.num_calls == num_referrers_) {  // last referrer: hand the summed slice to the feature transformer
+                feature_transformer_trainer_->backpropagate(
+                    th, gradients_.data(), offset, count);
+                thread_state.num_calls = 0;
+                thread_state.current_operation = Operation::kNone;
+            }
+        }
+
+        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
+            auto& thread_state = thread_states_[0];  // synchronization point: always uses the first thread's state
+
+            if (thread_state.num_calls == 0) {  // only the first referrer's call reaches the feature transformer
+                thread_state.current_operation = Operation::kStepEnd;
+                feature_transformer_trainer_->step_end(thread_pool, learning_rate);
+            }
+
+            assert(thread_state.current_operation == Operation::kStepEnd);
+
+            if (++thread_state.num_calls == num_referrers_) {  // every referrer has called: reset the counter
+                thread_state.num_calls = 0;
+                thread_state.current_operation = Operation::kNone;
+            }
+        }
+
+    private:
+        // constructor
+        SharedInputTrainer(FeatureTransformer* ft) :
+            batch_size_(0),
+            num_referrers_(0),  // incremented by create() for every trainer that shares this instance
+            thread_states_(1),  // slot 0 must exist before step_start() sizes the vector to the thread pool
+            feature_transformer_trainer_(Trainer<FeatureTransformer>::create(
+                ft)),
+            output_(nullptr) {  // set by step_start()
+        }
+
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions =
+            FeatureTransformer::kOutputDimensions;
+
+        // type of processing
+        enum class Operation {
+            kNone,           // no operation in flight; restored after the last referrer's call
+            kSendMessage,    // set by send_message()
+            kInitialize,     // set by initialize()
+            kStepStart,      // set by step_start()
+            kPropagate,      // set by propagate()
+            kBackPropagate,  // set by backpropagate()
+            kStepEnd,        // set by step_end()
+        };
+
+        // number of samples in mini-batch
+        IndexType batch_size_;
+
+        // number of layers sharing this layer as input
+        std::uint32_t num_referrers_;
+
+        struct alignas(kCacheLineSize) ThreadState  // cache-line aligned so per-thread states do not share a line
+        {
+            std::uint32_t num_calls{0};  // referrer calls seen so far for the current operation
+
+            // operation currently in progress on this thread (kNone between operations)
+            Operation current_operation = Operation::kNone;
+        };
+
+        // Per-thread call counters and operation tags; slot 0 also serves send_message/initialize/step_start/step_end
+        std::vector<ThreadState, CacheLineAlignedAllocator<ThreadState>> thread_states_;
+
+        // Trainer of input feature converter
+        const std::shared_ptr<Trainer<FeatureTransformer>>
+            feature_transformer_trainer_;
+
+        // pointer to output shared for forward propagation
+        const LearnFloatType* output_;
+
+        // buffer for back propagation
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
+    };
+
+    // Learning: Input layer
+    template <IndexType OutputDimensions, IndexType Offset>
+    class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
+    private:
+        // Type of layer to learn
+        using LayerType = Layers::InputSlice<OutputDimensions, Offset>;
+
+    public:
+        // factory function
+        static std::shared_ptr<Trainer> create(
+            LayerType* /*target_layer*/, FeatureTransformer* ft) {
+            // The layer pointer is unused: the trainer is built from the feature transformer alone.
+            return std::shared_ptr<Trainer>(new Trainer(ft));
+        }
+
+        // Set options such as hyperparameters
+        void send_message(Message* message) {  // pure delegation to the shared input trainer
+            shared_input_trainer_->send_message(message);
+        }
+
+        // Initialize the parameters with random numbers
+        template <typename RNG>
+        void initialize(RNG& rng) {  // pure delegation to the shared input trainer
+            shared_input_trainer_->initialize(rng);
+        }
+
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
+        {  // Sizes the per-batch buffers and starts the step on the shared input trainer.
+            const auto size = batch_end - batch_begin;  // number of examples in the batch
+            // Buffers only grow. Signed 64-bit compare: plain long is 32 bits on LLP64 targets.
+            if (static_cast<long long>(output_.size()) < static_cast<long long>(kOutputDimensions) * size) {
+                output_.resize(kOutputDimensions * size);
+                gradients_.resize(kInputDimensions * size);
+            }
+
+            batch_size_ = static_cast<IndexType>(size);
+            // Keep the shared input pointer; propagate() copies this layer's slice out of it.
+            input_ = shared_input_trainer_->step_start(thread_pool, batch_begin, batch_end);
+
+            return output_.data();  // output buffer that propagate() fills
+        }
+
+        // forward propagation
+        void propagate(Thread& th, uint64_t offset, uint64_t count) {
+
+            shared_input_trainer_->propagate(th, offset, count);
+
+            for (IndexType b = offset; b < offset + count; ++b) {
+                const IndexType input_offset = kInputDimensions * b;
+                const IndexType output_offset = kOutputDimensions * b;

-  // forward propagation
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    if (output_.size() < kOutputDimensions * batch.size()) {
-      output_.resize(kOutputDimensions * batch.size());
-      gradients_.resize(kInputDimensions * batch.size());
-    }
-    batch_size_ = static_cast<IndexType>(batch.size());
-    const auto input = shared_input_trainer_->Propagate(batch);
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType input_offset = kInputDimensions * b;
-      const IndexType output_offset = kOutputDimensions * b;
 #if defined(USE_BLAS)
-      cblas_scopy(kOutputDimensions, &input[input_offset + Offset], 1,
-                  &output_[output_offset], 1);
+
+                cblas_scopy(
+                    kOutputDimensions, &input_[input_offset + Offset], 1,
+                    &output_[output_offset], 1
+                );
 #else
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        output_[output_offset + i] = input[input_offset + Offset + i];
-      }
-#endif
-    }
-    return output_.data();
-  }

-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType input_offset = kInputDimensions * b;
-      const IndexType output_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kInputDimensions; ++i) {
-        if (i < Offset || i >= Offset + kOutputDimensions) {
-          gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
-        } else {
-          gradients_[input_offset + i] = gradients[output_offset + i - Offset];
+                Blas::scopy(
+                    kOutputDimensions, &input_[input_offset + Offset], 1,
+                    &output_[output_offset], 1
+                );
+
+#endif
+            }
        }
-      }
-    }
-    shared_input_trainer_->Backpropagate(gradients_.data(), learning_rate);
-  }

- private:
-  // constructor
-  Trainer(FeatureTransformer* feature_transformer):
-      batch_size_(0),
-      shared_input_trainer_(SharedInputTrainer::Create(feature_transformer)) {
-  }
+        // backpropagation
+        void backpropagate(Thread& th,
+                           const LearnFloatType* gradients,
+                           uint64_t offset,
+                           uint64_t count) {

-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions =
-      FeatureTransformer::kOutputDimensions;
-  static constexpr IndexType kOutputDimensions = OutputDimensions;
-  static_assert(Offset + kOutputDimensions <= kInputDimensions, "");
+            for (IndexType b = offset; b < offset + count; ++b)
+            {
+                const IndexType input_offset = kInputDimensions * b;
+                const IndexType output_offset = kOutputDimensions * b;

-  // number of samples in mini-batch
-  IndexType batch_size_;
+                IndexType i = 0;
+                for (; i < Offset; ++i) {
+                    gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
+                }

-  // Trainer of shared input layer
-  const std::shared_ptr<SharedInputTrainer> shared_input_trainer_;
+                for (; i < Offset + kOutputDimensions; ++i) {
+                    gradients_[input_offset + i] = gradients[output_offset + i - Offset];
+                }

-  // Forward propagation buffer
-  std::vector<LearnFloatType> output_;
+                for (; i < kInputDimensions; ++i)
+                {
+                    gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
+                }
+            }

-  // buffer for back propagation
-  std::vector<LearnFloatType> gradients_;
-};
+            shared_input_trainer_->backpropagate(th, gradients_.data(), offset, count);
+        }

-}  // namespace NNUE
+        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {  // pure delegation to the shared input trainer
+            shared_input_trainer_->step_end(thread_pool, learning_rate);
+        }

-}  // namespace Eval
+    private:
+        // constructor
+        Trainer(FeatureTransformer* ft) :
+            batch_size_(0),
+            shared_input_trainer_(SharedInputTrainer::create(ft)) {  // create() returns the shared instance and counts this trainer as a referrer
+        }

-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions =
+            FeatureTransformer::kOutputDimensions;
+        static constexpr IndexType kOutputDimensions = OutputDimensions;
+        static_assert(Offset + kOutputDimensions <= kInputDimensions, "");
+
+        // number of samples in mini-batch
+        IndexType batch_size_;
+
+        const LearnFloatType* input_;
+
+        // Trainer of shared input layer
+        const std::shared_ptr<SharedInputTrainer> shared_input_trainer_;
+
+        // Forward propagation buffer
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
+
+        // buffer for back propagation
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
+    };
+
+}  // namespace Eval::NNUE

 #endif
@@ -1,190 +1,201 @@
-// Specialization of NNUE evaluation function learning class template for Sum
-
-#ifndef _NNUE_TRAINER_SUM_H_
+#ifndef _NNUE_TRAINER_SUM_H_
 #define _NNUE_TRAINER_SUM_H_

-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
-
-#include "../../learn/learn.h"
-#include "../layers/sum.h"
 #include "trainer.h"

-namespace Eval {
+#include "extra/stockfish_blas.h"

-namespace NNUE {
+#include "learn/learn.h"

-// Learning: A layer that sums the outputs of multiple layers
-template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
-class Trainer<Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>> :
-      Trainer<Layers::Sum<RemainingPreviousLayers...>> {
- private:
-  // Type of layer to learn
-  using LayerType = Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>;
-  using Tail = Trainer<Layers::Sum<RemainingPreviousLayers...>>;
+#include "nnue/layers/sum.h"

- public:
-  // factory function
-  static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* feature_transformer) {
-    return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, feature_transformer));
-  }
+#include "thread.h"

-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    // The results of other member functions do not depend on the processing order, so
-    // Tail is processed first for the purpose of simplifying the implementation, but
-    // SendMessage processes Head first to make it easier to understand subscript correspondence
-    previous_layer_trainer_->SendMessage(message);
-    Tail::SendMessage(message);
-  }
+// Specialization of NNUE evaluation function learning class template for Sum
+namespace Eval::NNUE {

-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    Tail::Initialize(rng);
-    previous_layer_trainer_->Initialize(rng);
-  }
+    // Learning: A layer that sums the outputs of multiple layers
+    template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
+    class Trainer<Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>> :
+          Trainer<Layers::Sum<RemainingPreviousLayers...>> {
+    private:
+        // Type of layer to learn
+        using LayerType = Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>;
+        using Tail = Trainer<Layers::Sum<RemainingPreviousLayers...>>;
+
+    public:
+        // factory function
+        static std::shared_ptr<Trainer> create(
+            LayerType* target_layer, FeatureTransformer* ft) {
+            // Factory: the constructor is private, so instances are built here.
+            return std::shared_ptr<Trainer>(
+                new Trainer(target_layer, ft));
+        }
+
+        // Set options such as hyperparameters
+        void send_message(Message* message) {
+            // The results of the other member functions do not depend on processing order, so they
+            // process Tail first to keep the implementation simple; send_message processes the head
+            // (previous_layer_trainer_) first to make subscript correspondence easier to follow.
+            previous_layer_trainer_->send_message(message);
+            Tail::send_message(message);
+        }
+
+        // Initialize the parameters with random numbers
+        template <typename RNG>
+        void initialize(RNG& rng) {  // Tail first, then the head layer's trainer
+            Tail::initialize(rng);
+            previous_layer_trainer_->initialize(rng);
+        }
+
+        // forward propagation
+        /*const*/ LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
+            batch_size_ = static_cast<IndexType>(batch.size());
+            auto output = Tail::propagate(thread_pool, batch);
+            const auto head_output = previous_layer_trainer_->propagate(thread_pool, batch);

-  // forward propagation
-  /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    batch_size_ = static_cast<IndexType>(batch.size());
-    auto output = Tail::Propagate(batch);
-    const auto head_output = previous_layer_trainer_->Propagate(batch);
 #if defined(USE_BLAS)
-    cblas_saxpy(kOutputDimensions * batch_size_, 1.0,
-                head_output, 1, output, 1);
+
+            cblas_saxpy(
+                kOutputDimensions * batch_size_, 1.0,
+                head_output, 1, output, 1
+            );
+
 #else
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        output[batch_offset + i] += head_output[batch_offset + i];
-      }
-    }
+
+            Blas::saxpy(
+                thread_pool,
+                kOutputDimensions * batch_size_, 1.0,
+                head_output, 1, output, 1
+            );
+
 #endif
-    return output;
-  }
+            return output;
+        }

-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    Tail::Backpropagate(gradients, learning_rate);
-    previous_layer_trainer_->Backpropagate(gradients, learning_rate);
-  }
+        // backpropagation
+        void backpropagate(ThreadPool& thread_pool,
+                           const LearnFloatType* gradients,
+                           LearnFloatType learning_rate) {

- private:
-  // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer):
-      Tail(target_layer, feature_transformer),
-      batch_size_(0),
-      previous_layer_trainer_(Trainer<FirstPreviousLayer>::Create(
-          &target_layer->previous_layer_, feature_transformer)),
-      target_layer_(target_layer) {
-  }
+            Tail::backpropagate(thread_pool, gradients, learning_rate);
+            previous_layer_trainer_->backpropagate(thread_pool, gradients, learning_rate);
+        }

-  // number of input/output dimensions
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+    private:
+        // constructor
+        Trainer(LayerType* target_layer, FeatureTransformer* ft):
+            Tail(target_layer, ft),  // base trainer handles the remaining summands; presumably relies on LayerType converting to Tail's layer type — confirm
+            batch_size_(0),
+            previous_layer_trainer_(Trainer<FirstPreviousLayer>::create(
+                &target_layer->previous_layer_, ft)),  // trainer for the head summand
+            target_layer_(target_layer) {
+        }

-  // make subclass friend
-  template <typename SumLayer>
-  friend class Trainer;
+        // number of input/output dimensions
+        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;

-  // number of samples in mini-batch
-  IndexType batch_size_;
+        // make subclass friend
+        template <typename SumLayer>
+        friend class Trainer;

-  // Trainer of the previous layer
-  const std::shared_ptr<Trainer<FirstPreviousLayer>> previous_layer_trainer_;
+        // number of samples in mini-batch
+        IndexType batch_size_;

-  // layer to learn
-  LayerType* const target_layer_;
-};
+        // Trainer of the previous layer
+        const std::shared_ptr<Trainer<FirstPreviousLayer>> previous_layer_trainer_;
+
+        // layer to learn
+        LayerType* const target_layer_;
+    };


-// Learning: Layer that takes the sum of the outputs of multiple layers (when there is one template argument)
-template <typename PreviousLayer>
-class Trainer<Layers::Sum<PreviousLayer>> {
- private:
-  // Type of layer to learn
-  using LayerType = Layers::Sum<PreviousLayer>;
+    // Learning: Layer that takes the sum of the outputs of multiple layers (when there is one template argument)
+    template <typename PreviousLayer>
+    class Trainer<Layers::Sum<PreviousLayer>> {
+    private:
+        // Type of layer to learn
+        using LayerType = Layers::Sum<PreviousLayer>;

- public:
-  // factory function
-  static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* feature_transformer) {
-    return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, feature_transformer));
-  }
+    public:
+        // factory function
+        static std::shared_ptr<Trainer> create(
+            LayerType* target_layer, FeatureTransformer* ft) {

-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    previous_layer_trainer_->SendMessage(message);
-  }
+            return std::shared_ptr<Trainer>(
+                new Trainer(target_layer, ft));
+        }

-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    previous_layer_trainer_->Initialize(rng);
-  }
+        // Set options such as hyperparameters
+        void send_message(Message* message) {  // single summand: forward to its trainer
+            previous_layer_trainer_->send_message(message);
+        }
+
+        // Initialize the parameters with random numbers
+        template <typename RNG>
+        void initialize(RNG& rng) {  // single summand: forward to its trainer
+            previous_layer_trainer_->initialize(rng);
+        }
+
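+        // forward propagation. NOTE(review): the variadic specialization calls Tail::propagate(thread_pool, batch); this overload takes only `batch` — confirm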
+        /*const*/ LearnFloatType* propagate(const std::vector<Example>& batch) {
+            if (output_.size() < kOutputDimensions * batch.size()) {
+                output_.resize(kOutputDimensions * batch.size());
+            }
+
+            batch_size_ = static_cast<IndexType>(batch.size());
+            const auto output = previous_layer_trainer_->propagate(batch);

-  // forward propagation
-  /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    if (output_.size() < kOutputDimensions * batch.size()) {
-      output_.resize(kOutputDimensions * batch.size());
-    }
-    batch_size_ = static_cast<IndexType>(batch.size());
-    const auto output = previous_layer_trainer_->Propagate(batch);
 #if defined(USE_BLAS)
-    cblas_scopy(kOutputDimensions * batch_size_, output, 1, &output_[0], 1);
+            cblas_scopy(kOutputDimensions * batch_size_, output, 1, &output_[0], 1);
 #else
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        output_[batch_offset + i] = output[batch_offset + i];
-      }
-    }
-#endif
-    return output_.data();
-  }
-
-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    previous_layer_trainer_->Backpropagate(gradients, learning_rate);
-  }
-
- private:
-  // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
-      batch_size_(0),
-      previous_layer_trainer_(Trainer<PreviousLayer>::Create(
-          &target_layer->previous_layer_, feature_transformer)),
-      target_layer_(target_layer) {
-  }
-
-  // number of input/output dimensions
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
-
-  // make subclass friend
-  template <typename SumLayer>
-  friend class Trainer;
-
-  // number of samples in mini-batch
-  IndexType batch_size_;
-
-  // Trainer of the previous layer
-  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
-
-  // layer to learn
-  LayerType* const target_layer_;
-
-  // Forward propagation buffer
-  std::vector<LearnFloatType> output_;
-};
-
-}  // namespace NNUE
-
-}  // namespace Eval
-
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    output_[batch_offset + i] = output[batch_offset + i];
+                }
+            }
+
+#endif
+            return output_.data();
+        }
+
+        // backpropagation
+        void backpropagate(const LearnFloatType* gradients,
+                           LearnFloatType learning_rate) {
+            // NOTE(review): the variadic specialization calls Tail::backpropagate(thread_pool, gradients, learning_rate); this overload has no ThreadPool& — confirm
+            previous_layer_trainer_->backpropagate(gradients, learning_rate);
+        }
+
+    private:
+        // constructor
+        Trainer(LayerType* target_layer, FeatureTransformer* ft) :
+            batch_size_(0),
+            previous_layer_trainer_(Trainer<PreviousLayer>::create(
+                &target_layer->previous_layer_, ft)),  // trainer for the single summand
+            target_layer_(target_layer) {
+        }
+
+        // number of input/output dimensions
+        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+
+        // make subclass friend
+        template <typename SumLayer>
+        friend class Trainer;
+
+        // number of samples in mini-batch
+        IndexType batch_size_;
+
+        // Trainer of the previous layer
+        const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+
+        // layer to learn
+        LayerType* const target_layer_;
+
+        // Forward propagation buffer
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
+    };
+
+}  // namespace Eval::NNUE

 #endif
@@ -30,29 +30,29 @@ namespace {
  #define S(mg, eg) make_score(mg, eg)

  // Pawn penalties
-  constexpr Score Backward      = S( 8, 27);
-  constexpr Score Doubled       = S(11, 55);
-  constexpr Score Isolated      = S( 5, 17);
-  constexpr Score WeakLever     = S( 2, 54);
-  constexpr Score WeakUnopposed = S(15, 25);
+  constexpr Score Backward      = S( 8, 25);
+  constexpr Score Doubled       = S(10, 55);
+  constexpr Score Isolated      = S( 3, 15);
+  constexpr Score WeakLever     = S( 3, 55);
+  constexpr Score WeakUnopposed = S(13, 25);

  // Bonus for blocked pawns at 5th or 6th rank
-  constexpr Score BlockedPawn[2] = { S(-13, -4), S(-4, 3) };
+  constexpr Score BlockedPawn[2] = { S(-13, -4), S(-5, 2) };

  constexpr Score BlockedStorm[RANK_NB] = {
    S(0, 0), S(0, 0), S(76, 78), S(-10, 15), S(-7, 10), S(-4, 6), S(-1, 2)
  };

  // Connected pawn bonus
-  constexpr int Connected[RANK_NB] = { 0, 7, 8, 11, 24, 45, 85 };
+  constexpr int Connected[RANK_NB] = { 0, 5, 7, 11, 24, 48, 86 };

  // Strength of pawn shelter for our king by [distance from edge][rank].
  // RANK_1 = 0 is used for files where we have no pawn, or pawn is behind our king.
  constexpr Value ShelterStrength[int(FILE_NB) / 2][RANK_NB] = {
-    { V( -6), V( 81), V( 93), V( 58), V( 39), V( 18), V(  25) },
-    { V(-43), V( 61), V( 35), V(-49), V(-29), V(-11), V( -63) },
-    { V(-10), V( 75), V( 23), V( -2), V( 32), V(  3), V( -45) },
-    { V(-39), V(-13), V(-29), V(-52), V(-48), V(-67), V(-166) }
+    { V( -5), V( 82), V( 92), V( 54), V( 36), V( 22), V(  28) },
+    { V(-44), V( 63), V( 33), V(-50), V(-30), V(-12), V( -62) },
+    { V(-11), V( 77), V( 22), V( -6), V( 31), V(  8), V( -45) },
+    { V(-39), V(-12), V(-29), V(-50), V(-43), V(-68), V(-164) }
  };

  // Danger of enemy pawns moving toward our king by [distance from edge][rank].
@@ -60,12 +60,17 @@ namespace {
  // is behind our king. Note that UnblockedStorm[0][1-2] accommodate opponent pawn
  // on edge, likely blocked by our king.
  constexpr Value UnblockedStorm[int(FILE_NB) / 2][RANK_NB] = {
-    { V( 85), V(-289), V(-166), V(97), V(50), V( 45), V( 50) },
-    { V( 46), V( -25), V( 122), V(45), V(37), V(-10), V( 20) },
-    { V( -6), V(  51), V( 168), V(34), V(-2), V(-22), V(-14) },
-    { V(-15), V( -11), V( 101), V( 4), V(11), V(-15), V(-29) }
+    { V( 87), V(-288), V(-168), V( 96), V( 47), V( 44), V( 46) },
+    { V( 42), V( -25), V( 120), V( 45), V( 34), V( -9), V( 24) },
+    { V( -8), V(  51), V( 167), V( 35), V( -4), V(-16), V(-12) },
+    { V(-17), V( -13), V( 100), V(  4), V(  9), V(-16), V(-31) }
  };

+  // KingOnFile[semi-open Us][semi-open Them] contains bonuses/penalties
+  // for king when the king is on a semi-open or open file.
+  constexpr Score KingOnFile[2][2] = {{ S(-19,12), S(-6, 7)  },
+                                     {  S(  0, 2), S( 6,-5) }};
+
  #undef S
  #undef V

@@ -147,7 +152,7 @@ namespace {
        if (support | phalanx)
        {
            int v =  Connected[r] * (2 + bool(phalanx) - bool(opposed))
-                   + 21 * popcount(support);
+                   + 22 * popcount(support);

            score += make_score(v, v * (r - 2) / 4);
        }
@@ -171,8 +176,8 @@ namespace {
            score -=  Doubled * doubled
                    + WeakLever * more_than_one(lever);

-        if (blocked && r > RANK_4)
-            score += BlockedPawn[r-4];
+        if (blocked && r >= RANK_5)
+            score += BlockedPawn[r - RANK_5];
    }

    return score;
@@ -237,6 +242,9 @@ Score Entry::evaluate_shelter(const Position& pos, Square ksq) const {
          bonus -= make_score(UnblockedStorm[d][theirRank], 0);
  }

+  // King On File
+  bonus -= KingOnFile[pos.is_on_semiopen_file(Us, ksq)][pos.is_on_semiopen_file(Them, ksq)];
+
  return bonus;
 }

@@ -23,6 +23,8 @@
 #include <iomanip>
 #include <sstream>

+#include "nnue/evaluate_nnue.h"
+
 #include "bitboard.h"
 #include "misc.h"
 #include "movegen.h"
@@ -32,6 +34,9 @@
 #include "uci.h"
 #include "syzygy/tbprobe.h"

+#include "learn/packed_sfen.h"
+#include "learn/sfen_packer.h"
+
 using std::string;

 namespace Zobrist {
@@ -77,6 +82,8 @@ std::ostream& operator<<(std::ostream& os, const Position& pos) {
      && !pos.can_castle(ANY_CASTLING))
  {
      StateInfo st;
+      ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize);
+
      Position p;
      p.set(pos.fen(), pos.is_chess960(), &st, pos.this_thread());
      Tablebases::ProbeState s1, s2;
@@ -704,7 +711,6 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {

  // Used by NNUE
  st->accumulator.computed_accumulation = false;
-  st->accumulator.computed_score = false;
  auto& dp = st->dirtyPiece;
  dp.dirty_num = 1;

@@ -755,7 +761,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
      else
          st->nonPawnMaterial[them] -= PieceValue[MG][captured];

-      if (Eval::useNNUE)
+      if (Eval::NNUE::useNNUE != Eval::NNUE::UseNNUEMode::False)
      {
          dp.dirty_num = 2;  // 1 piece moved, 1 piece captured
          dp.piece[1] = captured;
@@ -799,7 +805,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
  // Move the piece. The tricky Chess960 castling is handled earlier
  if (type_of(m) != CASTLING)
  {
-      if (Eval::useNNUE)
+      if (Eval::NNUE::useNNUE != Eval::NNUE::UseNNUEMode::False)
      {
          dp.piece[0] = pc;
          dp.from[0] = from;
@@ -830,7 +836,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
          remove_piece(to);
          put_piece(promotion, to);

-          if (Eval::useNNUE)
+          if (Eval::NNUE::useNNUE != Eval::NNUE::UseNNUEMode::False)
          {
              // Promoting pawn to SQ_NONE, promoted piece from SQ_NONE
              dp.to[0] = SQ_NONE;
@@ -968,7 +974,7 @@ void Position::do_castling(Color us, Square from, Square& to, Square& rfrom, Squ
  rto = relative_square(us, kingSide ? SQ_F1 : SQ_D1);
  to = relative_square(us, kingSide ? SQ_G1 : SQ_C1);

-  if (Do && Eval::useNNUE)
+  if (Do && Eval::NNUE::useNNUE != Eval::NNUE::UseNNUEMode::False)
  {
      auto& dp = st->dirtyPiece;
      dp.piece[0] = make_piece(us, KING);
@@ -997,17 +1003,16 @@ void Position::do_null_move(StateInfo& newSt) {
  assert(!checkers());
  assert(&newSt != st);

-  if (Eval::useNNUE)
-  {
-      std::memcpy(&newSt, st, sizeof(StateInfo));
-      st->accumulator.computed_score = false;
-  }
-  else
-      std::memcpy(&newSt, st, offsetof(StateInfo, accumulator));
+  std::memcpy(&newSt, st, offsetof(StateInfo, accumulator));

  newSt.previous = st;
  st = &newSt;

+  // Used by NNUE
+  st->accumulator.computed_accumulation = false;
+  auto& dp = st->dirtyPiece;
+  dp.dirty_num = 0;
+
  if (st->epSquare != SQ_NONE)
  {
      st->key ^= Zobrist::enpassant[file_of(st->epSquare)];
@@ -1317,6 +1322,8 @@ bool Position::pos_is_ok() const {
              assert(0 && "pos_is_ok: Bitboards");

  StateInfo si = *st;
+  ASSERT_ALIGNED(&si, Eval::NNUE::kCacheLineSize);
+
  set_state(&si);
  if (std::memcmp(&si, st, sizeof(StateInfo)))
      assert(0 && "pos_is_ok: State");
@@ -1346,3 +1353,17 @@ bool Position::pos_is_ok() const {

  return true;
 }
+
+// Sets up this position from a packed sfen.
+// The work is delegated to Learner::set_from_packed_sfen() and its result is
+// returned unchanged; a non-zero value indicates a problem with the passed position.
+int Position::set_from_packed_sfen(const Learner::PackedSfen& sfen , StateInfo* si, Thread* th)
+{
+  return Learner::set_from_packed_sfen(*this, sfen, si, th);
+}
+
+// Packs this position with Learner::sfen_pack() and writes the result into
+// the buffer `sfen` supplied by the caller.
+void Position::sfen_pack(Learner::PackedSfen& sfen)
+{
+  sfen = Learner::sfen_pack(*this);
+}
@@ -30,6 +30,9 @@

 #include "nnue/nnue_accumulator.h"

+#include "learn/packed_sfen.h"
+#include "learn/sfen_packer.h"
+

 /// StateInfo struct stores information needed to restore a Position object to
 /// its previous state when we retract a move. Whenever a move is made on the
@@ -75,9 +78,6 @@ typedef std::unique_ptr<std::deque<StateInfo>> StateListPtr;
 /// traversing the search tree.
 class Thread;

-// packed sfen
-struct PackedSfen { uint8_t data[32]; }; 
-
 class Position {
 public:
  static void init();
@@ -175,25 +175,27 @@ public:
  // Used by NNUE
  StateInfo* state() const;

-#if defined(EVAL_LEARN)
  // --sfenization helper

+  friend int Learner::set_from_packed_sfen(Position& pos, const Learner::PackedSfen& sfen, StateInfo* si, Thread* th);
+
  // Get the packed sfen. Returns to the buffer specified in the argument.
  // Do not include gamePly in pack.
-  void sfen_pack(PackedSfen& sfen);
+  void sfen_pack(Learner::PackedSfen& sfen);

  // It is slow to go through sfen, so I made a function to set packed sfen directly.
  // Equivalent to pos.set(sfen_unpack(data),si,th);.
  // If there is a problem with the passed position, a non-zero value is returned.
  // PackedSfen does not include gamePly so it cannot be restored. If you want to set it, specify it with an argument.
-  int set_from_packed_sfen(const PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror = false);
+  int set_from_packed_sfen(const Learner::PackedSfen& sfen, StateInfo* si, Thread* th);
+
+  void clear() { std::memset(this, 0, sizeof(Position)); }

  // Give the board, hand piece, and turn, and return the sfen.
  //static std::string sfen_from_rawdata(Piece board[81], Hand hands[2], Color turn, int gamePly);

  // Returns the square of the king of color c.
  Square king_square(Color c) const { return pieceList[make_piece(c, KING)][0]; }
-#endif // EVAL_LEARN

 private:
  // Initialization helpers (used while setting up a position)
@@ -24,6 +24,7 @@
 #include "misc.h"
 #include "movepick.h"
 #include "types.h"
+#include "uci.h"

 class Position;

@@ -32,6 +33,7 @@ namespace Search {
 /// Threshold used for countermoves based pruning
 constexpr int CounterMovePruneThreshold = 0;

+extern bool prune_at_shallow_depth;

 /// Stack struct keeps track of the information we need to remember from nodes
 /// shallower and deeper in the tree during the search. Each search thread has
@@ -48,6 +50,8 @@ struct Stack {
  int statScore;
  int moveCount;
  bool inCheck;
+  bool ttPv;
+  bool ttHit;
 };


@@ -69,7 +73,6 @@ struct RootMove {
  Value previousScore = -VALUE_INFINITE;
  int selDepth = 0;
  int tbRank = 0;
-  int bestMoveCount = 0;
  Value tbScore;
  std::vector<Move> pv;
 };
@@ -86,9 +89,7 @@ struct LimitsType {
    time[WHITE] = time[BLACK] = inc[WHITE] = inc[BLACK] = npmsec = movetime = TimePoint(0);
    movestogo = depth = mate = perft = infinite = 0;
    nodes = 0;
-#if defined (EVAL_LEARN)
    silent = false;
-#endif
  }

  bool use_time_management() const {
@@ -99,11 +100,9 @@ struct LimitsType {
  TimePoint time[COLOR_NB], inc[COLOR_NB], npmsec, movetime, startTime;
  int movestogo, depth, mate, perft, infinite;
  int64_t nodes;
-#if defined (EVAL_LEARN)
  // Silent mode that does not output to the screen (for continuous self-play in process)
  // Do not output PV at this time.
  bool silent;
-#endif
 };

 extern LimitsType Limits;
@@ -111,6 +110,12 @@ extern LimitsType Limits;
 void init();
 void clear();

-} // namespace Search
+// A pair of evaluation value and PV (sequence of moves). Returned by the search() and qsearch() declared below.
+using ValueAndPV = std::pair<Value, std::vector<Move>>;
+
+ValueAndPV qsearch(Position& pos);
+ValueAndPV search(Position& pos, int depth_, size_t multiPV = 1, uint64_t nodesLimit = 0);
+
+}

 #endif // #ifndef SEARCH_H_INCLUDED
@@ -28,12 +28,12 @@
 #include <type_traits>
 #include <mutex>

-#include "../bitboard.h"
-#include "../movegen.h"
-#include "../position.h"
-#include "../search.h"
-#include "../types.h"
-#include "../uci.h"
+#include "bitboard.h"
+#include "movegen.h"
+#include "position.h"
+#include "search.h"
+#include "types.h"
+#include "uci.h"

 #include "tbprobe.h"

@@ -52,7 +52,7 @@

 using namespace Tablebases;

-int Tablebases::MaxCardinality;
+int Tablebases::MaxCardinality = 0;

 namespace {

@@ -223,7 +223,9 @@ public:

        *mapping = statbuf.st_size;
        *baseAddress = mmap(nullptr, statbuf.st_size, PROT_READ, MAP_SHARED, fd, 0);
+#if defined(MADV_RANDOM)
        madvise(*baseAddress, statbuf.st_size, MADV_RANDOM);
+#endif
        ::close(fd);

        if (*baseAddress == MAP_FAILED)
@@ -758,7 +760,7 @@ Ret do_probe_table(const Position& pos, T* entry, WDLScore wdl, ProbeState* resu
    if (entry->hasPawns) {
        idx = LeadPawnIdx[leadPawnsCnt][squares[0]];

-        std::sort(squares + 1, squares + leadPawnsCnt, pawns_comp);
+        std::stable_sort(squares + 1, squares + leadPawnsCnt, pawns_comp);

        for (int i = 1; i < leadPawnsCnt; ++i)
            idx += Binomial[i][MapPawns[squares[i]]];
@@ -859,7 +861,7 @@ encode_remaining:

    while (d->groupLen[++next])
    {
-        std::sort(groupSq, groupSq + d->groupLen[next]);
+        std::stable_sort(groupSq, groupSq + d->groupLen[next]);
        uint64_t n = 0;

        // Map down a square if "comes later" than a square in the previous
@@ -21,7 +21,7 @@

 #include <ostream>

-#include "../search.h"
+#include "search.h"

 namespace Tablebases {

@@ -35,6 +35,7 @@ ThreadPool Threads; // Global object
 Thread::Thread(size_t n) : idx(n), stdThread(&Thread::idle_loop, this) {

  wait_for_search_finished();
+  wait_for_worker_finished();
 }


@@ -51,17 +52,6 @@ Thread::~Thread() {
 }


-/// Thread::bestMoveCount(Move move) return best move counter for the given root move
-
-int Thread::best_move_count(Move move) const {
-
-  auto rm = std::find(rootMoves.begin() + pvIdx,
-                      rootMoves.begin() + pvLast, move);
-
-  return rm != rootMoves.begin() + pvLast ? rm->bestMoveCount : 0;
-}
-
-
 /// Thread::clear() reset histories, usually before a new game

 void Thread::clear() {
@@ -91,6 +81,14 @@ void Thread::start_searching() {
  cv.notify_one(); // Wake up the thread in idle_loop()
 }

+/// Thread::execute_with_worker() stores the task `t` in `worker`, marks the
+/// thread as busy through the same `searching` flag used for searches, and
+/// wakes it up. All of this happens while holding the thread's mutex.
+void Thread::execute_with_worker(std::function<void(Thread&)> t)
+{
+  std::lock_guard<std::mutex> lk(mutex);
+  worker = std::move(t);
+  searching = true;
+  cv.notify_one(); // Wake up the thread in idle_loop()
+}
+

 /// Thread::wait_for_search_finished() blocks on the condition variable
 /// until the thread has finished searching.
@@ -102,6 +100,12 @@ void Thread::wait_for_search_finished() {
 }


+/// Thread::wait_for_worker_finished() blocks on the condition variable
+/// until the `searching` flag is false again.
+void Thread::wait_for_worker_finished() {
+
+  std::unique_lock<std::mutex> lk(mutex);
+  cv.wait(lk, [&]{ return !searching; });
+}
+
 /// Thread::idle_loop() is where the thread is parked, blocked on the
 /// condition variable, when it has no work to do.

@@ -119,15 +123,25 @@ void Thread::idle_loop() {
  {
      std::unique_lock<std::mutex> lk(mutex);
      searching = false;
+      worker = nullptr;
      cv.notify_one(); // Wake up anyone waiting for search finished
      cv.wait(lk, [&]{ return searching; });

      if (exit)
          return;

+      auto wrk = std::move(worker);
+
      lk.unlock();

-      search();
+      if (wrk)
+      {
+        wrk(*this);
+      }
+      else
+      {
+        search();
+      }
  }
 }

@@ -172,6 +186,13 @@ void ThreadPool::clear() {
  main()->previousTimeReduction = 1.0;
 }

+/// ThreadPool::execute_with_workers() hands `worker` to every thread in the
+/// pool. Thread::execute_with_worker() takes its argument by value, so each
+/// thread receives its own copy of the function object.
+void ThreadPool::execute_with_workers(const std::function<void(Thread&)>& worker)
+{
+  for(Thread* th : *this)
+  {
+    th->execute_with_worker(worker);
+  }
+}

 /// ThreadPool::start_thinking() wakes up main thread waiting in idle_loop() and
 /// returns immediately. Main thread will wake up other threads and start the search.
@@ -192,9 +213,6 @@ void ThreadPool::start_thinking(Position& pos, StateListPtr& states,
          || std::count(limits.searchmoves.begin(), limits.searchmoves.end(), m))
          rootMoves.emplace_back(m);

-  if (!rootMoves.empty())
-      Tablebases::rank_root_moves(pos, rootMoves);
-
  // After ownership transfer 'states' becomes empty, so if we stop the search
  // and call 'go' again without setting a new position states.get() == NULL.
  assert(states.get() || setupStates.get());
@@ -214,6 +232,24 @@ void ThreadPool::start_thinking(Position& pos, StateListPtr& states,
      th->rootMoves = rootMoves;
      th->rootPos.set(pos.fen(), pos.is_chess960(), &th->rootState, th);
      th->rootState = setupStates->back();
+      // This is also set by rank_root_moves but we need to set it
+      // also when there are no legal moves.
+      th->rootInTB = false;
+      th->UseRule50 = bool(Options["Syzygy50MoveRule"]);
+      th->ProbeDepth = int(Options["SyzygyProbeDepth"]);
+      th->Cardinality = int(Options["SyzygyProbeLimit"]);
+
+      // Tables with fewer pieces than SyzygyProbeLimit are searched with
+      // ProbeDepth == DEPTH_ZERO
+      if (th->Cardinality > Tablebases::MaxCardinality)
+      {
+          th->Cardinality = Tablebases::MaxCardinality;
+          th->ProbeDepth = 0;
+      }
+
+      if (!rootMoves.empty())
+          Tablebases::rank_root_moves(pos, rootMoves);
+
  }

  main()->start_searching();
@@ -235,16 +271,16 @@ Thread* ThreadPool::get_best_thread() const {
        votes[th->rootMoves[0].pv[0]] +=
            (th->rootMoves[0].score - minScore + 14) * int(th->completedDepth);

-          if (abs(bestThread->rootMoves[0].score) >= VALUE_TB_WIN_IN_MAX_PLY)
-          {
-              // Make sure we pick the shortest mate / TB conversion or stave off mate the longest
-              if (th->rootMoves[0].score > bestThread->rootMoves[0].score)
-                  bestThread = th;
-          }
-          else if (   th->rootMoves[0].score >= VALUE_TB_WIN_IN_MAX_PLY
-                   || (   th->rootMoves[0].score > VALUE_TB_LOSS_IN_MAX_PLY
-                       && votes[th->rootMoves[0].pv[0]] > votes[bestThread->rootMoves[0].pv[0]]))
-              bestThread = th;
+        if (abs(bestThread->rootMoves[0].score) >= VALUE_TB_WIN_IN_MAX_PLY)
+        {
+            // Make sure we pick the shortest mate / TB conversion or stave off mate the longest
+            if (th->rootMoves[0].score > bestThread->rootMoves[0].score)
+                bestThread = th;
+        }
+        else if (   th->rootMoves[0].score >= VALUE_TB_WIN_IN_MAX_PLY
+                 || (   th->rootMoves[0].score > VALUE_TB_LOSS_IN_MAX_PLY
+                     && votes[th->rootMoves[0].pv[0]] > votes[bestThread->rootMoves[0].pv[0]]))
+            bestThread = th;
    }

    return bestThread;
@@ -269,3 +305,10 @@ void ThreadPool::wait_for_search_finished() const {
        if (th != front())
            th->wait_for_search_finished();
 }
+
+
+/// ThreadPool::wait_for_workers_finished() waits, one thread after another,
+/// until every thread in the pool has finished its task. Unlike
+/// wait_for_search_finished() it does not skip the front thread.
+void ThreadPool::wait_for_workers_finished() const {
+
+    for (Thread* th : *this)
+        th->wait_for_worker_finished();
+}
@@ -24,6 +24,7 @@
 #include <mutex>
 #include <thread>
 #include <vector>
+#include <functional>

 #include "material.h"
 #include "movepick.h"
@@ -38,23 +39,41 @@
 /// pointer to an entry its life time is unlimited and we don't have
 /// to care about someone changing the entry under our feet.

+namespace Detail {
+
+  // Identity metafunction: TypeIdentity<T>::Type is T.
+  // Writing a function parameter as `typename TypeIdentity<T>::Type` puts T in a
+  // non-deduced context, so T is deduced only from the other parameters.
+  template <typename T>
+  struct TypeIdentity {
+    using Type = T;
+  };
+
+}
+
 class Thread {

  std::mutex mutex;
  std::condition_variable cv;
  size_t idx;
  bool exit = false, searching = true; // Set before starting std::thread
+  std::function<void(Thread&)> worker;
  NativeThread stdThread;

 public:
  explicit Thread(size_t);
  virtual ~Thread();
  virtual void search();
+
+  // The function object to be executed is taken by value to remove
+  // the need for separate lvalue and rvalue overloads.
+  // The worker thread needs to have ownership of the task
+  // to be executed because otherwise there's no way to manage its lifetime.
+  virtual void execute_with_worker(std::function<void(Thread&)> t);
+
  void clear();
  void idle_loop();
  void start_searching();
  void wait_for_search_finished();
-  int best_move_count(Move move) const;
+  void wait_for_worker_finished();
+  size_t thread_idx() const { return idx; }

  Pawns::Table pawnsTable;
  Material::Table materialTable;
@@ -74,6 +93,11 @@ public:
  CapturePieceToHistory captureHistory;
  ContinuationHistory continuationHistory[2][2];
  Score contempt;
+  int failedHighCnt;
+  bool rootInTB;
+  int Cardinality;
+  bool UseRule50;
+  Depth ProbeDepth;
 };


@@ -101,6 +125,61 @@ struct MainThread : public Thread {

 struct ThreadPool : public std::vector<Thread*> {

+  // Each thread gets its own copy of the `worker` function object.
+  // This means that each worker thread will have exclusive access
+  // to the state of the `worker` function object.
+  void execute_with_workers(const std::function<void(Thread&)>& worker);
+
+  // Calls func(th, i) for every index i in [begin, end), distributing the
+  // indices dynamically: each worker repeatedly claims the next index from a
+  // shared atomic counter and stops once the claimed index is >= end.
+  // `end` is declared through TypeIdentity so IndexT is deduced from `begin` alone.
+  // `func` is captured by value, and every thread gets its own copy of the task.
+  // This call only hands the task to the workers; it does not wait for them.
+  template <typename IndexT, typename FuncT>
+  void for_each_index_with_workers(
+    IndexT begin,
+    typename Detail::TypeIdentity<IndexT>::Type end,
+    FuncT func)
+  {
+    // This value must outlive the function call.
+    // It's fairly safe if we make it static
+    // because for_each_index_with_workers
+    // is not reentrant nor thread safe.
+    static std::atomic<IndexT> i_atomic;
+    i_atomic.store(begin);
+
+    execute_with_workers(
+      [end, func](Thread& th) mutable {
+        for(;;) {
+          // fetch_add returns the previous value, so each index is claimed once.
+          const auto i = i_atomic.fetch_add(1);
+          if (i >= end)
+            break;
+
+          func(th, i);
+        }
+      });
+  }
+
+  // Splits [begin, end) into contiguous chunks and calls func(th, offset, count)
+  // on each worker, where the chunk is selected by th.thread_idx():
+  // `offset` is the first index of the chunk and `count` its length.
+  // Threads whose chunk would start at or past `end` do nothing.
+  // `end` is declared through TypeIdentity so IndexT is deduced from `begin` alone.
+  // This call only hands the task to the workers; it does not wait for them.
+  template <typename IndexT, typename FuncT>
+  void for_each_index_chunk_with_workers(
+    IndexT begin,
+    typename Detail::TypeIdentity<IndexT>::Type end,
+    FuncT func)
+  {
+    // (size + n) / n equals floor(size / n) + 1, so n chunks of this size
+    // always cover the whole range.
+    const IndexT size = end - begin;
+    const IndexT chunk_size = (size + this->size()) / this->size();
+
+    execute_with_workers(
+      [chunk_size, begin, end, func](Thread& th) mutable {
+        const IndexT thread_id = th.thread_idx();
+        // Chunks are laid out starting at `begin`, not at zero; otherwise a
+        // non-zero `begin` would make the workers process the wrong indices.
+        const IndexT offset = begin + chunk_size * thread_id;
+        if (offset >= end)
+          return;
+
+        // The last chunk is clipped so it never runs past `end`.
+        const IndexT count = offset + chunk_size > end ? end - offset : chunk_size;
+        func(th, offset, count);
+      });
+  }
+
  void start_thinking(Position&, StateListPtr&, const Search::LimitsType&, bool = false);
  void clear();
  void set(size_t);
@@ -111,6 +190,7 @@ struct ThreadPool : public std::vector<Thread*> {
  Thread* get_best_thread() const;
  void start_searching();
  void wait_for_search_finished() const;
+  void wait_for_workers_finished() const;

  std::atomic_bool stop, increaseDepth;

--- a/Show More
+++ b/Show More