From 406979ea12ee7828e079871b0f9f3dc8f127a741 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ste=CC=81phane=20Nicolet?= <cassio@free.fr>
Date: Sun, 23 Aug 2020 13:43:38 +0200
Subject: [PATCH 001/398] Embed default net, and simplify using non-default
 nets

covers the most important cases from the user perspective:

It embeds the default net in the binary, so a download of that binary will result
in a working engine with the default net. The engine will be functional in the default mode
without any additional user action.

It allows non-default nets to be used, which will be looked for in up to
three directories (working directory, location of the binary, and optionally a specific default directory).
This mechanism is also kept for those developers that use MSVC,
the one compiler that doesn't have an easy mechanism for embedding data.

It is possible to disable embedding, and instead specify a specific directory, e.g. linux distros might want to use
CXXFLAGS="-DNNUE_EMBEDDING_OFF -DDEFAULT_NNUE_DIRECTORY=/usr/share/games/stockfish/" make -j ARCH=x86-64 profile-build

passed STC non-regression:
https://tests.stockfishchess.org/tests/view/5f4a581c150f0aef5f8ae03a
LLR: 2.95 (-2.94,2.94) {-1.25,-0.25}
Total: 66928 W: 7202 L: 7147 D: 52579
Ptnml(0-2): 291, 5309, 22211, 5360, 293

closes https://github.com/official-stockfish/Stockfish/pull/3070

fixes https://github.com/official-stockfish/Stockfish/issues/3030

No functional change.
---
 AUTHORS                    |   3 +-
 README.md                  |  18 +-
 appveyor.yml               |   2 +-
 src/Makefile               |  10 +-
 src/evaluate.cpp           | 103 +++++++++--
 src/evaluate.h             |   7 +-
 src/incbin/UNLICENCE       |  26 +++
 src/incbin/incbin.h        | 368 +++++++++++++++++++++++++++++++++++++
 src/main.cpp               |   1 +
 src/misc.cpp               |  59 ++++++
 src/misc.h                 |   9 +-
 src/nnue/evaluate_nnue.cpp |  14 +-
 src/ucioption.cpp          |   5 +-
 13 files changed, 582 insertions(+), 43 deletions(-)
 create mode 100644 src/incbin/UNLICENCE
 create mode 100755 src/incbin/incbin.h

diff --git a/AUTHORS b/AUTHORS
index c96f870a..c00ab657 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -36,10 +36,11 @@ Bryan Cross (crossbr)
 candirufish
 Chess13234
 Chris Cain (ceebo)
+Dale Weiler (graphitemaster)
 Dan Schmidt (dfannius)
 Daniel Axtens (daxtens)
 Daniel Dugovic (ddugovic)
-Dariusz Orzechowski
+Dariusz Orzechowski (dorzechowski)
 David Zar
 Daylen Yang (daylen)
 DiscanX
diff --git a/README.md b/README.md
index 2cc88bf4..96a495ae 100644
--- a/README.md
+++ b/README.md
@@ -12,9 +12,9 @@ about how to use Stockfish with it.
 
 The Stockfish engine features two evaluation functions for chess, the classical
 evaluation based on handcrafted terms, and the NNUE evaluation based on efficiently
-updateable neural networks. The classical evaluation runs efficiently on most 64bit
-CPU architectures, while the NNUE evaluation benefits strongly from the vector
-intrinsics available on modern CPUs (avx2 or similar).
+updateable neural networks. The classical evaluation runs efficiently on almost all
+CPU architectures, while the NNUE evaluation benefits from the vector
+intrinsics available on most CPUs (sse2, avx2, neon, or similar).
 
 
 ## Files
@@ -29,10 +29,11 @@ This distribution of Stockfish consists of the following files:
     that can be used to compile Stockfish on Unix-like systems.
 
   * a file with the .nnue extension, storing the neural network for the NNUE 
-    evaluation.
+    evaluation. Binary distributions will have this file embedded.
 
 Note: to use the NNUE evaluation, the additional data file with neural network parameters
-needs to be downloaded. The filename for the default net can be found as the default
+needs to be available. Normally, this file is already embedded in the binary or it can be downloaded.
+The filename for the default (recommended) net can be found as the default
 value of the `EvalFile` UCI option, with the format `nn-[SHA256 first 12 digits].nnue`
 (for instance, `nn-c157e0a5755b.nnue`). This file can be downloaded from
 ```
@@ -61,11 +62,14 @@ Currently, Stockfish has the following UCI options:
 
   * #### Use NNUE
     Toggle between the NNUE and classical evaluation functions. If set to "true",
-    the network parameters must be available to load from file (see also EvalFile).
+    the network parameters must be available to load from file (see also EvalFile),
+    if they are not embedded in the binary.
 
   * #### EvalFile
     The name of the file of the NNUE evaluation parameters. Depending on the GUI the
-    filename should include the full path to the folder/directory that contains the file.
+    filename might have to include the full path to the folder/directory that contains the file.
+    Other locations, such as the directory that contains the binary and the working directory,
+    are also searched.
 
   * #### UCI_AnalyseMode
     An option handled by your GUI.
diff --git a/appveyor.yml b/appveyor.yml
index a3732a23..ab608409 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -63,7 +63,7 @@ build_script:
   - cmake --build . --config %CONFIGURATION% -- /verbosity:minimal
   - ps: |
       # Download default NNUE net from fishtest
-      $nnuenet = Get-Content -Path src\ucioption.cpp | Select-String -CaseSensitive -Pattern "Option" | Select-String -CaseSensitive -Pattern "nn-[a-z0-9]{12}.nnue"
+      $nnuenet = Get-Content -Path src\evaluate.h | Select-String -CaseSensitive -Pattern "EvalFileDefaultName" | Select-String -CaseSensitive -Pattern "nn-[a-z0-9]{12}.nnue"
       $dummy = $nnuenet -match "(?<nnuenet>nn-[a-z0-9]{12}.nnue)"
       $nnuenet = $Matches.nnuenet
       Write-Host "Default net:" $nnuenet
diff --git a/src/Makefile b/src/Makefile
index 703aa230..5f363f02 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -614,10 +614,12 @@ ifeq ($(debug), no)
 # So, only enable it for a cross from Linux by default.
 	else ifeq ($(comp),mingw)
 	ifeq ($(KERNEL),Linux)
+	ifneq ($(arch),i386)
 		CXXFLAGS += -flto
 		LDFLAGS += $(CXXFLAGS) -flto=jobserver
 	endif
 	endif
+	endif
 endif
 endif
 
@@ -705,7 +707,7 @@ endif
         config-sanity icc-profile-use icc-profile-make gcc-profile-use gcc-profile-make \
         clang-profile-use clang-profile-make
 
-build: config-sanity
+build: config-sanity net
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) all
 
 profile-build: net config-sanity objclean profileclean
@@ -731,12 +733,13 @@ install:
 	-cp $(EXE) $(BINDIR)
 	-strip $(BINDIR)/$(EXE)
 
-#clean all
+# clean all
 clean: objclean profileclean
 	@rm -f .depend *~ core
 
+# evaluation network (nnue)
 net:
-	$(eval nnuenet := $(shell grep EvalFile ucioption.cpp | grep Option | sed 's/.*\(nn-[a-z0-9]\{12\}.nnue\).*/\1/'))
+	$(eval nnuenet := $(shell grep EvalFileDefaultName evaluate.h | grep define | sed 's/.*\(nn-[a-z0-9]\{12\}.nnue\).*/\1/'))
 	@echo "Default net: $(nnuenet)"
 	$(eval nnuedownloadurl := https://tests.stockfishchess.org/api/nn/$(nnuenet))
 	$(eval curl_or_wget := $(shell if hash curl 2>/dev/null; then echo "curl -skL"; elif hash wget 2>/dev/null; then echo "wget -qO-"; fi))
@@ -758,7 +761,6 @@ net:
             echo "shasum / sha256sum not found, skipping net validation"; \
         fi
 
-
 # clean binaries and objects
 objclean:
 	@rm -f $(EXE) *.o ./syzygy/*.o ./nnue/*.o ./nnue/features/*.o
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index ce92db9a..67154751 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -20,51 +20,126 @@
 #include <cassert>
 #include <cstdlib>
 #include <cstring>   // For std::memset
+#include <fstream>
 #include <iomanip>
 #include <sstream>
 #include <iostream>
+#include <streambuf>
+#include <vector>
 
 #include "bitboard.h"
 #include "evaluate.h"
 #include "material.h"
+#include "misc.h"
 #include "pawns.h"
 #include "thread.h"
 #include "uci.h"
+#include "incbin/incbin.h"
+
+
+// Macro to embed the default NNUE file data in the engine binary (using incbin.h, by Dale Weiler).
+// This macro invocation will declare the following three variables
+//     const unsigned char        gEmbeddedNNUEData[];  // a pointer to the embedded data
+//     const unsigned char *const gEmbeddedNNUEEnd;     // a marker to the end
+//     const unsigned int         gEmbeddedNNUESize;    // the size of the embedded file
+// Note that this does not work in Microsof Visual Studio.
+#if !defined(_MSC_VER) && !defined(NNUE_EMBEDDING_OFF)
+  INCBIN(EmbeddedNNUE, EvalFileDefaultName);
+#else
+  const unsigned char        gEmbeddedNNUEData[1] = {0x0};
+  const unsigned char *const gEmbeddedNNUEEnd = &gEmbeddedNNUEData[1];
+  const unsigned int         gEmbeddedNNUESize = 1;
+#endif
+
+
+using namespace std;
+using namespace Eval::NNUE;
 
 namespace Eval {
 
   bool useNNUE;
-  std::string eval_file_loaded="None";
+  string eval_file_loaded = "None";
+
+  /// init_NNUE() tries to load a nnue network at startup time, or when the engine
+  /// receives a UCI command "setoption name EvalFile value nn-[a-z0-9]{12}.nnue"
+  /// The name of the nnue network is always retrieved from the EvalFile option.
+  /// We search the given network in three locations: internally (the default
+  /// network may be embedded in the binary), in the active working directory and
+  /// in the engine directory. Distro packagers may define the DEFAULT_NNUE_DIRECTORY
+  /// variable to have the engine search in a special directory in their distro.
 
   void init_NNUE() {
 
     useNNUE = Options["Use NNUE"];
-    std::string eval_file = std::string(Options["EvalFile"]);
-    if (useNNUE && eval_file_loaded != eval_file)
-        if (Eval::NNUE::load_eval_file(eval_file))
-            eval_file_loaded = eval_file;
+    if (!useNNUE)
+        return;
+
+    string eval_file = string(Options["EvalFile"]);
+
+    #if defined(DEFAULT_NNUE_DIRECTORY)
+    #define stringify2(x) #x
+    #define stringify(x) stringify2(x)
+    vector<string> dirs = { "<internal>" , "" , CommandLine::binaryDirectory , stringify(DEFAULT_NNUE_DIRECTORY) };
+    #else
+    vector<string> dirs = { "<internal>" , "" , CommandLine::binaryDirectory };
+    #endif
+
+    for (string directory : dirs)
+        if (eval_file_loaded != eval_file)
+        {
+            if (directory != "<internal>")
+            {
+                ifstream stream(directory + eval_file, ios::binary);
+                if (load_eval(eval_file, stream))
+                    eval_file_loaded = eval_file;
+            }
+
+            if (directory == "<internal>" && eval_file == EvalFileDefaultName)
+            {
+                // C++ way to prepare a buffer for a memory stream
+                class MemoryBuffer : public basic_streambuf<char> {
+                    public: MemoryBuffer(char* p, size_t n) { setg(p, p, p + n); setp(p, p + n); }
+                };
+
+                MemoryBuffer buffer(const_cast<char*>(reinterpret_cast<const char*>(gEmbeddedNNUEData)),
+                                    size_t(gEmbeddedNNUESize));
+
+                istream stream(&buffer);
+                if (load_eval(eval_file, stream))
+                    eval_file_loaded = eval_file;
+            }
+        }
   }
 
+  /// verify_NNUE() verifies that the last net used was loaded successfully
   void verify_NNUE() {
 
-    std::string eval_file = std::string(Options["EvalFile"]);
+    string eval_file = string(Options["EvalFile"]);
+
     if (useNNUE && eval_file_loaded != eval_file)
     {
         UCI::OptionsMap defaults;
         UCI::init(defaults);
 
-        sync_cout << "info string ERROR: NNUE evaluation used, but the network file " << eval_file << " was not loaded successfully." << sync_endl;
-        sync_cout << "info string ERROR: The UCI option EvalFile might need to specify the full path, including the directory/folder name, to the file." << sync_endl;
-        sync_cout << "info string ERROR: The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/"+std::string(defaults["EvalFile"]) << sync_endl;
-        sync_cout << "info string ERROR: If the UCI option Use NNUE is set to true, network evaluation parameters compatible with the program must be available." << sync_endl;
-        sync_cout << "info string ERROR: The engine will be terminated now." << sync_endl;
-        std::exit(EXIT_FAILURE);
+        string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
+        string msg2 = "The option is set to true, but the network file " + eval_file + " was not loaded successfully.";
+        string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
+        string msg4 = "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/" + string(defaults["EvalFile"]);
+        string msg5 = "The engine will be terminated now.";
+
+        sync_cout << "info string ERROR: " << msg1 << sync_endl;
+        sync_cout << "info string ERROR: " << msg2 << sync_endl;
+        sync_cout << "info string ERROR: " << msg3 << sync_endl;
+        sync_cout << "info string ERROR: " << msg4 << sync_endl;
+        sync_cout << "info string ERROR: " << msg5 << sync_endl;
+
+        exit(EXIT_FAILURE);
     }
 
     if (useNNUE)
-        sync_cout << "info string NNUE evaluation using " << eval_file << " enabled." << sync_endl;
+        sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
     else
-        sync_cout << "info string classical evaluation enabled." << sync_endl;
+        sync_cout << "info string classical evaluation enabled" << sync_endl;
   }
 }
 
diff --git a/src/evaluate.h b/src/evaluate.h
index e808068d..d701f5a7 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -35,12 +35,17 @@ namespace Eval {
   void init_NNUE();
   void verify_NNUE();
 
+  // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
+  // for the build process (profile-build and fishtest) to work. Do not change the
+  // name of the macro, as it is used in the Makefile.
+  #define EvalFileDefaultName   "nn-82215d0fd0df.nnue"
+
   namespace NNUE {
 
     Value evaluate(const Position& pos);
     Value compute_eval(const Position& pos);
     void  update_eval(const Position& pos);
-    bool  load_eval_file(const std::string& evalFile);
+    bool  load_eval(std::string streamName, std::istream& stream);
 
   } // namespace NNUE
 
diff --git a/src/incbin/UNLICENCE b/src/incbin/UNLICENCE
new file mode 100644
index 00000000..32484ab5
--- /dev/null
+++ b/src/incbin/UNLICENCE
@@ -0,0 +1,26 @@
+The file "incbin.h" is free and unencumbered software released into
+the public domain by Dale Weiler, see:
+   <https://github.com/graphitemaster/incbin>
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org/>
diff --git a/src/incbin/incbin.h b/src/incbin/incbin.h
new file mode 100755
index 00000000..c19684d7
--- /dev/null
+++ b/src/incbin/incbin.h
@@ -0,0 +1,368 @@
+/**
+ * @file incbin.h
+ * @author Dale Weiler
+ * @brief Utility for including binary files
+ *
+ * Facilities for including binary files into the current translation unit and
+ * making use from them externally in other translation units.
+ */
+#ifndef INCBIN_HDR
+#define INCBIN_HDR
+#include <limits.h>
+#if   defined(__AVX512BW__) || \
+      defined(__AVX512CD__) || \
+      defined(__AVX512DQ__) || \
+      defined(__AVX512ER__) || \
+      defined(__AVX512PF__) || \
+      defined(__AVX512VL__) || \
+      defined(__AVX512F__)
+# define INCBIN_ALIGNMENT_INDEX 6
+#elif defined(__AVX__)      || \
+      defined(__AVX2__)
+# define INCBIN_ALIGNMENT_INDEX 5
+#elif defined(__SSE__)      || \
+      defined(__SSE2__)     || \
+      defined(__SSE3__)     || \
+      defined(__SSSE3__)    || \
+      defined(__SSE4_1__)   || \
+      defined(__SSE4_2__)   || \
+      defined(__neon__)
+# define INCBIN_ALIGNMENT_INDEX 4
+#elif ULONG_MAX != 0xffffffffu
+# define INCBIN_ALIGNMENT_INDEX 3
+# else
+# define INCBIN_ALIGNMENT_INDEX 2
+#endif
+
+/* Lookup table of (1 << n) where `n' is `INCBIN_ALIGNMENT_INDEX' */
+#define INCBIN_ALIGN_SHIFT_0 1
+#define INCBIN_ALIGN_SHIFT_1 2
+#define INCBIN_ALIGN_SHIFT_2 4
+#define INCBIN_ALIGN_SHIFT_3 8
+#define INCBIN_ALIGN_SHIFT_4 16
+#define INCBIN_ALIGN_SHIFT_5 32
+#define INCBIN_ALIGN_SHIFT_6 64
+
+/* Actual alignment value */
+#define INCBIN_ALIGNMENT \
+    INCBIN_CONCATENATE( \
+        INCBIN_CONCATENATE(INCBIN_ALIGN_SHIFT, _), \
+        INCBIN_ALIGNMENT_INDEX)
+
+/* Stringize */
+#define INCBIN_STR(X) \
+    #X
+#define INCBIN_STRINGIZE(X) \
+    INCBIN_STR(X)
+/* Concatenate */
+#define INCBIN_CAT(X, Y) \
+    X ## Y
+#define INCBIN_CONCATENATE(X, Y) \
+    INCBIN_CAT(X, Y)
+/* Deferred macro expansion */
+#define INCBIN_EVAL(X) \
+    X
+#define INCBIN_INVOKE(N, ...) \
+    INCBIN_EVAL(N(__VA_ARGS__))
+
+/* Green Hills uses a different directive for including binary data */
+#if defined(__ghs__)
+#  if (__ghs_asm == 2)
+#    define INCBIN_MACRO ".file"
+/* Or consider the ".myrawdata" entry in the ld file */
+#  else
+#    define INCBIN_MACRO "\tINCBIN"
+#  endif
+#else
+#  define INCBIN_MACRO ".incbin"
+#endif
+
+#ifndef _MSC_VER
+#  define INCBIN_ALIGN \
+    __attribute__((aligned(INCBIN_ALIGNMENT)))
+#else
+#  define INCBIN_ALIGN __declspec(align(INCBIN_ALIGNMENT))
+#endif
+
+#if defined(__arm__) || /* GNU C and RealView */ \
+    defined(__arm) || /* Diab */ \
+    defined(_ARM) /* ImageCraft */
+#  define INCBIN_ARM
+#endif
+
+#ifdef __GNUC__
+/* Utilize .balign where supported */
+#  define INCBIN_ALIGN_HOST ".balign " INCBIN_STRINGIZE(INCBIN_ALIGNMENT) "\n"
+#  define INCBIN_ALIGN_BYTE ".balign 1\n"
+#elif defined(INCBIN_ARM)
+/*
+ * On arm assemblers, the alignment value is calculated as (1 << n) where `n' is
+ * the shift count. This is the value passed to `.align'
+ */
+#  define INCBIN_ALIGN_HOST ".align " INCBIN_STRINGIZE(INCBIN_ALIGNMENT_INDEX) "\n"
+#  define INCBIN_ALIGN_BYTE ".align 0\n"
+#else
+/* We assume other inline assembler's treat `.align' as `.balign' */
+#  define INCBIN_ALIGN_HOST ".align " INCBIN_STRINGIZE(INCBIN_ALIGNMENT) "\n"
+#  define INCBIN_ALIGN_BYTE ".align 1\n"
+#endif
+
+/* INCBIN_CONST is used by incbin.c generated files */
+#if defined(__cplusplus)
+#  define INCBIN_EXTERNAL extern "C"
+#  define INCBIN_CONST    extern const
+#else
+#  define INCBIN_EXTERNAL extern
+#  define INCBIN_CONST    const
+#endif
+
+/**
+ * @brief Optionally override the linker section into which data is emitted.
+ *
+ * @warning If you use this facility, you'll have to deal with platform-specific linker output
+ * section naming on your own
+ *
+ * Overriding the default linker output section, e.g for esp8266/Arduino:
+ * @code
+ * #define INCBIN_OUTPUT_SECTION ".irom.text"
+ * #include "incbin.h"
+ * INCBIN(Foo, "foo.txt");
+ * // Data is emitted into program memory that never gets copied to RAM
+ * @endcode
+ */
+#if !defined(INCBIN_OUTPUT_SECTION)
+#  if defined(__APPLE__)
+#    define INCBIN_OUTPUT_SECTION         ".const_data"
+#  else
+#    define INCBIN_OUTPUT_SECTION         ".rodata"
+#  endif
+#endif
+
+#if defined(__APPLE__)
+/* The directives are different for Apple branded compilers */
+#  define INCBIN_SECTION         INCBIN_OUTPUT_SECTION "\n"
+#  define INCBIN_GLOBAL(NAME)    ".globl " INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME "\n"
+#  define INCBIN_INT             ".long "
+#  define INCBIN_MANGLE          "_"
+#  define INCBIN_BYTE            ".byte "
+#  define INCBIN_TYPE(...)
+#else
+#  define INCBIN_SECTION         ".section " INCBIN_OUTPUT_SECTION "\n"
+#  define INCBIN_GLOBAL(NAME)    ".global " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME "\n"
+#  if defined(__ghs__)
+#    define INCBIN_INT           ".word "
+#  else
+#    define INCBIN_INT           ".int "
+#  endif
+#  if defined(__USER_LABEL_PREFIX__)
+#    define INCBIN_MANGLE        INCBIN_STRINGIZE(__USER_LABEL_PREFIX__)
+#  else
+#    define INCBIN_MANGLE        ""
+#  endif
+#  if defined(INCBIN_ARM)
+/* On arm assemblers, `@' is used as a line comment token */
+#    define INCBIN_TYPE(NAME)    ".type " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME ", %object\n"
+#  elif defined(__MINGW32__) || defined(__MINGW64__)
+/* Mingw doesn't support this directive either */
+#    define INCBIN_TYPE(NAME)
+#  else
+/* It's safe to use `@' on other architectures */
+#    define INCBIN_TYPE(NAME)    ".type " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME ", @object\n"
+#  endif
+#  define INCBIN_BYTE            ".byte "
+#endif
+
+/* List of style types used for symbol names */
+#define INCBIN_STYLE_CAMEL 0
+#define INCBIN_STYLE_SNAKE 1
+
+/**
+ * @brief Specify the prefix to use for symbol names.
+ *
+ * By default this is `g', producing symbols of the form:
+ * @code
+ * #include "incbin.h"
+ * INCBIN(Foo, "foo.txt");
+ *
+ * // Now you have the following symbols:
+ * // const unsigned char gFooData[];
+ * // const unsigned char *const gFooEnd;
+ * // const unsigned int gFooSize;
+ * @endcode
+ *
+ * If however you specify a prefix before including: e.g:
+ * @code
+ * #define INCBIN_PREFIX incbin
+ * #include "incbin.h"
+ * INCBIN(Foo, "foo.txt");
+ *
+ * // Now you have the following symbols instead:
+ * // const unsigned char incbinFooData[];
+ * // const unsigned char *const incbinFooEnd;
+ * // const unsigned int incbinFooSize;
+ * @endcode
+ */
+#if !defined(INCBIN_PREFIX)
+#  define INCBIN_PREFIX g
+#endif
+
+/**
+ * @brief Specify the style used for symbol names.
+ *
+ * Possible options are
+ * - INCBIN_STYLE_CAMEL "CamelCase"
+ * - INCBIN_STYLE_SNAKE "snake_case"
+ *
+ * Default option is *INCBIN_STYLE_CAMEL* producing symbols of the form:
+ * @code
+ * #include "incbin.h"
+ * INCBIN(Foo, "foo.txt");
+ *
+ * // Now you have the following symbols:
+ * // const unsigned char <prefix>FooData[];
+ * // const unsigned char *const <prefix>FooEnd;
+ * // const unsigned int <prefix>FooSize;
+ * @endcode
+ *
+ * If however you specify a style before including: e.g:
+ * @code
+ * #define INCBIN_STYLE INCBIN_STYLE_SNAKE
+ * #include "incbin.h"
+ * INCBIN(foo, "foo.txt");
+ *
+ * // Now you have the following symbols:
+ * // const unsigned char <prefix>foo_data[];
+ * // const unsigned char *const <prefix>foo_end;
+ * // const unsigned int <prefix>foo_size;
+ * @endcode
+ */
+#if !defined(INCBIN_STYLE)
+#  define INCBIN_STYLE INCBIN_STYLE_CAMEL
+#endif
+
+/* Style lookup tables */
+#define INCBIN_STYLE_0_DATA Data
+#define INCBIN_STYLE_0_END End
+#define INCBIN_STYLE_0_SIZE Size
+#define INCBIN_STYLE_1_DATA _data
+#define INCBIN_STYLE_1_END _end
+#define INCBIN_STYLE_1_SIZE _size
+
+/* Style lookup: returning identifier */
+#define INCBIN_STYLE_IDENT(TYPE) \
+    INCBIN_CONCATENATE( \
+        INCBIN_STYLE_, \
+        INCBIN_CONCATENATE( \
+            INCBIN_EVAL(INCBIN_STYLE), \
+            INCBIN_CONCATENATE(_, TYPE)))
+
+/* Style lookup: returning string literal */
+#define INCBIN_STYLE_STRING(TYPE) \
+    INCBIN_STRINGIZE( \
+        INCBIN_STYLE_IDENT(TYPE)) \
+
+/* Generate the global labels by indirectly invoking the macro with our style
+ * type and concatenating the name against them. */
+#define INCBIN_GLOBAL_LABELS(NAME, TYPE) \
+    INCBIN_INVOKE( \
+        INCBIN_GLOBAL, \
+        INCBIN_CONCATENATE( \
+            NAME, \
+            INCBIN_INVOKE( \
+                INCBIN_STYLE_IDENT, \
+                TYPE))) \
+    INCBIN_INVOKE( \
+        INCBIN_TYPE, \
+        INCBIN_CONCATENATE( \
+            NAME, \
+            INCBIN_INVOKE( \
+                INCBIN_STYLE_IDENT, \
+                TYPE)))
+
+/**
+ * @brief Externally reference binary data included in another translation unit.
+ *
+ * Produces three external symbols that reference the binary data included in
+ * another translation unit.
+ *
+ * The symbol names are a concatenation of `INCBIN_PREFIX' before *NAME*; with
+ * "Data", as well as "End" and "Size" after. An example is provided below.
+ *
+ * @param NAME The name given for the binary data
+ *
+ * @code
+ * INCBIN_EXTERN(Foo);
+ *
+ * // Now you have the following symbols:
+ * // extern const unsigned char <prefix>FooData[];
+ * // extern const unsigned char *const <prefix>FooEnd;
+ * // extern const unsigned int <prefix>FooSize;
+ * @endcode
+ */
+#define INCBIN_EXTERN(NAME) \
+    INCBIN_EXTERNAL const INCBIN_ALIGN unsigned char \
+        INCBIN_CONCATENATE( \
+            INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
+            INCBIN_STYLE_IDENT(DATA))[]; \
+    INCBIN_EXTERNAL const INCBIN_ALIGN unsigned char *const \
+    INCBIN_CONCATENATE( \
+        INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
+        INCBIN_STYLE_IDENT(END)); \
+    INCBIN_EXTERNAL const unsigned int \
+        INCBIN_CONCATENATE( \
+            INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
+            INCBIN_STYLE_IDENT(SIZE))
+
+/**
+ * @brief Include a binary file into the current translation unit.
+ *
+ * Includes a binary file into the current translation unit, producing three symbols
+ * for objects that encode the data and size respectively.
+ *
+ * The symbol names are a concatenation of `INCBIN_PREFIX' before *NAME*; with
+ * "Data", as well as "End" and "Size" after. An example is provided below.
+ *
+ * @param NAME The name to associate with this binary data (as an identifier.)
+ * @param FILENAME The file to include (as a string literal.)
+ *
+ * @code
+ * INCBIN(Icon, "icon.png");
+ *
+ * // Now you have the following symbols:
+ * // const unsigned char <prefix>IconData[];
+ * // const unsigned char *const <prefix>IconEnd;
+ * // const unsigned int <prefix>IconSize;
+ * @endcode
+ *
+ * @warning This must be used in global scope
+ * @warning The identifiers may be different if INCBIN_STYLE is not default
+ *
+ * To externally reference the data included by this in another translation unit
+ * please @see INCBIN_EXTERN.
+ */
+#ifdef _MSC_VER
+#define INCBIN(NAME, FILENAME) \
+    INCBIN_EXTERN(NAME)
+#else
+#define INCBIN(NAME, FILENAME) \
+    __asm__(INCBIN_SECTION \
+            INCBIN_GLOBAL_LABELS(NAME, DATA) \
+            INCBIN_ALIGN_HOST \
+            INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(DATA) ":\n" \
+            INCBIN_MACRO " \"" FILENAME "\"\n" \
+            INCBIN_GLOBAL_LABELS(NAME, END) \
+            INCBIN_ALIGN_BYTE \
+            INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(END) ":\n" \
+                INCBIN_BYTE "1\n" \
+            INCBIN_GLOBAL_LABELS(NAME, SIZE) \
+            INCBIN_ALIGN_HOST \
+            INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(SIZE) ":\n" \
+                INCBIN_INT INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(END) " - " \
+                           INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(DATA) "\n" \
+            INCBIN_ALIGN_HOST \
+            ".text\n" \
+    ); \
+    INCBIN_EXTERN(NAME)
+
+#endif
+#endif
diff --git a/src/main.cpp b/src/main.cpp
index fbad6622..f95db1c2 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -35,6 +35,7 @@ int main(int argc, char* argv[]) {
 
   std::cout << engine_info() << std::endl;
 
+  CommandLine::init(argc, argv);
   UCI::init(Options);
   Tune::init();
   PSQT::init();
diff --git a/src/misc.cpp b/src/misc.cpp
index 80c436ac..3fbdea35 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -132,6 +132,7 @@ public:
 
 } // namespace
 
+
 /// engine_info() returns the full name of the current Stockfish version. This
 /// will be either "Stockfish <Tag> DD-MM-YY" (where DD-MM-YY is the date when
 /// the program was compiled) or "Stockfish <Version>", depending on whether
@@ -589,3 +590,61 @@ void bindThisThread(size_t idx) {
 #endif
 
 } // namespace WinProcGroup
+
+#ifdef _WIN32
+#include <direct.h>
+#define GETCWD _getcwd
+#else
+#include <unistd.h>
+#define GETCWD getcwd
+#endif
+
+namespace CommandLine {
+
+string argv0;            // path+name of the executable binary, as given by argv[0]
+string binaryDirectory;  // path of the executable directory
+string workingDirectory; // path of the working directory
+string pathSeparator;    // Separator for our current OS
+
+void init(int argc, char* argv[]) {
+    (void)argc;
+    string separator;
+
+    // extract the path+name of the executable binary
+    argv0 = argv[0];
+
+#ifdef _WIN32
+    pathSeparator = "\\";
+  #ifdef _MSC_VER
+    // Under windows argv[0] may not have the extension. Also _get_pgmptr() had
+    // issues in some windows 10 versions, so check returned values carefully.
+    char* pgmptr = nullptr;
+    if (!_get_pgmptr(&pgmptr) && pgmptr != nullptr && *pgmptr)
+        argv0 = pgmptr;
+  #endif
+#else
+    pathSeparator = "/";
+#endif
+
+    // extract the working directory
+    workingDirectory = "";
+    char buff[40000];
+    char* cwd = GETCWD(buff, 40000);
+    if (cwd)
+        workingDirectory = cwd;
+
+    // extract the binary directory path from argv0
+    binaryDirectory = argv0;
+    size_t pos = binaryDirectory.find_last_of("\\/");
+    if (pos == std::string::npos)
+        binaryDirectory = "." + pathSeparator;
+    else
+        binaryDirectory.resize(pos + 1);
+
+    // pattern replacement: "./" at the start of path is replaced by the working directory
+    if (binaryDirectory.find("." + pathSeparator) == 0)
+        binaryDirectory.replace(0, 1, workingDirectory);
+}
+
+
+} // namespace CommandLine
diff --git a/src/misc.h b/src/misc.h
index 8ad17b50..68b9c884 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -42,9 +42,7 @@ void dbg_mean_of(int v);
 void dbg_print();
 
 typedef std::chrono::milliseconds::rep TimePoint; // A value in milliseconds
-
 static_assert(sizeof(TimePoint) == sizeof(int64_t), "TimePoint should be 64 bits");
-
 inline TimePoint now() {
   return std::chrono::duration_cast<std::chrono::milliseconds>
         (std::chrono::steady_clock::now().time_since_epoch()).count();
@@ -126,4 +124,11 @@ namespace WinProcGroup {
   void bindThisThread(size_t idx);
 }
 
+namespace CommandLine {
+  void init(int argc, char* argv[]);
+
+  extern std::string binaryDirectory;  // path of the executable directory
+  extern std::string workingDirectory; // path of the working directory
+}
+
 #endif // #ifndef MISC_H_INCLUDED
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index e6619089..d6ac9894 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -18,7 +18,6 @@
 
 // Code for calculating NNUE evaluation function
 
-#include <fstream>
 #include <iostream>
 #include <set>
 
@@ -143,17 +142,12 @@ namespace Eval::NNUE {
     return accumulator.score;
   }
 
-  // Load the evaluation function file
-  bool load_eval_file(const std::string& evalFile) {
+  // Load eval, from a file stream or a memory stream
+  bool load_eval(std::string streamName, std::istream& stream) {
 
     Initialize();
-    fileName = evalFile;
-
-    std::ifstream stream(evalFile, std::ios::binary);
-
-    const bool result = ReadParameters(stream);
-
-    return result;
+    fileName = streamName;
+    return ReadParameters(stream);
   }
 
   // Evaluation function. Perform differential calculation.
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index ec83c7c8..5e747a7f 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -21,6 +21,7 @@
 #include <ostream>
 #include <sstream>
 
+#include "evaluate.h"
 #include "misc.h"
 #include "search.h"
 #include "thread.h"
@@ -79,9 +80,7 @@ void init(OptionsMap& o) {
   o["Syzygy50MoveRule"]      << Option(true);
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
   o["Use NNUE"]              << Option(true, on_use_NNUE);
-  // The default must follow the format nn-[SHA256 first 12 digits].nnue
-  // for the build process (profile-build and fishtest) to work.
-  o["EvalFile"]              << Option("nn-82215d0fd0df.nnue", on_eval_file);
+  o["EvalFile"]              << Option(EvalFileDefaultName, on_eval_file);
 }
 
 
From 7d6668515c5b044df66ad1cdc3a1f75843cf5f56 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sun, 30 Aug 2020 14:54:07 +0900
Subject: [PATCH 002/398] Added -static link option to the learn and
 profile-learn targets.

---
 src/Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 9372b915..cc63ab15 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -889,7 +889,7 @@ icc-profile-use:
 learn: config-sanity
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
 	EXTRACXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp ' \
-	EXTRALDFLAGS=' -lopenblas -fopenmp -Wl,-s ' \
+	EXTRALDFLAGS=' -lopenblas -fopenmp -Wl,-s -static ' \
 	all
 	
 profile-learn: config-sanity objclean profileclean
@@ -897,7 +897,7 @@ profile-learn: config-sanity objclean profileclean
 	@echo "Step 1/4. Building instrumented executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
 	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp ' \
-	LEARNLDFLAGS=' -lopenblas -fopenmp -Wl,-s '
+	LEARNLDFLAGS=' -lopenblas -fopenmp -Wl,-s -static '
 	@echo ""
 	@echo "Step 2/4. Running benchmark for pgo-build ..."
 	$(PGOGENSFEN) 
@@ -906,7 +906,7 @@ profile-learn: config-sanity objclean profileclean
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use) \
 	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp ' \
-	LEARNLDFLAGS=' -lopenblas -fopenmp -Wl,-s '
+	LEARNLDFLAGS=' -lopenblas -fopenmp -Wl,-s -static '
 	@echo ""
 	@echo "Step 4/4. Deleting profile data ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) profileclean

From e4ed7d3dd7b8895ce523180cb3da3ec2714050fc Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Wed, 26 Aug 2020 18:00:54 +0200
Subject: [PATCH 003/398] Cleaner make help

do not print details if ARCH is an empty string. Follow up for b0b4ca17db49ed03057b5fa4ee4a12dab0e9c9e6

https://github.com/official-stockfish/Stockfish/pull/3071

No functional change
---
 src/Makefile | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 5f363f02..9ae5a51c 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -85,12 +85,17 @@ endif
 
 ### 2.1. General and architecture defaults
 
+ifeq ($(ARCH),)
+   ARCH = x86-64-modern
+   help_skip_sanity = yes
+endif
 # explicitly check for the list of supported architectures (as listed with make help),
 # the user can override with `make ARCH=x86-32-vnni256 SUPPORTED_ARCH=true`
-ifeq ($(ARCH),$(filter $(ARCH),x86-64-vnni512 x86-64-vnni256 x86-64-avx512 x86-64-bmi2 x86-64-avx2 \
-                               x86-64-sse41-popcnt x86-64-modern x86-64-ssse3 x86-64-sse3-popcnt \
-                               x86-64 x86-32-sse41-popcnt x86-32-sse2 x86-32 ppc-64 ppc-32 \
-                               armv7 armv7-neon armv8 apple-silicon general-64 general-32))
+ifeq ($(ARCH), $(filter $(ARCH), \
+                 x86-64-vnni512 x86-64-vnni256 x86-64-avx512 x86-64-bmi2 x86-64-avx2 \
+                 x86-64-sse41-popcnt x86-64-modern x86-64-ssse3 x86-64-sse3-popcnt \
+                 x86-64 x86-32-sse41-popcnt x86-32-sse2 x86-32 ppc-64 ppc-32 \
+                 armv7 armv7-neon armv8 apple-silicon general-64 general-32))
    SUPPORTED_ARCH=true
 else
    SUPPORTED_ARCH=false
@@ -113,7 +118,6 @@ avx512 = no
 vnni256 = no
 vnni512 = no
 neon = no
-ARCH = x86-64-modern
 STRIP = strip
 
 ### 2.2 Architecture specific
@@ -695,11 +699,12 @@ help:
 	@echo "make -j build ARCH=x86-64-ssse3 COMP=clang"
 	@echo ""
 	@echo "-------------------------------"
-ifeq ($(SUPPORTED_ARCH), true)
+ifeq ($(SUPPORTED_ARCH)$(help_skip_sanity), true)
 	@echo "The selected architecture $(ARCH) will enable the following configuration: "
 	@$(MAKE) ARCH=$(ARCH) COMP=$(COMP) config-sanity
 else
 	@echo "Specify a supported architecture with the ARCH option for more details"
+	@echo ""
 endif
 
 
From d90d893b5eeea020b2d59d1372f5aa0a20b45412 Mon Sep 17 00:00:00 2001
From: Unai Corzo <corzounai@gmail.com>
Date: Fri, 28 Aug 2020 09:27:15 +0200
Subject: [PATCH 004/398] Reintroduce depth reduction

Reintroduce depth reduction if the position is not in TT.

STC https://tests.stockfishchess.org/tests/view/5f4652e85089a564a10d868c
LLR: 2.97 (-2.94,2.94) {-0.25,1.25}
Total: 40240 W: 4535 L: 4331 D: 31374
Ptnml(0-2): 215, 3276, 12969, 3410, 250

LTC https://tests.stockfishchess.org/tests/view/5f46ca5e5089a564a10d86f3
LLR: 2.93 (-2.94,2.94) {0.25,1.25}
Total: 63096 W: 3426 L: 3188 D: 56482
Ptnml(0-2): 51, 2798, 25645, 2970, 84

closes https://github.com/official-stockfish/Stockfish/pull/3072

bench: 3611906
---
 src/search.cpp | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index cae8a684..77447043 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -939,6 +939,12 @@ namespace {
                 }
             }
     }
+    
+    // Step 11. If the position is not in TT, decrease depth by 2
+    if (   PvNode
+        && depth >= 6
+        && !ttMove)
+        depth -= 2;
 
 moves_loop: // When in check, search starts from here
 
@@ -963,7 +969,7 @@ moves_loop: // When in check, search starts from here
     // Mark this node as being searched
     ThreadHolding th(thisThread, posKey, ss->ply);
 
-    // Step 11. Loop through all pseudo-legal moves until no moves remain
+    // Step 12. Loop through all pseudo-legal moves until no moves remain
     // or a beta cutoff occurs.
     while ((move = mp.next_move(moveCountPruning)) != MOVE_NONE)
     {
@@ -1001,7 +1007,7 @@ moves_loop: // When in check, search starts from here
       // Calculate new depth for this move
       newDepth = depth - 1;
 
-      // Step 12. Pruning at shallow depth (~200 Elo)
+      // Step 13. Pruning at shallow depth (~200 Elo)
       if (  !rootNode
           && pos.non_pawn_material(us)
           && bestValue > VALUE_TB_LOSS_IN_MAX_PLY)
@@ -1059,7 +1065,7 @@ moves_loop: // When in check, search starts from here
           }
       }
 
-      // Step 13. Extensions (~75 Elo)
+      // Step 14. Extensions (~75 Elo)
 
       // Singular extension search (~70 Elo). If all moves but one fail low on a
       // search of (alpha-s, beta-s), and just one fails high on (alpha, beta),
@@ -1142,10 +1148,10 @@ moves_loop: // When in check, search starts from here
                                                                 [movedPiece]
                                                                 [to_sq(move)];
 
-      // Step 14. Make the move
+      // Step 15. Make the move
       pos.do_move(move, st, givesCheck);
 
-      // Step 15. Reduced depth search (LMR, ~200 Elo). If the move fails high it will be
+      // Step 16. Reduced depth search (LMR, ~200 Elo). If the move fails high it will be
       // re-searched at full depth.
       if (    depth >= 3
           &&  moveCount > 1 + 2 * rootNode + 2 * (PvNode && abs(bestValue) < 2)
@@ -1248,7 +1254,7 @@ moves_loop: // When in check, search starts from here
           didLMR = false;
       }
 
-      // Step 16. Full depth search when LMR is skipped or fails high
+      // Step 17. Full depth search when LMR is skipped or fails high
       if (doFullDepthSearch)
       {
           value = -search<NonPV>(pos, ss+1, -(alpha+1), -alpha, newDepth, !cutNode);
@@ -1276,12 +1282,12 @@ moves_loop: // When in check, search starts from here
           value = -search<PV>(pos, ss+1, -beta, -alpha, newDepth, false);
       }
 
-      // Step 17. Undo move
+      // Step 18. Undo move
       pos.undo_move(move);
 
       assert(value > -VALUE_INFINITE && value < VALUE_INFINITE);
 
-      // Step 18. Check for a new best move
+      // Step 19. Check for a new best move
       // Finished searching the move. If a stop occurred, the return value of
       // the search cannot be trusted, and we return immediately without
       // updating best move, PV and TT.
@@ -1358,7 +1364,7 @@ moves_loop: // When in check, search starts from here
         return VALUE_DRAW;
     */
 
-    // Step 19. Check for mate and stalemate
+    // Step 20. Check for mate and stalemate
     // All legal moves have been searched and if there are no legal moves, it
     // must be a mate or a stalemate. If we are in a singular extension search then
     // return a fail low score.

From c02b3a4c7a339d212d5c6f75b3b89c926d33a800 Mon Sep 17 00:00:00 2001
From: MJZ1977 <37274752+MJZ1977@users.noreply.github.com>
Date: Fri, 28 Aug 2020 12:06:36 +0200
Subject: [PATCH 005/398] Add / remove leaves from search tree ttPv

add if previous leaf is in search tree and we didn't find a counter move
else remove the position if the leaf is the last one in search tree.

STC : https://tests.stockfishchess.org/tests/view/5f49203c3def640786115314
LLR: 2.95 (-2.94,2.94) {-0.25,1.25}
Total: 29968 W: 3381 L: 3195 D: 23392
Ptnml(0-2): 146, 2432, 9671, 2560, 175

LTC : https://tests.stockfishchess.org/tests/view/5f494bea3def640786115336
LLR: 2.96 (-2.94,2.94) {0.25,1.25}
Total: 84952 W: 4619 L: 4333 D: 76000
Ptnml(0-2): 86, 3765, 34481, 4065, 79

closes https://github.com/official-stockfish/Stockfish/pull/3075

Bench 3527337
---
 src/search.cpp | 34 ++++++++++++++++++++++++----------
 src/search.h   |  1 +
 2 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 77447043..a2342a3c 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -597,7 +597,7 @@ namespace {
     Move ttMove, move, excludedMove, bestMove;
     Depth extension, newDepth;
     Value bestValue, value, ttValue, eval, maxValue, probCutBeta;
-    bool ttHit, ttPv, formerPv, givesCheck, improving, didLMR, priorCapture;
+    bool ttHit, formerPv, givesCheck, improving, didLMR, priorCapture;
     bool captureOrPromotion, doFullDepthSearch, moveCountPruning,
          ttCapture, singularQuietLMR;
     Piece movedPiece;
@@ -644,6 +644,7 @@ namespace {
     assert(0 <= ss->ply && ss->ply < MAX_PLY);
 
     (ss+1)->ply = ss->ply + 1;
+    (ss+1)->ttPv = false;
     (ss+1)->excludedMove = bestMove = MOVE_NONE;
     (ss+2)->killers[0] = (ss+2)->killers[1] = MOVE_NONE;
     Square prevSq = to_sq((ss-1)->currentMove);
@@ -667,10 +668,11 @@ namespace {
     ttValue = ttHit ? value_from_tt(tte->value(), ss->ply, pos.rule50_count()) : VALUE_NONE;
     ttMove =  rootNode ? thisThread->rootMoves[thisThread->pvIdx].pv[0]
             : ttHit    ? tte->move() : MOVE_NONE;
-    ttPv = PvNode || (ttHit && tte->is_pv());
-    formerPv = ttPv && !PvNode;
+    if (!excludedMove)
+        ss->ttPv = PvNode || (ttHit && tte->is_pv());
+    formerPv = ss->ttPv && !PvNode;
 
-    if (   ttPv
+    if (   ss->ttPv
         && depth > 12
         && ss->ply - 1 < MAX_LPH
         && !priorCapture
@@ -748,7 +750,7 @@ namespace {
                 if (    b == BOUND_EXACT
                     || (b == BOUND_LOWER ? value >= beta : value <= alpha))
                 {
-                    tte->save(posKey, value_to_tt(value, ss->ply), ttPv, b,
+                    tte->save(posKey, value_to_tt(value, ss->ply), ss->ttPv, b,
                               std::min(MAX_PLY - 1, depth + 6),
                               MOVE_NONE, VALUE_NONE);
 
@@ -798,7 +800,7 @@ namespace {
         else
             ss->staticEval = eval = -(ss-1)->staticEval + 2 * Tempo;
 
-        tte->save(posKey, VALUE_NONE, ttPv, BOUND_NONE, DEPTH_NONE, MOVE_NONE, eval);
+        tte->save(posKey, VALUE_NONE, ss->ttPv, BOUND_NONE, DEPTH_NONE, MOVE_NONE, eval);
     }
 
     // Step 7. Razoring (~1 Elo)
@@ -824,7 +826,7 @@ namespace {
         && (ss-1)->statScore < 22977
         &&  eval >= beta
         &&  eval >= ss->staticEval
-        &&  ss->staticEval >= beta - 30 * depth - 28 * improving + 84 * ttPv + 182
+        &&  ss->staticEval >= beta - 30 * depth - 28 * improving + 84 * ss->ttPv + 182
         && !excludedMove
         &&  pos.non_pawn_material(us)
         && (ss->ply >= thisThread->nmpMinPly || us != thisThread->nmpColor))
@@ -898,6 +900,8 @@ namespace {
         assert(probCutBeta < VALUE_INFINITE);
         MovePicker mp(pos, ttMove, probCutBeta - ss->staticEval, &captureHistory);
         int probCutCount = 0;
+        bool ttPv = ss->ttPv;
+        ss->ttPv = false;
 
         while (   (move = mp.next_move()) != MOVE_NONE
                && probCutCount < 2 + 2 * cutNode)
@@ -938,6 +942,7 @@ namespace {
                     return value;
                 }
             }
+         ss->ttPv = ttPv;
     }
     
     // Step 11. If the position is not in TT, decrease depth by 2
@@ -1180,7 +1185,7 @@ moves_loop: // When in check, search starts from here
               r++;
 
           // Decrease reduction if position is or has been on the PV (~10 Elo)
-          if (ttPv)
+          if (ss->ttPv)
               r -= 2;
 
           if (moveCountPruning && !formerPv)
@@ -1209,7 +1214,7 @@ moves_loop: // When in check, search starts from here
               // hence break make_move(). (~2 Elo)
               else if (    type_of(move) == NORMAL
                        && !pos.see_ge(reverse_move(move)))
-                  r -= 2 + ttPv - (type_of(movedPiece) == PAWN);
+                  r -= 2 + ss->ttPv - (type_of(movedPiece) == PAWN);
 
               ss->statScore =  thisThread->mainHistory[us][from_to(move)]
                              + (*contHist[0])[movedPiece][to_sq(move)]
@@ -1387,8 +1392,17 @@ moves_loop: // When in check, search starts from here
     if (PvNode)
         bestValue = std::min(bestValue, maxValue);
 
+    // If no good move is found and the previous position was ttPv, then the previous
+    // opponent move is probably good and the new position is added to the search tree.
+    if (bestValue <= alpha)
+        ss->ttPv = ss->ttPv || ((ss-1)->ttPv && depth > 3);
+    // Otherwise, a counter move has been found and if the position is the last leaf
+    // in the search tree, remove the position from the search tree.
+    else if (depth > 3)
+        ss->ttPv = ss->ttPv && (ss+1)->ttPv;
+
     if (!excludedMove && !(rootNode && thisThread->pvIdx))
-        tte->save(posKey, value_to_tt(bestValue, ss->ply), ttPv,
+        tte->save(posKey, value_to_tt(bestValue, ss->ply), ss->ttPv,
                   bestValue >= beta ? BOUND_LOWER :
                   PvNode && bestMove ? BOUND_EXACT : BOUND_UPPER,
                   depth, bestMove, ss->staticEval);
diff --git a/src/search.h b/src/search.h
index 2554f3fb..79085189 100644
--- a/src/search.h
+++ b/src/search.h
@@ -48,6 +48,7 @@ struct Stack {
   int statScore;
   int moveCount;
   bool inCheck;
+  bool ttPv;
 };
 
 
From 9b5b9ec9a6a3a0e46ac00f58976887560948a7e2 Mon Sep 17 00:00:00 2001
From: VoyagerOne <excelgeek@gmail.com>
Date: Sat, 29 Aug 2020 21:13:05 -0400
Subject: [PATCH 006/398] QS Pruning Simplification

Remove depth dependence in QS pruning

STC:
LLR: 2.95 (-2.94,2.94) {-1.25,0.25}
Total: 40536 W: 4442 L: 4358 D: 31736
Ptnml(0-2): 209, 3330, 13118, 3390, 221
https://tests.stockfishchess.org/tests/view/5f49035b3def6407861152f9

LTC:
LLR: 2.95 (-2.94,2.94) {-0.75,0.25}
Total: 97104 W: 5164 L: 5130 D: 86810
Ptnml(0-2): 103, 4478, 39377, 4470, 124
https://tests.stockfishchess.org/tests/view/5f4939d53def640786115322

closes https://github.com/official-stockfish/Stockfish/pull/3077

Bench: 3865238
---
 src/search.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/search.cpp b/src/search.cpp
index a2342a3c..b319dff5 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1585,7 +1585,7 @@ moves_loop: // When in check, search starts from here
                                                                 [to_sq(move)];
 
       if (  !captureOrPromotion
-          && moveCount >= abs(depth) + 1
+          && moveCount
           && (*contHist[0])[pos.moved_piece(move)][to_sq(move)] < CounterMovePruneThreshold
           && (*contHist[1])[pos.moved_piece(move)][to_sq(move)] < CounterMovePruneThreshold)
           continue;

From e0bafa1911ede61b9268e0b461a5d8856d6cd6be Mon Sep 17 00:00:00 2001
From: FauziAkram <fauzi.dabat@hotmail.com>
Date: Sun, 30 Aug 2020 13:58:05 +0300
Subject: [PATCH 007/398] Update parametes in classical evaluation.

Passed STC (NNUE=False):
https://tests.stockfishchess.org/tests/view/5f42edfe5089a564a10d84a0
LLR: 2.96 (-2.94,2.94) {-0.25,1.25}
Total: 13840 W: 2591 L: 2336 D: 8913
Ptnml(0-2): 194, 1453, 3387, 1676, 210

Passed LTC (NNUE=False):
https://tests.stockfishchess.org/tests/view/5f4369795089a564a10d84d8
LLR: 2.95 (-2.94,2.94) {0.25,1.25}
Total: 159744 W: 19430 L: 18850 D: 121464
Ptnml(0-2): 960, 14185, 49030, 14709, 988

closes https://github.com/official-stockfish/Stockfish/pull/3080

bench: 3736029
---
 src/evaluate.cpp | 10 +++++-----
 src/search.cpp   |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 67154751..09f36513 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -227,26 +227,26 @@ namespace {
 
   // Outpost[knight/bishop] contains bonuses for each knight or bishop occupying a
   // pawn protected square on rank 4 to 6 which is also safe from a pawn attack.
-  constexpr Score Outpost[] = { S(56, 36), S(30, 23) };
+  constexpr Score Outpost[] = { S(56, 34), S(31, 23) };
 
   // PassedRank[Rank] contains a bonus according to the rank of a passed pawn
   constexpr Score PassedRank[RANK_NB] = {
-    S(0, 0), S(10, 28), S(17, 33), S(15, 41), S(62, 72), S(168, 177), S(276, 260)
+    S(0, 0), S(9, 28), S(15, 31), S(17, 39), S(64, 70), S(171, 177), S(277, 260)
   };
 
   // RookOnFile[semiopen/open] contains bonuses for each rook when there is
   // no (friendly) pawn on the rook file.
-  constexpr Score RookOnFile[] = { S(19, 7), S(48, 29) };
+  constexpr Score RookOnFile[] = { S(19, 7), S(48, 27) };
 
   // ThreatByMinor/ByRook[attacked PieceType] contains bonuses according to
   // which piece type attacks which one. Attacks on lesser pieces which are
   // pawn-defended are not considered.
   constexpr Score ThreatByMinor[PIECE_TYPE_NB] = {
-    S(0, 0), S(5, 32), S(57, 41), S(77, 56), S(88, 119), S(79, 161)
+    S(0, 0), S(5, 32), S(55, 41), S(77, 56), S(89, 119), S(79, 162)
   };
 
   constexpr Score ThreatByRook[PIECE_TYPE_NB] = {
-    S(0, 0), S(3, 46), S(37, 68), S(42, 60), S(0, 38), S(58, 41)
+    S(0, 0), S(3, 44), S(37, 68), S(42, 60), S(0, 39), S(58, 43)
   };
 
   // Assorted bonuses and penalties
diff --git a/src/search.cpp b/src/search.cpp
index b319dff5..e6e53e7c 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -944,7 +944,7 @@ namespace {
             }
          ss->ttPv = ttPv;
     }
-    
+
     // Step 11. If the position is not in TT, decrease depth by 2
     if (   PvNode
         && depth >= 6

From a0afe32d16554ff3b5c74f34ae56400f35759edf Mon Sep 17 00:00:00 2001
From: mstembera <MissingEmail@email>
Date: Sun, 30 Aug 2020 18:40:49 -0700
Subject: [PATCH 008/398] Use stable sort to make sure bench with TB yields
 same results everywhere.

std::sort() is not stable so different implementations can produce different results:
use the stable version instead.

Observed for '8/6k1/5r2/8/8/8/1K6/Q7 w - - 0 1' yielding different bench results for gcc and MSVC
and 3-4-5 syzygy TB prior to this patch.

closes https://github.com/official-stockfish/Stockfish/pull/3083

No functional change.
---
 src/search.cpp         | 2 +-
 src/syzygy/tbprobe.cpp | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index e6e53e7c..c676bd6d 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1964,7 +1964,7 @@ void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
     if (RootInTB)
     {
         // Sort moves according to TB rank
-        std::sort(rootMoves.begin(), rootMoves.end(),
+        std::stable_sort(rootMoves.begin(), rootMoves.end(),
                   [](const RootMove &a, const RootMove &b) { return a.tbRank > b.tbRank; } );
 
         // Probe during search only if DTZ is not available and we are winning
diff --git a/src/syzygy/tbprobe.cpp b/src/syzygy/tbprobe.cpp
index 20215b96..3dfe3e3e 100644
--- a/src/syzygy/tbprobe.cpp
+++ b/src/syzygy/tbprobe.cpp
@@ -758,7 +758,7 @@ Ret do_probe_table(const Position& pos, T* entry, WDLScore wdl, ProbeState* resu
     if (entry->hasPawns) {
         idx = LeadPawnIdx[leadPawnsCnt][squares[0]];
 
-        std::sort(squares + 1, squares + leadPawnsCnt, pawns_comp);
+        std::stable_sort(squares + 1, squares + leadPawnsCnt, pawns_comp);
 
         for (int i = 1; i < leadPawnsCnt; ++i)
             idx += Binomial[i][MapPawns[squares[i]]];
@@ -859,7 +859,7 @@ encode_remaining:
 
     while (d->groupLen[++next])
     {
-        std::sort(groupSq, groupSq + d->groupLen[next]);
+        std::stable_sort(groupSq, groupSq + d->groupLen[next]);
         uint64_t n = 0;
 
         // Map down a square if "comes later" than a square in the previous

From a057f170c6920fc4d1abdae619c5259e9d80703c Mon Sep 17 00:00:00 2001
From: mstembera <MissingEmail@email>
Date: Sun, 30 Aug 2020 20:48:10 -0700
Subject: [PATCH 009/398] Use llvm linker with clang on windows for LTO.

other linkers might fail to link during the LTO phase.

The linker might have to be installed using
`pacman -Syu mingw-w64-x86_64-lld`

closes https://github.com/official-stockfish/Stockfish/pull/3084

No functional change.
---
 src/Makefile | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/Makefile b/src/Makefile
index 9ae5a51c..340b3008 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -595,6 +595,11 @@ ifeq ($(debug), no)
 		LDFLAGS += $(CXXFLAGS)
 	else ifeq ($(comp),clang)
 		CXXFLAGS += -flto=thin
+		ifneq ($(findstring MINGW,$(KERNEL)),)
+			CXXFLAGS += -fuse-ld=lld
+		else ifneq ($(findstring MSYS,$(KERNEL)),)
+			CXXFLAGS += -fuse-ld=lld
+		endif
 		LDFLAGS += $(CXXFLAGS)
 
 # GCC and CLANG use different methods for parallelizing LTO and CLANG pretends to be

From 61381372ec896ae6b0f139555e6e3f816d8aa570 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Mon, 31 Aug 2020 22:53:20 +0200
Subject: [PATCH 010/398] Always print an info line before a bestmove

if very few nodes are being searched before a bestmove is reported,
an info line might be missing.

fixes https://github.com/official-stockfish/Stockfish/issues/2757

closes https://github.com/official-stockfish/Stockfish/pull/3088

no functional change
---
 src/search.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index c676bd6d..c15cd753 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1861,12 +1861,15 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) {
   {
       bool updated = rootMoves[i].score != -VALUE_INFINITE;
 
-      if (depth == 1 && !updated)
+      if (depth == 1 && !updated && i > 0)
           continue;
 
-      Depth d = updated ? depth : depth - 1;
+      Depth d = updated ? depth : std::max(1, depth - 1);
       Value v = updated ? rootMoves[i].score : rootMoves[i].previousScore;
 
+      if (v == -VALUE_INFINITE)
+          v = VALUE_ZERO;
+
       bool tb = TB::RootInTB && abs(v) < VALUE_MATE_IN_MAX_PLY;
       v = tb ? rootMoves[i].tbScore : v;
 

From a8bbaa17954471cf3fd8d168f1cafe3f2034730e Mon Sep 17 00:00:00 2001
From: VoyagerOne <excelgeek@gmail.com>
Date: Sun, 30 Aug 2020 13:57:57 -0400
Subject: [PATCH 011/398] LMR Root Node Simplification

Simplify LMR at Root node

STC:
LLR: 2.94 (-2.94,2.94) {-1.25,0.25}
Total: 71520 W: 7649 L: 7614 D: 56257
Ptnml(0-2): 346, 5845, 23349, 5868, 352
https://tests.stockfishchess.org/tests/view/5f4be8c0ba100690c5cc5cbb

LTC:
LLR: 2.95 (-2.94,2.94) {-0.75,0.25}
Total: 74832 W: 3997 L: 3948 D: 66887
Ptnml(0-2): 77, 3422, 30362, 3485, 70
https://tests.stockfishchess.org/tests/view/5f4c603eba100690c5cc5d0e

closes https://github.com/official-stockfish/Stockfish/pull/3091

Bench: 3624569
---
 src/search.cpp |  1 -
 src/thread.cpp | 11 -----------
 src/thread.h   |  1 -
 3 files changed, 13 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index c15cd753..b79fa6be 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1160,7 +1160,6 @@ moves_loop: // When in check, search starts from here
       // re-searched at full depth.
       if (    depth >= 3
           &&  moveCount > 1 + 2 * rootNode + 2 * (PvNode && abs(bestValue) < 2)
-          && (!rootNode || thisThread->best_move_count(move) == 0)
           && (  !captureOrPromotion
               || moveCountPruning
               || ss->staticEval + PieceValue[EG][pos.captured_piece()] <= alpha
diff --git a/src/thread.cpp b/src/thread.cpp
index 1aa66a81..b46fce5e 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -51,17 +51,6 @@ Thread::~Thread() {
 }
 
 
-/// Thread::bestMoveCount(Move move) return best move counter for the given root move
-
-int Thread::best_move_count(Move move) const {
-
-  auto rm = std::find(rootMoves.begin() + pvIdx,
-                      rootMoves.begin() + pvLast, move);
-
-  return rm != rootMoves.begin() + pvLast ? rm->bestMoveCount : 0;
-}
-
-
 /// Thread::clear() reset histories, usually before a new game
 
 void Thread::clear() {
diff --git a/src/thread.h b/src/thread.h
index 042bc2e9..34b99015 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -54,7 +54,6 @@ public:
   void idle_loop();
   void start_searching();
   void wait_for_search_finished();
-  int best_move_count(Move move) const;
 
   Pawns::Table pawnsTable;
   Material::Table materialTable;

From be87517734e2a1b222d1a35e98764382b4176732 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Tue, 1 Sep 2020 12:22:47 +0200
Subject: [PATCH 012/398] Only use MADV_RANDOM if defined

needed to compile on Haiku.

fixes https://github.com/official-stockfish/Stockfish/issues/3093

closes https://github.com/official-stockfish/Stockfish/pull/3094

No functional change
---
 src/syzygy/tbprobe.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/syzygy/tbprobe.cpp b/src/syzygy/tbprobe.cpp
index 3dfe3e3e..4d682f1a 100644
--- a/src/syzygy/tbprobe.cpp
+++ b/src/syzygy/tbprobe.cpp
@@ -223,7 +223,9 @@ public:
 
         *mapping = statbuf.st_size;
         *baseAddress = mmap(nullptr, statbuf.st_size, PROT_READ, MAP_SHARED, fd, 0);
+#if defined(MADV_RANDOM)
         madvise(*baseAddress, statbuf.st_size, MADV_RANDOM);
+#endif
         ::close(fd);
 
         if (*baseAddress == MAP_FAILED)

From c17f2b15fdfdb44fc4ef2ca73c58d1d1097f101e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 2 Sep 2020 15:17:18 +0200
Subject: [PATCH 013/398] General cleanup of learner.cpp.

---
 src/Makefile               |    6 +-
 src/eval/evaluate_common.h |    2 +
 src/learn/convert.cpp      |  515 ++++
 src/learn/gensfen.cpp      | 1181 +++++++++
 src/learn/learn.h          |   42 +
 src/learn/learner.cpp      | 4886 +++++++++++++-----------------------
 src/learn/multi_think.h    |    5 +-
 7 files changed, 3452 insertions(+), 3185 deletions(-)
 create mode 100644 src/learn/convert.cpp
 create mode 100644 src/learn/gensfen.cpp

diff --git a/src/Makefile b/src/Makefile
index cc63ab15..0c6b21e5 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -52,6 +52,8 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	extra/sfen_packer.cpp \
 	learn/gensfen2019.cpp \
 	learn/learner.cpp \
+	learn/gensfen.cpp \
+	learn/convert.cpp \
 	learn/learning_tools.cpp \
 	learn/multi_think.cpp
 
@@ -891,7 +893,7 @@ learn: config-sanity
 	EXTRACXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp ' \
 	EXTRALDFLAGS=' -lopenblas -fopenmp -Wl,-s -static ' \
 	all
-	
+
 profile-learn: config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
@@ -900,7 +902,7 @@ profile-learn: config-sanity objclean profileclean
 	LEARNLDFLAGS=' -lopenblas -fopenmp -Wl,-s -static '
 	@echo ""
 	@echo "Step 2/4. Running benchmark for pgo-build ..."
-	$(PGOGENSFEN) 
+	$(PGOGENSFEN)
 	@echo ""
 	@echo "Step 3/4. Building optimized executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
diff --git a/src/eval/evaluate_common.h b/src/eval/evaluate_common.h
index b043f2e1..dacbd2ba 100644
--- a/src/eval/evaluate_common.h
+++ b/src/eval/evaluate_common.h
@@ -15,6 +15,8 @@
 // KPP file name
 #define KPP_BIN "KPP_synthesized.bin"
 
+#include "../position.h"
+
 namespace Eval
 {
 
diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
new file mode 100644
index 00000000..ebee8a96
--- /dev/null
+++ b/src/learn/convert.cpp
@@ -0,0 +1,515 @@
+#define EVAL_LEARN
+
+#if defined(EVAL_LEARN)
+
+// evaluate header for learning
+#include "../eval/evaluate_common.h"
+
+#include "learn.h"
+#include "multi_think.h"
+#include "../uci.h"
+#include "../syzygy/tbprobe.h"
+#include "../misc.h"
+#include "../thread.h"
+#include "../position.h"
+#include "../tt.h"
+
+#include <sstream>
+#include <fstream>
+#include <unordered_set>
+#include <iomanip>
+#include <list>
+#include <cmath>    // std::exp(),std::pow(),std::log()
+#include <cstring>  // memcpy()
+#include <memory>
+#include <limits>
+#include <optional>
+#include <chrono>
+#include <random>
+#include <regex>
+
+#if defined (_OPENMP)
+#include <omp.h>
+#endif
+
+#if defined(_MSC_VER)
+// The C++ filesystem cannot be used unless it is C++17 or later or MSVC.
+// I tried to use windows.h, but with g++ of msys2 I can not get the files in the folder well.
+// Use dirent.h because there is no help for it.
+#include <filesystem>
+#elif defined(__GNUC__)
+#include <dirent.h>
+#endif
+
+using namespace std;
+
+namespace Learner
+{
+    bool fen_is_ok(Position& pos, std::string input_fen) {
+        std::string pos_fen = pos.fen();
+        std::istringstream ss_input(input_fen);
+        std::istringstream ss_pos(pos_fen);
+
+        // example : "2r4r/4kpp1/nb1np3/p2p3p/B2P1BP1/PP6/4NPKP/2R1R3 w - h6 0 24"
+        //       --> "2r4r/4kpp1/nb1np3/p2p3p/B2P1BP1/PP6/4NPKP/2R1R3"
+        std::string str_input, str_pos;
+        ss_input >> str_input;
+        ss_pos >> str_pos;
+
+        // Only compare "Piece placement field" between input_fen and pos.fen().
+        return str_input == str_pos;
+    }
+
+    void convert_bin(
+        const vector<string>& filenames, 
+        const string& output_file_name, 
+        const int ply_minimum, 
+        const int ply_maximum, 
+        const int interpolate_eval, 
+        const int src_score_min_value,
+        const int src_score_max_value,
+        const int dest_score_min_value,
+        const int dest_score_max_value,
+        const bool check_invalid_fen, 
+        const bool check_illegal_move)
+    {
+        std::cout << "check_invalid_fen=" << check_invalid_fen << std::endl;
+        std::cout << "check_illegal_move=" << check_illegal_move << std::endl;
+
+        std::fstream fs;
+        uint64_t data_size = 0;
+        uint64_t filtered_size = 0;
+        uint64_t filtered_size_fen = 0;
+        uint64_t filtered_size_move = 0;
+        uint64_t filtered_size_ply = 0;
+        auto th = Threads.main();
+        auto& tpos = th->rootPos;
+        // convert plain rag to packed sfenvalue for Yaneura king
+        fs.open(output_file_name, ios::app | ios::binary);
+        StateListPtr states;
+        for (auto filename : filenames) {
+            std::cout << "convert " << filename << " ... ";
+            std::string line;
+            ifstream ifs;
+            ifs.open(filename);
+            PackedSfenValue p;
+            data_size = 0;
+            filtered_size = 0;
+            filtered_size_fen = 0;
+            filtered_size_move = 0;
+            filtered_size_ply = 0;
+            p.gamePly = 1; // Not included in apery format. Should be initialized
+            bool ignore_flag_fen = false;
+            bool ignore_flag_move = false;
+            bool ignore_flag_ply = false;
+            while (std::getline(ifs, line)) {
+                std::stringstream ss(line);
+                std::string token;
+                std::string value;
+                ss >> token;
+                if (token == "fen") {
+                    states = StateListPtr(new std::deque<StateInfo>(1)); // Drop old and create a new one
+                    std::string input_fen = line.substr(4);
+                    tpos.set(input_fen, false, &states->back(), Threads.main());
+                    if (check_invalid_fen && !fen_is_ok(tpos, input_fen)) {
+                        ignore_flag_fen = true;
+                        filtered_size_fen++;
+                    }
+                    else {
+                        tpos.sfen_pack(p.sfen);
+                    }
+                }
+                else if (token == "move") {
+                    ss >> value;
+                    Move move = UCI::to_move(tpos, value);
+                    if (check_illegal_move && move == MOVE_NONE) {
+                        ignore_flag_move = true;
+                        filtered_size_move++;
+                    }
+                    else {
+                        p.move = move;
+                    }
+                }
+                else if (token == "score") {
+                    double score;
+                    ss >> score;
+                    // Training Formula � Issue #71 � nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
+                    // Normalize to [0.0, 1.0].
+                    score = (score - src_score_min_value) / (src_score_max_value - src_score_min_value);
+                    // Scale to [dest_score_min_value, dest_score_max_value].
+                    score = score * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
+                    p.score = Math::clamp((int32_t)std::round(score), -(int32_t)VALUE_MATE, (int32_t)VALUE_MATE);
+                }
+                else if (token == "ply") {
+                    int temp;
+                    ss >> temp;
+                    if (temp < ply_minimum || temp > ply_maximum) {
+                        ignore_flag_ply = true;
+                        filtered_size_ply++;
+                    }
+                    p.gamePly = uint16_t(temp); // No cast here?
+                    if (interpolate_eval != 0) {
+                        p.score = min(3000, interpolate_eval * temp);
+                    }
+                }
+                else if (token == "result") {
+                    int temp;
+                    ss >> temp;
+                    p.game_result = int8_t(temp); // Do you need a cast here?
+                    if (interpolate_eval) {
+                        p.score = p.score * p.game_result;
+                    }
+                }
+                else if (token == "e") {
+                    if (!(ignore_flag_fen || ignore_flag_move || ignore_flag_ply)) {
+                        fs.write((char*)&p, sizeof(PackedSfenValue));
+                        data_size += 1;
+                        // debug
+                        // std::cout<<tpos<<std::endl;
+                        // std::cout<<p.score<<","<<int(p.gamePly)<<","<<int(p.game_result)<<std::endl;
+                    }
+                    else {
+                        filtered_size++;
+                    }
+                    ignore_flag_fen = false;
+                    ignore_flag_move = false;
+                    ignore_flag_ply = false;
+                }
+            }
+            std::cout << "done " << data_size << " parsed " << filtered_size << " is filtered"
+                << " (invalid fen:" << filtered_size_fen << ", illegal move:" << filtered_size_move << ", invalid ply:" << filtered_size_ply << ")" << std::endl;
+            ifs.close();
+        }
+        std::cout << "all done" << std::endl;
+        fs.close();
+    }
+
+    static inline void ltrim(std::string& s) {
+        s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](int ch) {
+            return !std::isspace(ch);
+            }));
+    }
+
+    static inline void rtrim(std::string& s) {
+        s.erase(std::find_if(s.rbegin(), s.rend(), [](int ch) {
+            return !std::isspace(ch);
+            }).base(), s.end());
+    }
+
+    static inline void trim(std::string& s) {
+        ltrim(s);
+        rtrim(s);
+    }
+
+    int parse_game_result_from_pgn_extract(std::string result) {
+        // White Win
+        if (result == "\"1-0\"") {
+            return 1;
+        }
+        // Black Win
+        else if (result == "\"0-1\"") {
+            return -1;
+        }
+        // Draw
+        else {
+            return 0;
+        }
+    }
+
+    // 0.25 -->  0.25 * PawnValueEg
+    // #-4  --> -mate_in(4)
+    // #3   -->  mate_in(3)
+    // -M4  --> -mate_in(4)
+    // +M3  -->  mate_in(3)
+    Value parse_score_from_pgn_extract(std::string eval, bool& success) {
+        success = true;
+
+        if (eval.substr(0, 1) == "#") {
+            if (eval.substr(1, 1) == "-") {
+                return -mate_in(stoi(eval.substr(2, eval.length() - 2)));
+            }
+            else {
+                return mate_in(stoi(eval.substr(1, eval.length() - 1)));
+            }
+        }
+        else if (eval.substr(0, 2) == "-M") {
+            //std::cout << "eval=" << eval << std::endl;
+            return -mate_in(stoi(eval.substr(2, eval.length() - 2)));
+        }
+        else if (eval.substr(0, 2) == "+M") {
+            //std::cout << "eval=" << eval << std::endl;
+            return mate_in(stoi(eval.substr(2, eval.length() - 2)));
+        }
+        else {
+            char* endptr;
+            double value = strtod(eval.c_str(), &endptr);
+
+            if (*endptr != '\0') {
+                success = false;
+                return VALUE_ZERO;
+            }
+            else {
+                return Value(value * static_cast<double>(PawnValueEg));
+            }
+        }
+    }
+
+    // for Debug
+    //#define DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT
+
+    bool is_like_fen(std::string fen) {
+        int count_space = std::count(fen.cbegin(), fen.cend(), ' ');
+        int count_slash = std::count(fen.cbegin(), fen.cend(), '/');
+
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+        //std::cout << "count_space=" << count_space << std::endl;
+        //std::cout << "count_slash=" << count_slash << std::endl;
+#endif
+
+        return count_space == 5 && count_slash == 7;
+    }
+
+    void convert_bin_from_pgn_extract(
+        const vector<string>& filenames, 
+        const string& output_file_name, 
+        const bool pgn_eval_side_to_move, 
+        const bool convert_no_eval_fens_as_score_zero)
+    {
+        std::cout << "pgn_eval_side_to_move=" << pgn_eval_side_to_move << std::endl;
+        std::cout << "convert_no_eval_fens_as_score_zero=" << convert_no_eval_fens_as_score_zero << std::endl;
+
+        auto th = Threads.main();
+        auto& pos = th->rootPos;
+
+        std::fstream ofs;
+        ofs.open(output_file_name, ios::out | ios::binary);
+
+        int game_count = 0;
+        int fen_count = 0;
+
+        for (auto filename : filenames) {
+            std::cout << now_string() << " convert " << filename << std::endl;
+            ifstream ifs;
+            ifs.open(filename);
+
+            int game_result = 0;
+
+            std::string line;
+            while (std::getline(ifs, line)) {
+
+                if (line.empty()) {
+                    continue;
+                }
+
+                else if (line.substr(0, 1) == "[") {
+                    std::regex pattern_result(R"(\[Result (.+?)\])");
+                    std::smatch match;
+
+                    // example: [Result "1-0"]
+                    if (std::regex_search(line, match, pattern_result)) {
+                        game_result = parse_game_result_from_pgn_extract(match.str(1));
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+                        std::cout << "game_result=" << game_result << std::endl;
+#endif
+                        game_count++;
+                        if (game_count % 10000 == 0) {
+                            std::cout << now_string() << " game_count=" << game_count << ", fen_count=" << fen_count << std::endl;
+                        }
+                    }
+
+                    continue;
+                }
+
+                else {
+                    int gamePly = 1;
+                    auto itr = line.cbegin();
+
+                    while (true) {
+                        gamePly++;
+
+                        PackedSfenValue psv;
+                        memset((char*)&psv, 0, sizeof(PackedSfenValue));
+
+                        // fen
+                        {
+                            bool fen_found = false;
+
+                            while (!fen_found) {
+                                std::regex pattern_bracket(R"(\{(.+?)\})");
+                                std::smatch match;
+                                if (!std::regex_search(itr, line.cend(), match, pattern_bracket)) {
+                                    break;
+                                }
+
+                                itr += match.position(0) + match.length(0) - 1;
+                                std::string str_fen = match.str(1);
+                                trim(str_fen);
+
+                                if (is_like_fen(str_fen)) {
+                                    fen_found = true;
+
+                                    StateInfo si;
+                                    pos.set(str_fen, false, &si, th);
+                                    pos.sfen_pack(psv.sfen);
+                                }
+
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+                                std::cout << "str_fen=" << str_fen << std::endl;
+                                std::cout << "fen_found=" << fen_found << std::endl;
+#endif
+                            }
+
+                            if (!fen_found) {
+                                break;
+                            }
+                        }
+
+                        // move
+                        {
+                            std::regex pattern_move(R"(\}(.+?)\{)");
+                            std::smatch match;
+                            if (!std::regex_search(itr, line.cend(), match, pattern_move)) {
+                                break;
+                            }
+
+                            itr += match.position(0) + match.length(0) - 1;
+                            std::string str_move = match.str(1);
+                            trim(str_move);
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+                            std::cout << "str_move=" << str_move << std::endl;
+#endif
+                            psv.move = UCI::to_move(pos, str_move);
+                        }
+
+                        // eval
+                        bool eval_found = false;
+                        {
+                            std::regex pattern_bracket(R"(\{(.+?)\})");
+                            std::smatch match;
+                            if (!std::regex_search(itr, line.cend(), match, pattern_bracket)) {
+                                break;
+                            }
+
+                            std::string str_eval_clk = match.str(1);
+                            trim(str_eval_clk);
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+                            std::cout << "str_eval_clk=" << str_eval_clk << std::endl;
+#endif
+
+                            // example: { [%eval 0.25] [%clk 0:10:00] }
+                            // example: { [%eval #-4] [%clk 0:10:00] }
+                            // example: { [%eval #3] [%clk 0:10:00] }
+                            // example: { +0.71/22 1.2s }
+                            // example: { -M4/7 0.003s }
+                            // example: { M3/245 0.017s }
+                            // example: { +M1/245 0.010s, White mates }
+                            // example: { 0.60 }
+                            // example: { book }
+                            // example: { rnbqkb1r/pp3ppp/2p1pn2/3p4/2PP4/2N2N2/PP2PPPP/R1BQKB1R w KQkq - 0 5 }
+
+                            // Considering the absence of eval
+                            if (!is_like_fen(str_eval_clk)) {
+                                itr += match.position(0) + match.length(0) - 1;
+
+                                if (str_eval_clk != "book") {
+                                    std::regex pattern_eval1(R"(\[\%eval (.+?)\])");
+                                    std::regex pattern_eval2(R"((.+?)\/)");
+
+                                    std::string str_eval;
+                                    if (std::regex_search(str_eval_clk, match, pattern_eval1) ||
+                                        std::regex_search(str_eval_clk, match, pattern_eval2)) {
+                                        str_eval = match.str(1);
+                                        trim(str_eval);
+                                    }
+                                    else {
+                                        str_eval = str_eval_clk;
+                                    }
+
+                                    bool success = false;
+                                    Value value = parse_score_from_pgn_extract(str_eval, success);
+                                    if (success) {
+                                        eval_found = true;
+                                        psv.score = Math::clamp(value, -VALUE_MATE, VALUE_MATE);
+                                    }
+
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+                                    std::cout << "str_eval=" << str_eval << std::endl;
+                                    std::cout << "success=" << success << ", psv.score=" << psv.score << std::endl;
+#endif
+                                }
+                            }
+                        }
+
+                        // write
+                        if (eval_found || convert_no_eval_fens_as_score_zero) {
+                            if (!eval_found && convert_no_eval_fens_as_score_zero) {
+                                psv.score = 0;
+                            }
+
+                            psv.gamePly = gamePly;
+                            psv.game_result = game_result;
+
+                            if (pos.side_to_move() == BLACK) {
+                                if (!pgn_eval_side_to_move) {
+                                    psv.score *= -1;
+                                }
+                                psv.game_result *= -1;
+                            }
+
+                            ofs.write((char*)&psv, sizeof(PackedSfenValue));
+
+                            fen_count++;
+                        }
+                    }
+
+                    game_result = 0;
+                }
+            }
+        }
+
+        std::cout << now_string() << " game_count=" << game_count << ", fen_count=" << fen_count << std::endl;
+        std::cout << now_string() << " all done" << std::endl;
+        ofs.close();
+    }
+
+    void convert_plain(
+        const vector<string>& filenames, 
+        const string& output_file_name)
+    {
+        Position tpos;
+        std::ofstream ofs;
+        ofs.open(output_file_name, ios::app);
+        auto th = Threads.main();
+        for (auto filename : filenames) {
+            std::cout << "convert " << filename << " ... ";
+
+            // Just convert packedsfenvalue to text
+            std::fstream fs;
+            fs.open(filename, ios::in | ios::binary);
+            PackedSfenValue p;
+            while (true)
+            {
+                if (fs.read((char*)&p, sizeof(PackedSfenValue))) {
+                    StateInfo si;
+                    tpos.set_from_packed_sfen(p.sfen, &si, th, false);
+
+                    // write as plain text
+                    ofs << "fen " << tpos.fen() << std::endl;
+                    ofs << "move " << UCI::move(Move(p.move), false) << std::endl;
+                    ofs << "score " << p.score << std::endl;
+                    ofs << "ply " << int(p.gamePly) << std::endl;
+                    ofs << "result " << int(p.game_result) << std::endl;
+                    ofs << "e" << std::endl;
+                }
+                else {
+                    break;
+                }
+            }
+            fs.close();
+            std::cout << "done" << std::endl;
+        }
+        ofs.close();
+        std::cout << "all done" << std::endl;
+    }
+}
+#endif
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
new file mode 100644
index 00000000..38bed2d5
--- /dev/null
+++ b/src/learn/gensfen.cpp
@@ -0,0 +1,1181 @@
+﻿#define EVAL_LEARN
+
+#if defined(EVAL_LEARN)
+
+#include "../eval/evaluate_common.h"
+
+#include "learn.h"
+#include "multi_think.h"
+#include "../misc.h"
+#include "../thread.h"
+#include "../position.h"
+#include "../tt.h"
+#include "../uci.h"
+#include "../syzygy/tbprobe.h"
+
+#if defined(USE_BOOK)
+#include "../extra/book/book.h"
+#endif
+
+#include <chrono>
+#include <random>
+#include <regex>
+#include <sstream>
+#include <fstream>
+#include <unordered_set>
+#include <iomanip>
+#include <list>
+#include <cmath>
+#include <cstring>
+#include <memory>
+#include <limits>
+#include <optional>
+
+#if defined (_OPENMP)
+#include <omp.h>
+#endif
+
+#if defined(_MSC_VER)
+// std::filesystem doesn't work on GCC even though it claims to support C++17.
+#include <filesystem>
+#elif defined(__GNUC__)
+#include <dirent.h>
+#endif
+
+#if defined(EVAL_NNUE)
+#include "../nnue/evaluate_nnue_learner.h"
+#include <climits>
+#include <shared_mutex>
+#endif
+
+using namespace std; 
+
+namespace Learner
+{
+    static bool write_out_draw_game_in_training_data_generation = false;
+    static bool detect_draw_by_consecutive_low_score = false;
+    static bool detect_draw_by_insufficient_mating_material = false;
+
+    // Use raw NNUE eval value in the Eval::evaluate(). 
+    // If hybrid eval is enabled, training data
+    // generation and training don't work well.
+    // https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
+    static bool use_raw_nnue_eval = true;
+
+    // Helper class for exporting Sfen
+    struct SfenWriter
+    {
+        // Amount of sfens required to flush the buffer.
+        static constexpr size_t SFEN_WRITE_SIZE = 5000;
+
+        // Current status is output after 
+        // each (SFEN_WRITE_SIZE * STATUS_OUTPUT_PERIOD) sfens
+        static constexpr uint64_t STATUS_OUTPUT_PERIOD = 40;
+
+        // File name to write and number of threads to create
+        SfenWriter(string filename_, int thread_num)
+        {
+            sfen_buffers_pool.reserve((size_t)thread_num * 10);
+            sfen_buffers.resize(thread_num);
+
+            output_file_stream.open(filename_, ios::out | ios::binary | ios::app);
+            filename = filename_;
+
+            finished = false;
+        }
+
+        ~SfenWriter()
+        {
+            finished = true;
+            file_worker_thread.join();
+            output_file_stream.close();
+
+#if !defined(DNDEBUG)
+            {
+                // All buffers should be empty since file_worker_thread
+                // should have written everything before exiting.
+                for (const auto& p : sfen_buffers) { assert(p == nullptr); }
+                assert(sfen_buffers_pool.empty());
+            }
+#endif
+        }
+
+        void write(size_t thread_id, const PackedSfenValue& psv)
+        {
+            // We have a buffer for each thread and add it there.
+            // If the buffer overflows, write it to a file.
+
+            // This buffer is prepared for each thread.
+            auto& buf = sfen_buffers[thread_id];
+
+            // Secure since there is no buf at the first time 
+            // and immediately after writing the thread buffer.
+            if (!buf)
+            {
+                buf = std::make_unique<PSVector>();
+                buf->reserve(SFEN_WRITE_SIZE);
+            }
+
+            // Buffer is exclusive to this thread.
+            // There is no need for a critical section.
+            buf->push_back(psv);
+
+            if (buf->size() >= SFEN_WRITE_SIZE)
+            {
+                // If you load it in sfen_buffers_pool, the worker will do the rest.
+
+                // Critical section since sfen_buffers_pool is shared among threads.
+                std::unique_lock<std::mutex> lk(mutex);
+                sfen_buffers_pool.emplace_back(std::move(buf));
+            }
+        }
+
+        // Move what remains in the buffer for your thread to a buffer for writing to a file.
+        void finalize(size_t thread_id)
+        {
+            std::unique_lock<std::mutex> lk(mutex);
+
+            auto& buf = sfen_buffers[thread_id];
+
+            // There is a case that buf==nullptr, so that check is necessary.
+            if (buf && buf->size() != 0)
+            {
+                sfen_buffers_pool.emplace_back(std::move(buf));
+            }
+        }
+
+        // Start the write_worker thread.
+        void start_file_write_worker()
+        {
+            file_worker_thread = std::thread([&] { this->file_write_worker(); });
+        }
+
+        // Dedicated thread to write to file
+        void file_write_worker()
+        {
+            auto output_status = [&]()
+            {
+                // Also output the current time to console.
+                sync_cout << endl << sfen_write_count << " sfens , at " << now_string() << sync_endl;
+
+                // This is enough for flush().
+                output_file_stream.flush();
+            };
+
+            while (!finished || sfen_buffers_pool.size())
+            {
+                vector<std::unique_ptr<PSVector>> buffers;
+                {
+                    std::unique_lock<std::mutex> lk(mutex);
+
+                    // Atomically swap take the filled buffers and
+                    // create a new buffer pool for threads to fill.
+                    buffers = std::move(sfen_buffers_pool);
+                    sfen_buffers_pool = std::vector<std::unique_ptr<PSVector>>();
+                }
+
+                if (!buffers.size())
+                {
+                    // Poor man's condition variable.
+                    sleep(100);
+                }
+                else
+                {
+                    for (auto& buf : buffers)
+                    {
+                        output_file_stream.write(reinterpret_cast<const char*>(buf->data()), sizeof(PackedSfenValue) * buf->size());
+
+                        sfen_write_count += buf->size();
+#if 1
+                        // Add the processed number here, and if it exceeds save_every, 
+                        // change the file name and reset this counter.
+                        sfen_write_count_current_file += buf->size();
+                        if (sfen_write_count_current_file >= save_every)
+                        {
+                            sfen_write_count_current_file = 0;
+
+                            output_file_stream.close();
+
+                            // Sequential number attached to the file
+                            int n = (int)(sfen_write_count / save_every);
+
+                            // Rename the file and open it again. 
+                            // Add ios::app in consideration of overwriting. 
+                            // (Depending on the operation, it may not be necessary.)
+                            string new_filename = filename + "_" + std::to_string(n);
+                            output_file_stream.open(new_filename, ios::out | ios::binary | ios::app);
+                            cout << endl << "output sfen file = " << new_filename << endl;
+                        }
+#endif
+                        // Output '.' every time when writing a game record.
+                        std::cout << ".";
+
+                        // Output the number of phases processed 
+                        // every STATUS_OUTPUT_PERIOD times
+                        // Finally, the remainder of the teacher phase 
+                        // of each thread is written out, 
+                        // so halfway numbers are displayed, but is it okay?
+                        // If you overuse the threads to the maximum number 
+                        // of logical cores, the console will be clogged, 
+                        // so it may be beneficial to increase that value.
+                        if ((++batch_counter % STATUS_OUTPUT_PERIOD) == 0)
+                        {
+                            output_status();
+                        }
+                    }
+                }
+            }
+
+            // Output the status again after whole processing is done.
+            output_status();
+        }
+
+        void set_save_interval(uint64_t v)
+        {
+            save_every = v;
+        }
+
+    private:
+
+        fstream output_file_stream;
+
+        // A new net is saved after every save_every sfens are processed.
+        uint64_t save_every = std::numeric_limits<uint64_t>::max();
+
+        // File name passed in the constructor
+        std::string filename;
+
+        // Thread to write to the file
+        std::thread file_worker_thread;
+
+        // Flag that all threads have finished
+        atomic<bool> finished;
+
+        // Counter for time stamp output
+        uint64_t batch_counter = 0;
+
+        // buffer before writing to file
+        // sfen_buffers is the buffer for each thread
+        // sfen_buffers_pool is a buffer for writing.
+        // After loading the phase in the former buffer by SFEN_WRITE_SIZE, 
+        // transfer it to the latter.
+        std::vector<std::unique_ptr<PSVector>> sfen_buffers;
+        std::vector<std::unique_ptr<PSVector>> sfen_buffers_pool;
+
+        // Mutex required to access sfen_buffers_pool
+        std::mutex mutex;
+
+        // Number of sfens written in total, and the 
+        // number of sfens written in the current file.
+        uint64_t sfen_write_count = 0;
+        uint64_t sfen_write_count_current_file = 0;
+    };
+
+    // -----------------------------------
+    // worker that creates the game record (for each thread)
+    // -----------------------------------
+
+    // Class to generate sfen with multiple threads
+    struct MultiThinkGenSfen : public MultiThink
+    {
+        // Hash to limit the export of identical sfens
+        static constexpr uint64_t GENSFEN_HASH_SIZE = 64 * 1024 * 1024;
+        // It must be 2**N because it will be used as the mask to calculate hash_index.
+        static_assert((GENSFEN_HASH_SIZE& (GENSFEN_HASH_SIZE - 1)) == 0);
+
+        MultiThinkGenSfen(int search_depth_min_, int search_depth_max_, SfenWriter& sw_) : 
+            search_depth_min(search_depth_min_), 
+            search_depth_max(search_depth_max_), 
+            sfen_writer(sw_)
+        {
+            hash.resize(GENSFEN_HASH_SIZE);
+
+            // Output seed to veryfy by the user if it's not identical by chance.
+            std::cout << prng << std::endl;
+        }
+
+        void start_file_write_worker()
+        {
+            sfen_writer.start_file_write_worker();
+        }
+
+        void thread_worker(size_t thread_id) override;
+
+        optional<int8_t> get_current_game_result(
+            Position& pos,
+            const vector<int>& move_hist_scores) const;
+
+        vector<uint8_t> generate_random_move_flags();
+
+        bool commit_psv(PSVector& a_psv, size_t thread_id, int8_t lastTurnIsWin);
+
+        optional<Move> choose_random_move(
+            Position& pos,
+            std::vector<uint8_t>& random_move_flag,
+            int ply,
+            int& random_move_c);
+
+        Value evaluate_leaf(
+            Position& pos,
+            std::vector<StateInfo, AlignedAllocator<StateInfo>>& states,
+            int ply,
+            vector<Move>& pv);
+
+        // Min and max depths for search during gensfen
+        int search_depth_min;
+        int search_depth_max;
+
+        // Number of the nodes to be searched.
+        // 0 represents no limits.
+        uint64_t nodes;
+
+        // Upper limit of evaluation value of generated situation
+        int eval_limit;
+
+        // minimum ply with random move
+        // maximum ply with random move
+        // Number of random moves in one station
+        int random_move_minply;
+        int random_move_maxply;
+        int random_move_count;
+
+        // Move kings with a probability of 1/N when randomly moving like Apery software.
+        // When you move the king again, there is a 1/N chance that it will randomly moved
+        // once in the opponent's turn.
+        // Apery has N=2. Specifying 0 here disables this function.
+        int random_move_like_apery;
+
+        // For when using multi pv instead of random move.
+        // random_multi_pv is the number of candidates for MultiPV.
+        // When adopting the move of the candidate move, the difference 
+        // between the evaluation value of the move of the 1st place 
+        // and the evaluation value of the move of the Nth place is.
+        // Must be in the range random_multi_pv_diff.
+        // random_multi_pv_depth is the search depth for MultiPV.
+        int random_multi_pv;
+        int random_multi_pv_diff;
+        int random_multi_pv_depth;
+
+        // The minimum and maximum ply (number of steps from 
+        // the initial phase) of the sfens to write out.
+        int write_minply;
+        int write_maxply;
+
+        // sfen exporter
+        SfenWriter& sfen_writer;
+
+        vector<Key> hash; // 64MB*sizeof(HASH_KEY) = 512MB
+    };
+
+    optional<int8_t> MultiThinkGenSfen::get_current_game_result(
+        Position& pos,
+        const vector<int>& move_hist_scores) const
+    {
+        // Variables for draw adjudication.
+        // Todo: Make this as an option.
+
+        // start the adjudication when ply reaches this value
+        constexpr int adj_draw_ply = 80;
+
+        // 4 move scores for each side have to be checked
+        constexpr int adj_draw_cnt = 8;
+
+        // move score in CP
+        constexpr int adj_draw_score = 0;
+
+        // For the time being, it will be treated as a 
+        // draw at the maximum number of steps to write.
+        const int ply = move_hist_scores.size();
+
+        // has it reached the max length or is a draw
+        if (ply >= write_maxply || pos.is_draw(ply))
+        {
+            return 0;
+        }
+
+        // Initialize the Syzygy Ending Tablebase and sort the moves.
+        Search::RootMoves rootMoves;
+        for (const auto& m : MoveList<LEGAL>(pos))
+        {
+            rootMoves.emplace_back(m);
+        }
+
+        if (!rootMoves.empty())
+        {
+            Tablebases::rank_root_moves(pos, rootMoves);
+        }
+        else 
+        {
+            // If there is no legal move
+            return pos.checkers() 
+                ? -1 /* mate */ 
+                : 0 /* stalemate */;
+        }
+
+        // Adjudicate game to a draw if the last 4 scores of each engine is 0.
+        if (detect_draw_by_consecutive_low_score) 
+        {
+            if (ply >= adj_draw_ply) 
+            {
+                int num_cons_plies_within_draw_score = 0;
+                bool is_adj_draw = false;
+
+                for (auto it = move_hist_scores.rbegin();
+                    it != move_hist_scores.rend(); ++it)
+                {
+                    if (abs(*it) <= adj_draw_score)
+                    {
+                        num_cons_plies_within_draw_score++;
+                    }
+                    else
+                    {
+                        // Draw scores must happen on consecutive plies
+                        break;
+                    }
+
+                    if (num_cons_plies_within_draw_score >= adj_draw_cnt) 
+                    {
+                        is_adj_draw = true;
+                        break;
+                    }
+                }
+
+                if (is_adj_draw) 
+                {
+                    return 0;
+                }
+            }
+        }
+
+        // Draw by insufficient mating material
+        if (detect_draw_by_insufficient_mating_material) 
+        {
+            if (pos.count<ALL_PIECES>() <= 4) 
+            {
+                int num_pieces = pos.count<ALL_PIECES>();
+
+                // (1) KvK
+                if (num_pieces == 2) 
+                {
+                    return 0;
+                }
+
+                // (2) KvK + 1 minor piece
+                if (num_pieces == 3) 
+                {
+                    int minor_pc = pos.count<BISHOP>(WHITE) + pos.count<KNIGHT>(WHITE) +
+                        pos.count<BISHOP>(BLACK) + pos.count<KNIGHT>(BLACK);
+                    if (minor_pc == 1) 
+                    {
+                        return 0;
+                    }
+                }
+
+                // (3) KBvKB, bishops of the same color
+                else if (num_pieces == 4) 
+                {
+                    if (pos.count<BISHOP>(WHITE) == 1 && pos.count<BISHOP>(BLACK) == 1) 
+                    {
+                        // Color of bishops is black.
+                        if ((pos.pieces(WHITE, BISHOP) & DarkSquares)
+                            && (pos.pieces(BLACK, BISHOP) & DarkSquares))
+                        {
+                            return 0;
+                        }
+                        // Color of bishops is white.
+                        if ((pos.pieces(WHITE, BISHOP) & ~DarkSquares)
+                            && (pos.pieces(BLACK, BISHOP) & ~DarkSquares))
+                        {
+                            return 0;
+                        }
+                    }
+                }
+            }
+        }
+
+        return nullopt;
+    }
+
+    // Write out the phases loaded in sfens to a file.
+    // lastTurnIsWin: win/loss in the next phase after the final phase in sfens
+    // 1 when winning. -1 when losing. Pass 0 for a draw.
+    // Return value: true if the specified number of 
+    // sfens has already been reached and the process ends.
+    bool MultiThinkGenSfen::commit_psv(PSVector& sfens, size_t thread_id, int8_t lastTurnIsWin)
+    {
+        int8_t is_win = lastTurnIsWin;
+
+        // From the final stage (one step before) to the first stage, give information on the outcome of the game for each stage.
+        // The phases stored in sfens are assumed to be continuous (in order).
+        for (auto it = sfens.rbegin(); it != sfens.rend(); ++it)
+        {
+            // If is_win == 0 (draw), multiply by -1 and it will remain 0 (draw)
+            is_win = -is_win;
+            it->game_result = is_win;
+
+            // See how many sfens were already written and get the next id.
+            // Exit if requested number of sfens reached.
+            auto now_loop_count = get_next_loop_count();
+            if (now_loop_count == LOOP_COUNT_FINISHED)
+            {
+                return true;
+            }
+
+            // Write out one sfen.
+            sfen_writer.write(thread_id, *it);
+
+#if 0
+            pos.set_from_packed_sfen(it->sfen);
+            cout << pos << "Win : " << it->is_win << " , " << it->score << endl;
+#endif
+        }
+
+        return false;
+    }
+
+    optional<Move> MultiThinkGenSfen::choose_random_move(
+        Position& pos,
+        std::vector<uint8_t>& random_move_flag,
+        int ply,
+        int& random_move_c)
+    {
+        optional<Move> random_move;
+
+        // Randomly choose one from legal move
+        if (
+            // 1. Random move of random_move_count times from random_move_minply to random_move_maxply
+            (random_move_minply != -1 && ply < (int)random_move_flag.size() && random_move_flag[ply]) ||
+            // 2. A mode to perform random move of random_move_count times after leaving the startpos
+            (random_move_minply == -1 && random_move_c < random_move_count))
+        {
+            ++random_move_c;
+
+            // It's not a mate, so there should be one legal move...
+            if (random_multi_pv == 0)
+            {
+                // Normal random move
+                MoveList<LEGAL> list(pos);
+
+                // I don't really know the goodness and badness of making this the Apery method.
+                if (random_move_like_apery == 0
+                    || prng.rand(random_move_like_apery) != 0)
+                {
+                    // Normally one move from legal move
+                    random_move = list.at((size_t)prng.rand((uint64_t)list.size()));
+                }
+                else 
+                {
+                    // if you can move the king, move the king
+                    Move moves[8]; // Near 8
+                    Move* p = &moves[0];
+                    for (auto& m : list)
+                    {
+                        if (type_of(pos.moved_piece(m)) == KING)
+                        {
+                            *(p++) = m;
+                        }
+                    }
+
+                    size_t n = p - &moves[0];
+                    if (n != 0)
+                    {
+                        // move to move the king
+                        random_move = moves[prng.rand(n)];
+
+                        // In Apery method, at this time there is a 1/2 chance 
+                        // that the opponent will also move randomly
+                        if (prng.rand(2) == 0)
+                        {
+                            // Is it a simple hack to add a "1" next to random_move_flag[ply]?
+                            random_move_flag.insert(random_move_flag.begin() + ply + 1, 1, true);
+                        }
+                    }
+                    else
+                    {
+                        // Normally one move from legal move
+                        random_move = list.at((size_t)prng.rand((uint64_t)list.size()));
+                    }
+                }
+            }
+            else 
+            {
+                Learner::search(pos, random_multi_pv_depth, random_multi_pv);
+
+                // Select one from the top N hands of root Moves
+                auto& rm = pos.this_thread()->rootMoves;
+
+                uint64_t s = min((uint64_t)rm.size(), (uint64_t)random_multi_pv);
+                for (uint64_t i = 1; i < s; ++i)
+                {
+                    // The difference from the evaluation value of rm[0] must 
+                    // be within the range of random_multi_pv_diff.
+                    // It can be assumed that rm[x].score is arranged in descending order.
+                    if (rm[0].score > rm[i].score + random_multi_pv_diff)
+                    {
+                        s = i;
+                        break;
+                    }
+                }
+
+                random_move = rm[prng.rand(s)].pv[0];
+            }
+        }
+
+        return random_move;
+    }
+
+    vector<uint8_t> MultiThinkGenSfen::generate_random_move_flags()
+    {
+        vector<uint8_t> random_move_flag;
+
+        // Depending on random move selection parameters setup
+        // the array of flags that indicates whether a random move
+        // be taken at a given ply.
+
+        // Make an array like a[0] = 0 ,a[1] = 1, ...
+        // Fisher-Yates shuffle and take out the first N items.
+        // Actually, I only want N pieces, so I only need 
+        // to shuffle the first N pieces with Fisher-Yates.
+
+        vector<int> a;
+        a.reserve((size_t)random_move_maxply);
+
+        // random_move_minply ,random_move_maxply is specified by 1 origin,
+        // Note that we are handling 0 origin here.
+        for (int i = std::max(random_move_minply - 1, 0); i < random_move_maxply; ++i)
+        {
+            a.push_back(i);
+        }
+
+        // In case of Apery random move, insert() may be called random_move_count times.
+        // Reserve only the size considering it.
+        random_move_flag.resize((size_t)random_move_maxply + random_move_count);
+
+        // A random move that exceeds the size() of a[] cannot be applied, so limit it.
+        for (int i = 0; i < std::min(random_move_count, (int)a.size()); ++i)
+        {
+            swap(a[i], a[prng.rand((uint64_t)a.size() - i) + i]);
+            random_move_flag[a[i]] = true;
+        }
+
+        return random_move_flag;
+    }
+
+    Value MultiThinkGenSfen::evaluate_leaf(
+        Position& pos, 
+        std::vector<StateInfo, AlignedAllocator<StateInfo>>& states,
+        int ply,
+        vector<Move>& pv)
+    {
+        auto rootColor = pos.side_to_move();
+
+        for (auto m : pv)
+        {
+#if 1
+            // There should be no illegal move. This is as a debugging precaution.
+            if (!pos.pseudo_legal(m) || !pos.legal(m))
+            {
+                cout << "Error! : " << pos.fen() << m << endl;
+            }
+#endif
+            pos.do_move(m, states[ply++]);
+
+            // Because the difference calculation of evaluate() cannot be 
+            // performed unless each node evaluate() is called!
+            // If the depth is 8 or more, it seems 
+            // faster not to calculate this difference.
+#if defined(EVAL_NNUE)
+            if (depth < 8)
+            {
+                Eval::NNUE::update_eval(pos);
+            }
+#endif  // defined(EVAL_NNUE)
+        }
+
+        // Reach leaf
+        Value v;
+        if (pos.checkers()) {
+            // Sometime a king is checked.  An example is a case that a checkmate is
+            // found in the search.  If Eval::evaluate() is called whne a king is
+            // checked, classic eval crashes by an assertion. To avoid crashes, return
+            // VALUE_NONE and let the caller assign a value to the position.
+            return VALUE_NONE;
+        }
+        else 
+        {
+            v = Eval::evaluate(pos);
+
+            // evaluate() returns the evaluation value on the turn side, so
+            // If it's a turn different from root_color, you must invert v and return it.
+            if (rootColor != pos.side_to_move())
+            {
+                v = -v;
+            }
+        }
+
+        // Rewind the pv moves.
+        for (auto it = pv.rbegin(); it != pv.rend(); ++it)
+        {
+            pos.undo_move(*it);
+        }
+
+        return v;
+    }
+
+    // thread_id = 0..Threads.size()-1
+    void MultiThinkGenSfen::thread_worker(size_t thread_id)
+    {
+        // For the time being, it will be treated as a draw 
+        // at the maximum number of steps to write.
+        // Maximum StateInfo + Search PV to advance to leaf buffer
+        std::vector<StateInfo, AlignedAllocator<StateInfo>> states(
+            write_maxply + MAX_PLY /* == search_depth_min + α */);
+
+        StateInfo si;
+
+        // end flag
+        bool quit = false;
+
+        // repeat until the specified number of times
+        while (!quit)
+        {
+            // It is necessary to set a dependent thread for Position.
+            // When parallelizing, Threads (since this is a vector<Thread*>,
+            // Do the same for up to Threads[0]...Threads[thread_num-1].
+            auto th = Threads[thread_id];
+
+            auto& pos = th->rootPos;
+            pos.set(StartFEN, false, &si, th);
+
+#if defined(USE_BOOK)
+            // Refer to the members of BookMoveSelector defined in the search section.
+            auto& book = ::book;
+#endif
+
+            // Vector for holding the sfens in the current simulated game.
+            PSVector a_psv;
+            a_psv.reserve(write_maxply + MAX_PLY);
+
+            // Precomputed flags. Used internally by choose_random_move.
+            vector<uint8_t> random_move_flag = generate_random_move_flags();
+
+            // A counter that keeps track of the number of random moves
+            // When random_move_minply == -1, random moves are 
+            // performed continuously, so use it at this time.
+            // Used internally by choose_random_move.
+            int actual_random_move_count = 0;
+
+            // Save history of move scores for adjudication
+            vector<int> move_hist_scores;
+
+            auto flush_psv = [&](int8_t result) {
+                quit = commit_psv(a_psv, thread_id, result);
+            };
+
+            for (int ply = 0; ; ++ply)
+            {
+                Move next_move = MOVE_NONE;
+
+                // Current search depth
+                const int depth = search_depth_min + (int)prng.rand(search_depth_max - search_depth_min + 1);
+
+                const auto result = get_current_game_result(pos, move_hist_scores);
+                if (result.has_value())
+                {
+                    flush_psv(result.value());
+                    break;
+                }
+#if defined(USE_BOOK)
+                if ((next_move = book.probe(pos)) != MOVE_NONE)
+                {
+                    // Hit the constant track.
+                    // The move was stored in next_move.
+
+                    // Do not use the fixed phase for learning.
+                    sfens.clear();
+
+                    if (random_move_minply != -1)
+                    {
+                        // Random move is performed with a certain 
+                        // probability even in the constant phase.
+                        goto RANDOM_MOVE;
+                    }
+                    else
+                    {
+                        // When -1 is specified as random_move_minply, 
+                        // it points according to the standard until 
+                        // it goes out of the standard.
+                        // Prepare an innumerable number of situations 
+                        // that have left the constant as 
+                        // ConsiderationBookMoveCount true using a huge constant
+                        // Used for purposes such as performing 
+                        // a random move 5 times from there.
+                        goto DO_MOVE;
+                    }
+                }
+#endif
+                {
+                    auto [search_value, search_pv] = search(pos, depth, 1, nodes);
+
+                    // Always adjudivate by eval limit.
+                    // Also because of this we don't have to check for TB/MATE scores
+                    if (abs(search_value) >= eval_limit)
+                    {
+                        const auto wdl = (search_value >= eval_limit) ? 1 : -1;
+                        flush_psv(wdl);
+                        break;
+                    }
+
+                    // Verification of a strange move
+                    if (search_pv.size() > 0
+                        && (search_pv[0] == MOVE_NONE || search_pv[0] == MOVE_NULL))
+                    {
+                        // (???)
+                        // MOVE_WIN is checking if it is the declaration victory stage before this
+                        // The declarative winning move should never come back here.
+                        // Also, when MOVE_RESIGN, search_value is a one-stop score, which should be the minimum value of eval_limit (-31998)...
+                        cout << "Error! : " << pos.fen() << next_move << search_value << endl;
+                        break;
+                    }
+
+                    // Save the move score for adjudication.
+                    move_hist_scores.push_back(search_value);
+
+#if 0
+                    dbg_hit_on(search_value == leaf_value);
+                    // gensfen depth 3 eval_limit 32000
+                    // Total 217749 Hits 203579 hit rate (%) 93.490
+                    // gensfen depth 6 eval_limit 32000
+                    // Total 78407 Hits 69190 hit rate (%) 88.245
+                    // gensfen depth 6 eval_limit 3000
+                    // Total 53879 Hits 43713 hit rate (%) 81.132
+
+                    // Problems such as pruning with moves in the substitution table.
+                    // This is a little uncomfortable as a teacher...
+#endif
+
+                    // If depth 0, pv is not obtained, so search again at depth 2.
+                    if (search_depth_min <= 0)
+                    {
+                        auto [research_value, research_pv] = search(pos, 2);
+                        search_pv = research_pv;
+                    }
+
+                    // Discard stuff before write_minply is reached
+                    // because it can harm training due to overfitting.
+                    // Initial positions would be too common.
+                    if (ply < write_minply - 1)
+                    {
+                        a_psv.clear();
+                        goto SKIP_SAVE;
+                    }
+
+                    // Look into the position hashtable to see if the same
+                    // position was seen before.
+                    // This is a good heuristic to exlude already seen
+                    // positions without many false positives.
+                    {
+                        auto key = pos.key();
+                        auto hash_index = (size_t)(key & (GENSFEN_HASH_SIZE - 1));
+                        auto old_key = hash[hash_index];
+                        if (key == old_key)
+                        {
+                            a_psv.clear();
+                            goto SKIP_SAVE;
+                        }
+                        else
+                        {
+                            // Replace with the current key.
+                            hash[hash_index] = key;
+                        }
+                    }
+
+                    // Pack the current position into a packed sfen and save it into the buffer.
+                    {
+                        a_psv.emplace_back(PackedSfenValue());
+                        auto& psv = a_psv.back();
+
+                        // Here we only write the position data.
+                        // Result is added after the whole game is done.
+                        pos.sfen_pack(psv.sfen);
+
+                        // Get the value of evaluate() as seen from the 
+                        // root color on the leaf node of the PV line.
+                        // I don't know the goodness and badness of using the 
+                        // return value of search() as it is.
+                        // TODO: Consider using search value instead of evaluate_leaf.
+                        //       Maybe give it as an option.
+                        
+                        // Use PV moves to reach the leaf node and use the value 
+                        // that evaluated() is called on that leaf node.
+                        const auto leaf_value = evaluate_leaf(pos, states, ply, search_pv);
+
+                        // If for some reason the leaf node couldn't yield an eval
+                        // we fallback to search value.
+                        psv.score = leaf_value == VALUE_NONE ? search_value : leaf_value;
+
+                        psv.gamePly = ply;
+
+                        // Take out the first PV move. This should be present unless depth 0.
+                        assert(search_pv.size() >= 1);
+                        psv.move = search_pv[0];
+                    }
+
+                SKIP_SAVE:;
+
+                    // For some reason, We could not get PV (hit the substitution table etc. and got stuck?) 
+                    // so go to the next game. It's a rare case, so you can ignore it.
+                    if (search_pv.size() == 0)
+                    {
+                        break;
+                    }
+
+                    // Update the next move according to best search result.
+                    next_move = search_pv[0];
+                }
+
+            RANDOM_MOVE:;
+
+                auto random_move = choose_random_move(pos, random_move_flag, ply, actual_random_move_count);
+                if (random_move.has_value())
+                {
+                    next_move = random_move.value();
+
+                    // We don't have the whole game yet, but it ended, 
+                    // so the writing process ends and the next game starts.
+                    if (!is_ok(next_move))
+                    {
+                        break;
+                    }
+
+                    // Clear the sfens that were written before the random move.
+                    // (???) why?
+                    a_psv.clear();
+                }
+
+            DO_MOVE:;
+                pos.do_move(next_move, states[ply]);
+
+                // Call node evaluate() for each difference calculation.
+                Eval::NNUE::update_eval(pos);
+
+            } // for (int ply = 0; ; ++ply)
+
+        } // while(!quit)
+
+        sfen_writer.finalize(thread_id);
+    }
+
+    // -----------------------------------
+    // Command to generate a game record (master thread)
+    // -----------------------------------
+
+    // Command to generate a game record
+    void gen_sfen(Position&, istringstream& is)
+    {
+        // number of threads (given by USI setoption)
+        uint32_t thread_num = (uint32_t)Options["Threads"];
+
+        // Number of generated game records default = 8 billion phases (Ponanza specification)
+        uint64_t loop_max = 8000000000UL;
+
+        // Stop the generation when the evaluation value reaches this value.
+        int eval_limit = 3000;
+
+        // search depth
+        int search_depth_min = 3;
+        int search_depth_max = INT_MIN;
+
+        // Number of nodes to be searched.
+        uint64_t nodes = 0;
+
+        // minimum ply, maximum ply and number of random moves
+        int random_move_minply = 1;
+        int random_move_maxply = 24;
+        int random_move_count = 5;
+
+        // A function to move the random move mainly like Apery
+        // If this is set to 3, the ball will move with a probability of 1/3.
+        int random_move_like_apery = 0;
+
+        // If you search with multipv instead of random move and choose from among them randomly, set random_multi_pv = 1 or more.
+        int random_multi_pv = 0;
+        int random_multi_pv_diff = 32000;
+        int random_multi_pv_depth = INT_MIN;
+
+        // The minimum and maximum ply (number of steps from the initial phase) of the phase to write out.
+        int write_minply = 16;
+        int write_maxply = 400;
+
+        // File name to write
+        string output_file_name = "generated_kifu.bin";
+
+        string token;
+
+        // When hit to eval hash, as a evaluation value near the initial stage, if a hash collision occurs and a large value is written
+        // When eval_limit is set small, eval_limit will be exceeded every time in the initial phase, and phase generation will not proceed.
+        // Therefore, eval hash needs to be disabled.
+        // After that, when the hash of the eval hash collides, the evaluation value of a strange value is used, and it may be unpleasant to use it for the teacher.
+        bool use_eval_hash = false;
+
+        // Save to file in this unit.
+        // File names are serialized like file_1.bin, file_2.bin.
+        uint64_t save_every = UINT64_MAX;
+
+        // Add a random number to the end of the file name.
+        bool random_file_name = false;
+
+        while (true)
+        {
+            token = "";
+            is >> token;
+            if (token == "")
+                break;
+
+            if (token == "depth")
+                is >> search_depth_min;
+            else if (token == "depth2")
+                is >> search_depth_max;
+            else if (token == "nodes")
+                is >> nodes;
+            else if (token == "loop")
+                is >> loop_max;
+            else if (token == "output_file_name")
+                is >> output_file_name;
+            else if (token == "eval_limit")
+            {
+                is >> eval_limit;
+                // Limit the maximum to a one-stop score. (Otherwise you might not end the loop)
+                eval_limit = std::min(eval_limit, (int)mate_in(2));
+            }
+            else if (token == "random_move_minply")
+                is >> random_move_minply;
+            else if (token == "random_move_maxply")
+                is >> random_move_maxply;
+            else if (token == "random_move_count")
+                is >> random_move_count;
+            else if (token == "random_move_like_apery")
+                is >> random_move_like_apery;
+            else if (token == "random_multi_pv")
+                is >> random_multi_pv;
+            else if (token == "random_multi_pv_diff")
+                is >> random_multi_pv_diff;
+            else if (token == "random_multi_pv_depth")
+                is >> random_multi_pv_depth;
+            else if (token == "write_minply")
+                is >> write_minply;
+            else if (token == "write_maxply")
+                is >> write_maxply;
+            else if (token == "use_eval_hash")
+                is >> use_eval_hash;
+            else if (token == "save_every")
+                is >> save_every;
+            else if (token == "random_file_name")
+                is >> random_file_name;
+            // Accept also the old option name.
+            else if (token == "use_draw_in_training_data_generation" || token == "write_out_draw_game_in_training_data_generation")
+                is >> write_out_draw_game_in_training_data_generation;
+            // Accept also the old option name.
+            else if (token == "use_game_draw_adjudication" || token == "detect_draw_by_consecutive_low_score")
+                is >> detect_draw_by_consecutive_low_score;
+            else if (token == "detect_draw_by_insufficient_mating_material")
+                is >> detect_draw_by_insufficient_mating_material;
+            else if (token == "use_raw_nnue_eval")
+                is >> use_raw_nnue_eval;
+            else
+                cout << "Error! : Illegal token " << token << endl;
+        }
+
+#if defined(USE_GLOBAL_OPTIONS)
+        // Save it for later restore.
+        auto oldGlobalOptions = GlobalOptions;
+        GlobalOptions.use_eval_hash = use_eval_hash;
+#endif
+
+        // If search depth2 is not set, leave it the same as search depth.
+        if (search_depth_max == INT_MIN)
+            search_depth_max = search_depth_min;
+        if (random_multi_pv_depth == INT_MIN)
+            random_multi_pv_depth = search_depth_min;
+
+        if (random_file_name)
+        {
+            // Give a random number to output_file_name at this point.
+            // Do not use std::random_device().  Because it always the same integers on MinGW.
+            PRNG r(std::chrono::system_clock::now().time_since_epoch().count());
+            // Just in case, reassign the random numbers.
+            for (int i = 0; i < 10; ++i)
+                r.rand(1);
+            auto to_hex = [](uint64_t u) {
+                std::stringstream ss;
+                ss << std::hex << u;
+                return ss.str();
+            };
+            // I don't want to wear 64bit numbers by accident, so I'next_move going to make a 64bit number 2 just in case.
+            output_file_name = output_file_name + "_" + to_hex(r.rand<uint64_t>()) + to_hex(r.rand<uint64_t>());
+        }
+
+        std::cout << "gensfen : " << endl
+            << "  search_depth_min = " << search_depth_min << " to " << search_depth_max << endl
+            << "  nodes = " << nodes << endl
+            << "  loop_max = " << loop_max << endl
+            << "  eval_limit = " << eval_limit << endl
+            << "  thread_num (set by USI setoption) = " << thread_num << endl
+#if defined(USE_BOOK)
+            << "  book_moves (set by USI setoption) = " << Options["BookMoves"] << endl
+#endif
+            << "  random_move_minply     = " << random_move_minply << endl
+            << "  random_move_maxply     = " << random_move_maxply << endl
+            << "  random_move_count      = " << random_move_count << endl
+            << "  random_move_like_apery = " << random_move_like_apery << endl
+            << "  random_multi_pv        = " << random_multi_pv << endl
+            << "  random_multi_pv_diff   = " << random_multi_pv_diff << endl
+            << "  random_multi_pv_depth  = " << random_multi_pv_depth << endl
+            << "  write_minply           = " << write_minply << endl
+            << "  write_maxply           = " << write_maxply << endl
+            << "  output_file_name       = " << output_file_name << endl
+            << "  use_eval_hash          = " << use_eval_hash << endl
+            << "  save_every             = " << save_every << endl
+            << "  random_file_name       = " << random_file_name << endl
+            << "  write_out_draw_game_in_training_data_generation = " << write_out_draw_game_in_training_data_generation << endl
+            << "  detect_draw_by_consecutive_low_score = " << detect_draw_by_consecutive_low_score << endl
+            << "  detect_draw_by_insufficient_mating_material = " << detect_draw_by_insufficient_mating_material << endl;
+
+        // Show if the training data generator uses NNUE.
+        Eval::verify_NNUE();
+
+        // Create and execute threads as many as Options["Threads"].
+        {
+            SfenWriter sfen_writer(output_file_name, thread_num);
+            sfen_writer.set_save_interval(save_every);
+
+            MultiThinkGenSfen multi_think(search_depth_min, search_depth_max, sfen_writer);
+            multi_think.nodes = nodes;
+            multi_think.set_loop_max(loop_max);
+            multi_think.eval_limit = eval_limit;
+            multi_think.random_move_minply = random_move_minply;
+            multi_think.random_move_maxply = random_move_maxply;
+            multi_think.random_move_count = random_move_count;
+            multi_think.random_move_like_apery = random_move_like_apery;
+            multi_think.random_multi_pv = random_multi_pv;
+            multi_think.random_multi_pv_diff = random_multi_pv_diff;
+            multi_think.random_multi_pv_depth = random_multi_pv_depth;
+            multi_think.write_minply = write_minply;
+            multi_think.write_maxply = write_maxply;
+            multi_think.start_file_write_worker();
+            multi_think.go_think();
+
+            // Since we are joining with the destructor of SfenWriter, please give a message that it has finished after the join
+            // Enclose this in a block because it should be displayed.
+        }
+
+        std::cout << "gensfen finished." << endl;
+
+#if defined(USE_GLOBAL_OPTIONS)
+        // Restore Global Options.
+        GlobalOptions = oldGlobalOptions;
+#endif
+
+    }
+}
+#endif
diff --git a/src/learn/learn.h b/src/learn/learn.h
index eda2bb32..e29ed74a 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -180,6 +180,23 @@ typedef float LearnFloatType;
 #define ADA_GRAD_UPDATE
 #endif
 
+// Character string according to update formula. (Output for debugging.)
+// Implemented various update expressions, but concluded that AdaGrad is the best in terms of speed and memory.
+#if defined(ADA_GRAD_UPDATE)
+#define LEARN_UPDATE "AdaGrad"
+#elif defined(SGD_UPDATE)
+#define LEARN_UPDATE "SGD"
+#endif
+
+#if defined(LOSS_FUNCTION_IS_WINNING_PERCENTAGE)
+#define LOSS_FUNCTION "WINNING_PERCENTAGE"
+#elif defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY)
+#define LOSS_FUNCTION "CROSS_ENTOROPY"
+#elif defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE)
+#define LOSS_FUNCTION "CROSS_ENTOROPY_FOR_VALUE"
+#elif defined(LOSS_FUNCTION_IS_ELMO_METHOD)
+#define LOSS_FUNCTION "ELMO_METHOD(WCSC27)"
+#endif
 
 // ----------------------
 // Definition of struct used in Learner
@@ -223,13 +240,38 @@ namespace Learner
 	// Used in Learner::search(), Learner::qsearch().
 	typedef std::pair<Value, std::vector<Move> > ValueAndPV;
 
+	// Phase array: PSVector stands for packed sfen vector.
+	typedef std::vector<PackedSfenValue> PSVector;
+
 	// So far, only Yaneura King 2018 Otafuku has this stub
 	// This stub is required if EVAL_LEARN is defined.
 	extern Learner::ValueAndPV  search(Position& pos, int depth , size_t multiPV = 1 , uint64_t NodesLimit = 0);
 	extern Learner::ValueAndPV qsearch(Position& pos);
 
 	double calc_grad(Value shallow, const PackedSfenValue& psv);
+	
+	void convert_bin_from_pgn_extract(
+		const std::vector<std::string>& filenames,
+		const std::string& output_file_name,
+		const bool pgn_eval_side_to_move,
+		const bool convert_no_eval_fens_as_score_zero);
+	
+	void convert_bin(
+		const std::vector<std::string>& filenames,
+		const std::string& output_file_name,
+		const int ply_minimum,
+		const int ply_maximum,
+		const int interpolate_eval,
+		const int src_score_min_value,
+		const int src_score_max_value,
+		const int dest_score_min_value,
+		const int dest_score_max_value,
+		const bool check_invalid_fen,
+		const bool check_illegal_move);
 
+	void convert_plain(
+		const std::vector<std::string>& filenames,
+		const std::string& output_file_name);
 }
 
 #endif
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 9f02a594..c897dd93 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -13,54 +13,34 @@
 // → I will not be involved in the engine because it is a problem that the GUI should assist.
 // etc..
 
+#define EVAL_LEARN
+
 #if defined(EVAL_LEARN)
 
-#include <chrono>
-#include <filesystem>
-#include <random>
-#include <regex>
+#include "../eval/evaluate_common.h"
 
 #include "learn.h"
 #include "multi_think.h"
 #include "../uci.h"
 #include "../syzygy/tbprobe.h"
+#include "../misc.h"
+#include "../thread.h"
+#include "../position.h"
+#include "../tt.h"
 
-// evaluate header for learning
-#include "../eval/evaluate_common.h"
-
-// ----------------------
-// constant string based on the settings
-// ----------------------
-
-// Character string according to update formula. (Output for debugging.)
-// Implemented various update expressions, but concluded that AdaGrad is the best in terms of speed and memory.
-#if defined(ADA_GRAD_UPDATE)
-#define LEARN_UPDATE "AdaGrad"
-#elif defined(SGD_UPDATE)
-#define LEARN_UPDATE "SGD"
-#endif
-
-#if defined(LOSS_FUNCTION_IS_WINNING_PERCENTAGE)
-#define LOSS_FUNCTION "WINNING_PERCENTAGE"
-#elif defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY)
-#define LOSS_FUNCTION "CROSS_ENTOROPY"
-#elif defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE)
-#define LOSS_FUNCTION "CROSS_ENTOROPY_FOR_VALUE"
-#elif defined(LOSS_FUNCTION_IS_ELMO_METHOD)
-#define LOSS_FUNCTION "ELMO_METHOD(WCSC27)"
-#endif
-
-// -----------------------------------
-// Below, the implementation section.
-// -----------------------------------
-
+#include <chrono>
+#include <random>
+#include <regex>
 #include <sstream>
 #include <fstream>
 #include <unordered_set>
 #include <iomanip>
 #include <list>
-#include <cmath>	// std::exp(),std::pow(),std::log()
-#include <cstring>	// memcpy()
+#include <cmath>    // std::exp(),std::pow(),std::log()
+#include <cstring>  // memcpy()
+#include <memory>
+#include <limits>
+#include <optional>
 
 #if defined (_OPENMP)
 #include <omp.h>
@@ -75,13 +55,6 @@
 #include <dirent.h>
 #endif
 
-#include "../misc.h"
-#include "../thread.h"
-#include "../position.h"
-//#include "../extra/book/book.h"
-#include "../tt.h"
-#include "multi_think.h"
-
 #if defined(EVAL_NNUE)
 #include "../nnue/evaluate_nnue_learner.h"
 #include <climits>
@@ -93,3470 +66,2019 @@ using namespace std;
 //// This is defined in the search section.
 //extern Book::BookMoveSelector book;
 
-// Addition and subtraction definition for atomic<T>
-// Aligned with atomicAdd() in Apery/learner.hpp.
 template <typename T>
 T operator += (std::atomic<T>& x, const T rhs)
 {
-	T old = x.load(std::memory_order_consume);
-	// It is allowed that the value is rewritten from other thread at this timing.
-	// The idea that the value is not destroyed is good.
-	T desired = old + rhs;
-	while (!x.compare_exchange_weak(old, desired, std::memory_order_release, std::memory_order_consume))
-		desired = old + rhs;
-	return desired;
+    T old = x.load(std::memory_order_consume);
+    // It is allowed that the value is rewritten from other thread at this timing.
+    // The idea that the value is not destroyed is good.
+    T desired = old + rhs;
+    while (!x.compare_exchange_weak(old, desired, std::memory_order_release, std::memory_order_consume))
+        desired = old + rhs;
+    return desired;
 }
 template <typename T>
 T operator -= (std::atomic<T>& x, const T rhs) { return x += -rhs; }
 
 namespace Learner
 {
-
-// Phase array: PSVector stands for packed sfen vector.
-typedef std::vector<PackedSfenValue> PSVector;
-
-bool write_out_draw_game_in_training_data_generation = false;
-bool use_draw_games_in_training = false;
-bool use_draw_games_in_validation = false;
-bool skip_duplicated_positions_in_training = true;
-bool detect_draw_by_consecutive_low_score = false;
-bool detect_draw_by_insufficient_mating_material = false;
-// 1.0 / PawnValueEg / 4.0 * log(10.0)
-double winning_probability_coefficient = 0.00276753015984861260098316280611;
-// Score scale factors.  ex) If we set src_score_min_value = 0.0,
-// src_score_max_value = 1.0, dest_score_min_value = 0.0,
-// dest_score_max_value = 10000.0, [0.0, 1.0] will be scaled to [0, 10000].
-double src_score_min_value = 0.0;
-double src_score_max_value = 1.0;
-double dest_score_min_value = 0.0;
-double dest_score_max_value = 1.0;
-// Assume teacher signals are the scores of deep searches, and convert them into winning
-// probabilities in the trainer. Sometimes we want to use the winning probabilities in the training
-// data directly. In those cases, we set false to this variable.
-bool convert_teacher_signal_to_winning_probability = true;
-// Use raw NNUE eval value in the Eval::evaluate(). If hybrid eval is enabled, training data
-// generation and training don't work well.
-// https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
-bool use_raw_nnue_eval = true;
-// Using WDL with win rate model instead of sigmoid
-bool use_wdl = false;
-
-// -----------------------------------
-// write phase file
-// -----------------------------------
-
-// Helper class for exporting Sfen
-struct SfenWriter
-{
-		// File name to write and number of threads to create
-	SfenWriter(string filename, int thread_num)
-	{
-		sfen_buffers_pool.reserve((size_t)thread_num * 10);
-		sfen_buffers.resize(thread_num);
-
-		// When performing additional learning, the quality of the teacher generated after learning the evaluation function does not change much and I want to earn more teacher positions.
-		// Since it is preferable that old teachers also use it, it has such a specification.
-		fs.open(filename, ios::out | ios::binary | ios::app);
-		filename_ = filename;
-
-		finished = false;
-	}
-
-	~SfenWriter()
-	{
-		finished = true;
-		file_worker_thread.join();
-		fs.close();
-
-		// all buffers should be empty since file_worker_thread has written all..
-		for (auto p : sfen_buffers) { assert(p == nullptr); }
-		assert(sfen_buffers_pool.empty());
-	}
-
-	// For each thread, flush the file by this number of phases.
-	const size_t SFEN_WRITE_SIZE = 5000;
-
-	// write one by pairing the phase and evaluation value (in packed sfen format)
-	void write(size_t thread_id, const PackedSfenValue& psv)
-	{
-		// We have a buffer for each thread and add it there.
-		// If the buffer overflows, write it to a file.
-
-		// This buffer is prepared for each thread.
-		auto& buf = sfen_buffers[thread_id];
-
-		// Secure since there is no buf at the first time and immediately after writing the thread buffer.
-		if (!buf)
-		{
-			buf = new PSVector();
-			buf->reserve(SFEN_WRITE_SIZE);
-		}
-
-		// It is prepared for each thread, so one thread does not call this write() function at the same time.
-		// There is no need to exclude at this point.
-		buf->push_back(psv);
-
-		if (buf->size() >= SFEN_WRITE_SIZE)
-		{
-			// If you load it in sfen_buffers_pool, the worker will do the rest.
-
-			// Mutex lock is required when changing the contents of sfen_buffers_pool.
-			std::unique_lock<std::mutex> lk(mutex);
-			sfen_buffers_pool.push_back(buf);
-
-			buf = nullptr;
-			// If you set buf == nullptr, the buffer will be allocated the next time this function is called.
-		}
-	}
-
-	// Move what remains in the buffer for your thread to a buffer for writing to a file.
-	void finalize(size_t thread_id)
-	{
-		std::unique_lock<std::mutex> lk(mutex);
-
-		auto& buf = sfen_buffers[thread_id];
-
-		// There is a case that buf==nullptr, so that check is necessary.
-		if (buf && buf->size() != 0)
-			sfen_buffers_pool.push_back(buf);
-
-		buf = nullptr;
-	}
-
-	// Start the write_worker thread.
-	void start_file_write_worker()
-	{
-		file_worker_thread = std::thread([&] { this->file_write_worker(); });
-	}
-
-	// Dedicated thread to write to file
-	void file_write_worker()
-	{
-		auto output_status = [&]()
-		{
-			// also output the current time
-			sync_cout << endl << sfen_write_count << " sfens , at " << now_string() << sync_endl;
-
-			// This is enough for flush().
-			fs.flush();
-		};
-
-		while (!finished || sfen_buffers_pool.size())
-		{
-			vector<PSVector*> buffers;
-			{
-				std::unique_lock<std::mutex> lk(mutex);
-
-				// copy the whole
-				buffers = sfen_buffers_pool;
-				sfen_buffers_pool.clear();
-			}
-
-			// sleep() if you didn't get anything
-			if (!buffers.size())
-				sleep(100);
-			else
-			{
-				for (auto ptr : buffers)
-				{
-					fs.write((const char*)&((*ptr)[0]), sizeof(PackedSfenValue) * ptr->size());
-
-					sfen_write_count += ptr->size();
-
-#if 1
-					// Add the processed number here, and if it exceeds save_every, change the file name and reset this counter.
-					save_every_counter += ptr->size();
-					if (save_every_counter >= save_every)
-					{
-						save_every_counter = 0;
-						// Change the file name.
-
-						fs.close();
-
-						// Sequential number attached to the file
-						int n = (int)(sfen_write_count / save_every);
-						// Rename the file and open it again. Add ios::app in consideration of overwriting. (Depending on the operation, it may not be necessary.)
-						string filename = filename_ + "_" + std::to_string(n);
-						fs.open(filename, ios::out | ios::binary | ios::app);
-						cout << endl << "output sfen file = " << filename << endl;
-					}
-#endif
-
-					// Output'.' every time when writing a game record.
-					std::cout << ".";
-
-					// Output the number of phases processed every 40 times
-					// Finally, the remainder of the teacher phase of each thread is written out, so halfway numbers are displayed, but is it okay?
-					// If you overuse the threads to the maximum number of logical cores, the console will be clogged, so it may be a little more loose.
-					if ((++time_stamp_count % 40) == 0)
-						output_status();
-
-					// Since this memory is unnecessary, release it at this timing.
-					delete ptr;
-				}
-			}
-		}
-
-		// Output the time stamp again before the end.
-		output_status();
-	}
-
-	// Change the file name in this unit.
-	uint64_t save_every = UINT64_MAX;
-
-private:
-
-	fstream fs;
-
-	// File name passed in the constructor
-	std::string filename_;
-
-	// Add the processed number here, and if it exceeds save_every, change the file name and reset this counter.
-	uint64_t save_every_counter = 0;
-
-	// thread to write to the file
-	std::thread file_worker_thread;
-	// Flag that all threads have finished
-	atomic<bool> finished;
-
-	// Counter for time stamp output
-	uint64_t time_stamp_count = 0;
-
-	// buffer before writing to file
-	// sfen_buffers is the buffer for each thread
-	// sfen_buffers_pool is a buffer for writing.
-	// After loading the phase in the former buffer by SFEN_WRITE_SIZE, transfer it to the latter.
-	std::vector<PSVector*> sfen_buffers;
-	std::vector<PSVector*> sfen_buffers_pool;
-
-	// Mutex required to access sfen_buffers_pool
-	std::mutex mutex;
-
-	// number of written phases
-	uint64_t sfen_write_count = 0;
-};
-
-// -----------------------------------
-// worker that creates the game record (for each thread)
-// -----------------------------------
-
-// Class to generate sfen with multiple threads
-struct MultiThinkGenSfen : public MultiThink
-{
-	MultiThinkGenSfen(int search_depth_, int search_depth2_, SfenWriter& sw_)
-		: search_depth(search_depth_), search_depth2(search_depth2_), sw(sw_)
-	{
-		hash.resize(GENSFEN_HASH_SIZE);
-
-		// Output for confirmation if the same random seed is not drawn when parallelizing and gensfening the PC.
-		std::cout << prng << std::endl;
-	}
-
-	virtual void thread_worker(size_t thread_id);
-	void start_file_write_worker() { sw.start_file_write_worker(); }
-
-	// search_depth = search depth for normal search
-	int search_depth;
-	int search_depth2;
-
-	// Number of the nodes to be searched.
-	// 0 represents no limits.
-	uint64_t nodes;
-
-	// Upper limit of evaluation value of generated situation
-	int eval_limit;
-
-	// minimum ply with random move
-	int random_move_minply;
-	// maximum ply with random move
-	int random_move_maxply;
-	// Number of random moves in one station
-	int random_move_count;
-	// Move balls with a probability of 1/N when randomly moving like Apery.
-	// When you move the ball again, there is a 1/N chance that it will randomly move once in the opponent's number.
-	// Apery has N=2. Specifying 0 here disables this function.
-	int random_move_like_apery;
-
-	// For when using multi pv instead of random move.
-	// random_multi_pv is the number of candidates for MultiPV.
-	// When adopting the move of the candidate move, the difference between the evaluation value of the move of the 1st place and the evaluation value of the move of the Nth place is
-	// Must be in the range random_multi_pv_diff.
-	// random_multi_pv_depth is the search depth for MultiPV.
-	int random_multi_pv;
-	int random_multi_pv_diff;
-	int random_multi_pv_depth;
-
-	// The minimum and maximum ply (number of steps from the initial phase) of the phase to write out.
-	int write_minply;
-	int write_maxply;
-
-	// sfen exporter
-	SfenWriter& sw;
-
-	// hash to limit the export of the same phase
-	// It must be 2**N because it will be used as the mask to calculate hash_index.
-	static const uint64_t GENSFEN_HASH_SIZE = 64 * 1024 * 1024;
-
-	vector<Key> hash; // 64MB*sizeof(HASH_KEY) = 512MB
-};
-
-//  thread_id    = 0..Threads.size()-1
-void MultiThinkGenSfen::thread_worker(size_t thread_id)
-{
-	// For the time being, it will be treated as a draw at the maximum number of steps to write.
-	const int MAX_PLY2 = write_maxply;
-
-	//Maximum StateInfo + Search PV to advance to leaf buffer
-	std::vector<StateInfo,AlignedAllocator<StateInfo>> states(MAX_PLY2 + MAX_PLY /* == search_depth + α */);
-	StateInfo si;
-
-	// This move. Use this move to advance the stage.
-	Move m = MOVE_NONE;
-
-	// end flag
-	bool quit = false;
-
-	// Variables for draw adjudication.
-	// Todo: Make this as an option.
-	int adj_draw_ply = 80; // start the adjudication when ply reaches this value
-	int adj_draw_cnt = 8;  // 4 move scores for each side have to be checked
-	int adj_draw_score = 0;  // move score in CP
-
-	// repeat until the specified number of times
-	while (!quit)
-	{
-		// It is necessary to set a dependent thread for Position.
-		// When parallelizing, Threads (since this is a vector<Thread*>,
-		// Do the same for up to Threads[0]...Threads[thread_num-1].
-		auto th = Threads[thread_id];
-
-		auto& pos = th->rootPos;
-    pos.set(StartFEN, false, &si, th);
-
-    // Test cod for Packed SFEN.
-    //{
-    //  PackedSfen packed_sfen;
-    //  pos.sfen_pack(packed_sfen);
-    //  std::cout << pos << std::endl;
-    //  pos.set_from_packed_sfen(packed_sfen, &si, th);
-    //  std::string actual = pos.fen();
-    //  assert(actual == StartFEN);
-    //}
-
-		// Refer to the members of BookMoveSelector defined in the search section.
-		//auto& book = ::book;
-
-		// Save the situation for one station, and write it out including the winning and losing at the end.
-		// The function to write is flush_psv() below this.
-		PSVector a_psv;
-		a_psv.reserve(MAX_PLY2 + MAX_PLY);
-
-		// Write out the phases loaded in a_psv to a file.
-		// lastTurnIsWin: win/loss in the next phase after the final phase in a_psv
-		// 1 when winning. -1 when losing. Pass 0 for a draw.
-		// Return value: true if the specified number of phases has already been reached and the process ends.
-		auto flush_psv = [&](int8_t lastTurnIsWin)
-		{
-			int8_t isWin = lastTurnIsWin;
-
-			// From the final stage (one step before) to the first stage, give information on the outcome of the game for each stage.
-			// The phases stored in a_psv are assumed to be continuous (in order).
-			for (auto it = a_psv.rbegin(); it != a_psv.rend(); ++it)
-			{
-				// If isWin == 0 (draw), multiply by -1 and it will remain 0 (draw)
-				isWin = - isWin;
-				it->game_result = isWin;
-
-				// When I tried to write out the phase, it reached the specified number of times.
-				// Because the counter is added in get_next_loop_count()
-				// If you don't call this when the phase is output, the counter goes crazy.
-				auto loop_count = get_next_loop_count();
-				if (loop_count == UINT64_MAX)
-				{
-					// Set the end flag.
-					quit = true;
-					return;
-				}
-
-				// Write out one aspect.
-				sw.write(thread_id, *it);
-
-#if 0
-				pos.set_from_packed_sfen(it->sfen);
-				cout << pos << "Win : " << it->isWin << " , " << it->score << endl;
-#endif
-			}
-		};
-
-		// ply flag for whether or not to randomly move by eyes
-		vector<bool> random_move_flag;
-		{
-			// If you want to add a random move, random_move_maxply be sure to enter random_move_count times before the first move.
-			// I want you to disperse so much.
-			// I'm not sure how best it is. Experimenting under various conditions.
-
-			// Make an array like a[0] = 0 ,a[1] = 1, ...
-			// Fisher-Yates shuffle and take out the first N items.
-			// Actually, I only want N pieces, so I only need to shuffle the first N pieces with Fisher-Yates.
-
-			vector<int> a;
-			a.reserve((size_t)random_move_maxply);
-
-			// random_move_minply ,random_move_maxply is specified by 1 origin,
-			// Note that we are handling 0 origin here.
-			for (int i = std::max(random_move_minply - 1 , 0) ; i < random_move_maxply; ++i)
-				a.push_back(i);
-
-			// In case of Apery random move, insert() may be called random_move_count times.
-			// Reserve only the size considering it.
-			random_move_flag.resize((size_t)random_move_maxply + random_move_count);
-
-			// A random move that exceeds the size() of a[] cannot be applied, so limit it.
-			for (int i = 0 ; i < std::min(random_move_count, (int)a.size()) ; ++i)
-			{
-				swap(a[i], a[prng.rand((uint64_t)a.size() - i) + i]);
-				random_move_flag[a[i]] = true;
-			}
-		}
-
-		// A counter that keeps track of the number of random moves
-		// When random_move_minply == -1, random moves are performed continuously, so use it at this time.
-		int random_move_c = 0;
-
-		// Save history of move scores for adjudication
-		vector<int> move_hist_scores;
-
-		// ply: steps from the initial stage
-		for (int ply = 0; ; ++ply)
-		{
-			//cout << pos << endl;
-
-			// Current search depth
-			// Goto will fly, so declare it first.
-			int depth = search_depth + (int)prng.rand(search_depth2 - search_depth + 1);
-
-			// has it reached the length
-			if (ply >= MAX_PLY2)
-			{
-				if (write_out_draw_game_in_training_data_generation) {
-				// Write out as win/loss = draw.
-				// This way it is harder to allow the opponent to enter the ball when I enter (may)
-				flush_psv(0);
-				}
-				break;
-			}
-
-      if (pos.is_draw(ply)) {
-		  if (write_out_draw_game_in_training_data_generation) {
-			  // Write if draw.
-			  flush_psv(0);
-		  }
-        break;
-      }
-
-			// Initialize the Syzygy Ending Tablebase and sort the moves.
-			Search::RootMoves rootMoves;
-			for (const auto& m : MoveList<LEGAL>(pos))
-				rootMoves.emplace_back(m);
-			if (!rootMoves.empty())
-				Tablebases::rank_root_moves(pos, rootMoves);
-
-			// If there is no legal move, terminate the game if position
-			// is mate or a stalemate.
-			else {
-				if (pos.checkers()) // Mate
-					flush_psv(-1);
-				else if (write_out_draw_game_in_training_data_generation) {
-					flush_psv(0); // Stalemate
-				}
-				break;
-			}
-
-			// Adjudicate game to a draw if the last 4 scores of each engine is 0.
-			if (detect_draw_by_consecutive_low_score) {
-				if (ply >= adj_draw_ply) {
-					int draw_cnt = 0;
-					bool is_adj_draw = false;
-
-					for (vector<int>::reverse_iterator it = move_hist_scores.rbegin();
-						it != move_hist_scores.rend(); ++it) 
-					{
-						if (abs(*it) <= adj_draw_score)
-							draw_cnt++;
-						else
-							break;  // score should be successive
-
-						if (draw_cnt >= adj_draw_cnt) {
-							is_adj_draw = true;
-							break;
-						}
-					}
-
-					if (is_adj_draw) {
-						if (write_out_draw_game_in_training_data_generation)
-							flush_psv(0);
-						break;
-					}
-				}
-			}
-
-			// Draw by insufficient mating material
-			if (detect_draw_by_insufficient_mating_material) {
-				if (pos.count<ALL_PIECES>() <= 4) {
-					int pcnt = pos.count<ALL_PIECES>();
-					// (1) KvK
-					if (pcnt == 2) {
-						if (write_out_draw_game_in_training_data_generation)
-							flush_psv(0);
-						break;
-					}
-					// (2) KvK + 1 minor piece
-					if (pcnt == 3) {
-						int minor_pc = pos.count<BISHOP>(WHITE) + pos.count<KNIGHT>(WHITE) +
-							pos.count<BISHOP>(BLACK) + pos.count<KNIGHT>(BLACK);
-						if (minor_pc == 1) {
-							if (write_out_draw_game_in_training_data_generation)
-								flush_psv(0);
-							break;
-						}
-					}
-					// (3) KBvKB, bishops of the same color
-					else if (pcnt == 4) {
-						if (pos.count<BISHOP>(WHITE) == 1 && pos.count<BISHOP>(BLACK) == 1) {
-							// Color of bishops is black.
-							if ((pos.pieces(WHITE, BISHOP) & DarkSquares)
-								&& (pos.pieces(BLACK, BISHOP) & DarkSquares))
-							{
-								if (write_out_draw_game_in_training_data_generation)
-									flush_psv(0);
-								break;
-							}
-							// Color of bishops is white.
-							if ((pos.pieces(WHITE, BISHOP) & ~DarkSquares)
-								&& (pos.pieces(BLACK, BISHOP) & ~DarkSquares))
-							{
-								if (write_out_draw_game_in_training_data_generation)
-									flush_psv(0);
-								break;
-							}
-						}
-					}
-				}
-			}
-
-			//// constant track
-			//if ((m = book.probe(pos)) != MOVE_NONE)
-			//{
-			//  // Hit the constant track.
-			//  // The move was stored in m.
-
-			//  // Do not use the fixed phase for learning.
-			//  a_psv.clear();
-
-			//  if (random_move_minply != -1)
-			// 		// Random move is performed with a certain probability even in the constant phase.
-			// 		goto RANDOM_MOVE;
-			//  else
-			// 		// When -1 is specified as random_move_minply, it points according to the standard until it goes out of the standard.
-			// 		// Prepare an innumerable number of situations that have left the constant as ConsiderationBookMoveCount true using a huge constant
-			// 		// Used for purposes such as performing a random move 5 times from there.
-			// 		goto DO_MOVE;
-			//}
-
-			{
-				// search_depth～search_depth2 Evaluation value of hand reading and PV (best responder row)
-				// There should be no problem if you narrow the search window.
-
-				auto pv_value1 = search(pos, depth, 1, nodes);
-
-				auto value1 = pv_value1.first;
-				auto& pv1 = pv_value1.second;
-
-				// For situations where the absolute evaluation value is greater than or equal to this value
-				// It doesn't make much sense to use that aspect for learning, so this game ends.
-				// Treat this as having won or lost.
-
-				// If you win one move, declarative win, mate_in(2) will be returned here, so it will be the same value as the upper limit of eval_limit,
-				// This if expression is always true. The same applies to resign.
-
-				if (abs(value1) >= eval_limit)
-				{
-					// sync_cout << pos << "eval limit = "<< eval_limit << "over ,move = "<< pv1[0] << sync_endl;
-
-					// If value1 >= eval_limit in this aspect, you win (the turn side of this aspect).
-					flush_psv((value1 >= eval_limit) ? 1 : -1);
-					break;
-				}
-
-				// Verification of a strange move
-				if (pv1.size() > 0
-					&& (pv1[0] == MOVE_NONE || pv1[0] == MOVE_NULL)
-					)
-				{
-					// MOVE_WIN is checking if it is the declaration victory stage before this
-					// The declarative winning move should never come back here.
-					// Also, when MOVE_RESIGN, value1 is a one-stop score, which should be the minimum value of eval_limit (-31998)...
-					cout << "Error! : " << pos.fen() << m << value1 << endl;
-					break;
-				}
-
-				// Save the move score for adjudication.
-				move_hist_scores.push_back(value1);
-
-				// Use PV's move to the leaf node and use the value that evaluated() is called on that leaf node.
-				auto evaluate_leaf = [&](Position& pos , vector<Move>& pv)
-				{
-					auto rootColor = pos.side_to_move();
-
-					int ply2 = ply;
-					for (auto m : pv)
-					{
-						// As a verification for debugging, make sure there are no illegal players in the middle.
-						// NULL_MOVE does not come.
-
-						// I tested it out enough so I can comment it out.
-#if 1
-						// I shouldn't be an illegal player.
-						// declarative win and not mated() are tested above so
-						// It is guaranteed that MOVE_WIN and MOVE_RESIGN do not come as a reader. (Should...)
-						if (!pos.pseudo_legal(m) || !pos.legal(m))
-						{
-							cout << "Error! : " << pos.fen() << m << endl;
-						}
-#endif
-						pos.do_move(m, states[ply2++]);
-						
-						//Because the difference calculation of evaluate() cannot be performed unless each node evaluate() is called!
-						// If the depth is 8 or more, it seems faster not to calculate this difference.
-#if defined(EVAL_NNUE)
-            if (depth < 8)
-              Eval::NNUE::update_eval(pos);
-#endif  // defined(EVAL_NNUE)
-					}
-
-					// reach leaf
-					Value v;
-					if (pos.checkers()) {
-						// Sometime a king is checked.  An example is a case that a checkmate is
-						// found in the search.  If Eval::evaluate() is called whne a king is
-						// checked, classic eval crashes by an assertion.  To avoid crashes, return
-						// value1 instead of the score of the PV leaf.
-						v = value1;
-					}
-					else {
-						v = Eval::evaluate(pos);
-					// evaluate() returns the evaluation value on the turn side, so
-					// If it's a turn different from root_color, you must invert v and return it.
-					if (rootColor != pos.side_to_move())
-						v = -v;
-					}
-
-					// Rewind.
-					// Is it C++x14, and isn't there even foreach to turn in reverse?
-					//  for (auto it : boost::adaptors::reverse(pv))
-
-					for (auto it = pv.rbegin(); it != pv.rend(); ++it)
-						pos.undo_move(*it);
-
-					return v;
-				};
-
-#if 0
-				dbg_hit_on(pv_value1.first == leaf_value);
-				// gensfen depth 3 eval_limit 32000
-				// Total 217749 Hits 203579 hit rate (%) 93.490
-				// gensfen depth 6 eval_limit 32000
-				// Total 78407 Hits 69190 hit rate (%) 88.245
-				// gensfen depth 6 eval_limit 3000
-				// Total 53879 Hits 43713 hit rate (%) 81.132
-
-				// Problems such as pruning with moves in the substitution table.
-				// This is a little uncomfortable as a teacher...
-#endif
-
-				//If depth 0, pv is not obtained, so search again at depth 2.
-				if (search_depth <= 0)
-				{
-					pv_value1 = search(pos, 2);
-					pv1 = pv_value1.second;
-				}
-
-				// The surroundings of the initial stage are all similar
-				// Do not write it out because it can lead to overlearning when used for learning.
-				// → comparative experiment should be done
-				if (ply < write_minply - 1)
-				{
-					a_psv.clear();
-					goto SKIP_SAVE;
-				}
-
-				// Did you just write the same phase?
-				// This may include the same aspect as it is generated in parallel on multiple PCs, so
-				// It is better to do the same process when reading.
-				{
-					auto key = pos.key();
-					auto hash_index = (size_t)(key & (GENSFEN_HASH_SIZE - 1));
-					auto key2 = hash[hash_index];
-					if (key == key2)
-					{
-						// when skipping regarding earlier
-						// Clear the saved situation because the win/loss information will be incorrect.
-						// anyway, when the hash matches, it's likely that the previous phases also match
-						// Not worth writing out.
-						a_psv.clear();
-						goto SKIP_SAVE;
-					}
-					hash[hash_index] = key; // Replace with the current key.
-				}
-
-				// Temporary saving of the situation.
-				{
-					a_psv.emplace_back(PackedSfenValue());
-					auto &psv = a_psv.back();
-
-					// If pack is requested, write the packed sfen and the evaluation value at that time.
-					// The final writing is after winning or losing.
-					pos.sfen_pack(psv.sfen);
-
-          //{
-          //  std::string before_fen = pos.fen();
-          //  pos.set_from_packed_sfen(psv.sfen, &si, th);
-          //  std::string after_fen = pos.fen();
-          //  assert(before_fen == after_fen);
-          //}
-
-					// Get the value of evaluate() as seen from the root color on the leaf node of the PV line.
-					//I don't know the goodness and badness of using the return value of search() as it is.
-					psv.score = evaluate_leaf(pos, pv1);
-					psv.gamePly = ply;
-
-					// Take out the first PV hand. This should be present unless depth 0.
-					assert(pv_value1.second.size() >= 1);
-					Move pv_move1 = pv_value1.second[0];
-					psv.move = pv_move1;
-				}
-
-			SKIP_SAVE:;
-
-				// For some reason, I could not get PV (hit the substitution table etc. and got stuck?) so go to the next game.
-				// It's a rare case, so you can ignore it.
-				if (pv1.size() == 0)
-					break;
-
-				// search_depth Advance the phase by hand reading.
-				m = pv1[0];
-			}
-
-		RANDOM_MOVE:;
-
-			// Phase to randomly choose one from legal hands
-			if (
-				// 1. Random move of random_move_count times from random_move_minply to random_move_maxply
-				(random_move_minply != -1 && ply <(int)random_move_flag.size() && random_move_flag[ply]) ||
-				// 2. A mode to perform random move of random_move_count times after leaving the track
-				(random_move_minply == -1 && random_move_c <random_move_count))
-			{
-				++random_move_c;
-
-				// It's not a mate, so there should be one legal hand...
-				if (random_multi_pv == 0)
-				{
-					// normal random move
-
-					MoveList<LEGAL> list(pos);
-
-					// I don't really know the goodness and badness of making this the Apery method.
-					if (random_move_like_apery == 0
-						|| prng.rand(random_move_like_apery) != 0
-					)
-					{
-						// Normally one move from legal move
-						m = list.at((size_t)prng.rand((uint64_t)list.size()));
-					}
-					else {
-						// if you can move the ball, move the ball
-						Move moves[8]; // Near 8
-						Move* p = &moves[0];
-						for (auto& m : list)
-							if (type_of(pos.moved_piece(m)) == KING)
-								*(p++) = m;
-						size_t n = p - &moves[0];
-						if (n != 0)
-						{
-							// move to move the ball
-							m = moves[prng.rand(n)];
-
-							// In Apery method, at this time there is a 1/2 chance that the opponent will also move randomly
-							if (prng.rand(2) == 0)
-							{
-								// Is it a simple hack to add a "1" next to random_move_flag[ply]?
-								random_move_flag.insert(random_move_flag.begin() + ply + 1, 1, true);
-							}
-						}
-						else
-							// Normally one move from legal move
-							m = list.at((size_t)prng.rand((uint64_t)list.size()));
-					}
-
-					// I put in the code of two handed balls, but if you choose one from legal hands, it should be equivalent to that
-					// I decided it's unnecessary because it just makes the code more complicated.
-				}
-				else {
-					// Since the logic becomes complicated, I'm sorry, I will search again with MultiPV here.
-					Learner::search(pos, random_multi_pv_depth, random_multi_pv);
-					// Select one from the top N hands of root Moves
-
-					auto& rm = pos.this_thread()->rootMoves;
-
-					uint64_t s = min((uint64_t)rm.size(), (uint64_t)random_multi_pv);
-					for (uint64_t i = 1; i < s; ++i)
-					{
-						// The difference from the evaluation value of rm[0] must be within the range of random_multi_pv_diff.
-						// It can be assumed that rm[x].score is arranged in descending order.
-						if (rm[0].score > rm[i].score + random_multi_pv_diff)
-						{
-							s = i;
-							break;
-						}
-					}
-
-					m = rm[prng.rand(s)].pv[0];
-
-					// I haven't written one phase yet, but it ended, so the writing process ends and the next game starts.
-					if (!is_ok(m))
-						break;
-				}
-
-				// When trying to evaluate the move from the outcome of the game,
-				// There is a random move this time, so try not to fall below this.
-				a_psv.clear(); // clear saved aspect
-			}
-
-		DO_MOVE:;
-			pos.do_move(m, states[ply]);
-
-			// Call node evaluate() for each difference calculation.
-			Eval::NNUE::update_eval(pos);
-
-		} // for (int ply = 0; ; ++ply)
-
-	} // while(!quit)
-
-	sw.finalize(thread_id);
-}
-
-// -----------------------------------
-// Command to generate a game record (master thread)
-// -----------------------------------
-
-// Command to generate a game record
-void gen_sfen(Position&, istringstream& is)
-{
-	// number of threads (given by USI setoption)
-	uint32_t thread_num = (uint32_t)Options["Threads"];
-
-	// Number of generated game records default = 8 billion phases (Ponanza specification)
-	uint64_t loop_max = 8000000000UL;
-
-	// Stop the generation when the evaluation value reaches this value.
-	int eval_limit = 3000;
-
-	// search depth
-	int search_depth = 3;
-	int search_depth2 = INT_MIN;
-
-	// Number of nodes to be searched.
-	uint64_t nodes = 0;
-
-	// minimum ply, maximum ply and number of random moves
-	int random_move_minply = 1;
-	int random_move_maxply = 24;
-	int random_move_count = 5;
-	// A function to move the random move mainly like Apery
-	// If this is set to 3, the ball will move with a probability of 1/3.
-	int random_move_like_apery = 0;
-	// If you search with multipv instead of random move and choose from among them randomly, set random_multi_pv = 1 or more.
-	int random_multi_pv = 0;
-	int random_multi_pv_diff = 32000;
-	int random_multi_pv_depth = INT_MIN;
-
-	// The minimum and maximum ply (number of steps from the initial phase) of the phase to write out.
-	int write_minply = 16;
-	int write_maxply = 400;
-
-	// File name to write
-	string output_file_name = "generated_kifu.bin";
-
-	string token;
-
-	// When hit to eval hash, as a evaluation value near the initial stage, if a hash collision occurs and a large value is written
-	// When eval_limit is set small, eval_limit will be exceeded every time in the initial phase, and phase generation will not proceed.
-	// Therefore, eval hash needs to be disabled.
-	// After that, when the hash of the eval hash collides, the evaluation value of a strange value is used, and it may be unpleasant to use it for the teacher.
-	bool use_eval_hash = false;
-
-	// Save to file in this unit.
-	// File names are serialized like file_1.bin, file_2.bin.
-	uint64_t save_every = UINT64_MAX;
-
-	// Add a random number to the end of the file name.
-	bool random_file_name = false;
-
-	while (true)
-	{
-		token = "";
-		is >> token;
-		if (token == "")
-			break;
-
-		if (token == "depth")
-			is >> search_depth;
-		else if (token == "depth2")
-			is >> search_depth2;
-		else if (token == "nodes")
-			is >> nodes;
-		else if (token == "loop")
-			is >> loop_max;
-		else if (token == "output_file_name")
-			is >> output_file_name;
-		else if (token == "eval_limit")
-		{
-			is >> eval_limit;
-			// Limit the maximum to a one-stop score. (Otherwise you might not end the loop)
-			eval_limit = std::min(eval_limit, (int)mate_in(2));
-		}
-		else if (token == "random_move_minply")
-			is >> random_move_minply;
-		else if (token == "random_move_maxply")
-			is >> random_move_maxply;
-		else if (token == "random_move_count")
-			is >> random_move_count;
-		else if (token == "random_move_like_apery")
-			is >> random_move_like_apery;
-		else if (token == "random_multi_pv")
-			is >> random_multi_pv;
-		else if (token == "random_multi_pv_diff")
-			is >> random_multi_pv_diff;
-		else if (token == "random_multi_pv_depth")
-			is >> random_multi_pv_depth;
-		else if (token == "write_minply")
-			is >> write_minply;
-		else if (token == "write_maxply")
-			is >> write_maxply;
-		else if (token == "use_eval_hash")
-			is >> use_eval_hash;
-		else if (token == "save_every")
-			is >> save_every;
-		else if (token == "random_file_name")
-			is >> random_file_name;
-		// Accept also the old option name.
-		else if (token == "use_draw_in_training_data_generation" || token == "write_out_draw_game_in_training_data_generation")
-			is >> write_out_draw_game_in_training_data_generation;
-		// Accept also the old option name.
-		else if (token == "use_game_draw_adjudication" || token == "detect_draw_by_consecutive_low_score")
-			is >> detect_draw_by_consecutive_low_score;
-		else if (token == "detect_draw_by_insufficient_mating_material")
-			is >> detect_draw_by_insufficient_mating_material;
-		else if (token == "use_raw_nnue_eval")
-			is >> use_raw_nnue_eval;
-		else
-			cout << "Error! : Illegal token " << token << endl;
-	}
-
-#if defined(USE_GLOBAL_OPTIONS)
-	// Save it for later restore.
-	auto oldGlobalOptions = GlobalOptions;
-	GlobalOptions.use_eval_hash = use_eval_hash;
-#endif
-
-	// If search depth2 is not set, leave it the same as search depth.
-	if (search_depth2 == INT_MIN)
-		search_depth2 = search_depth;
-	if (random_multi_pv_depth == INT_MIN)
-		random_multi_pv_depth = search_depth;
-
-	if (random_file_name)
-	{
-		// Give a random number to output_file_name at this point.
-		// Do not use std::random_device().  Because it always the same integers on MinGW.
-		PRNG r(std::chrono::system_clock::now().time_since_epoch().count());
-		// Just in case, reassign the random numbers.
-		for(int i=0;i<10;++i)
-			r.rand(1);
-		auto to_hex = [](uint64_t u){
-			std::stringstream ss;
-			ss << std::hex << u;
-			return ss.str();
-		};
-		// I don't want to wear 64bit numbers by accident, so I'm going to make a 64bit number 2 just in case.
-		output_file_name = output_file_name + "_" + to_hex(r.rand<uint64_t>()) + to_hex(r.rand<uint64_t>());
-	}
-
-	std::cout << "gensfen : " << endl
-		<< "  search_depth = " << search_depth << " to " << search_depth2 << endl
-		<< "  nodes = " << nodes << endl
-		<< "  loop_max = " << loop_max << endl
-		<< "  eval_limit = " << eval_limit << endl
-		<< "  thread_num (set by USI setoption) = " << thread_num << endl
-		//<< "  book_moves (set by USI setoption) = " << Options["BookMoves"] << endl
-		<< "  random_move_minply     = " << random_move_minply << endl
-		<< "  random_move_maxply     = " << random_move_maxply << endl
-		<< "  random_move_count      = " << random_move_count << endl
-		<< "  random_move_like_apery = " << random_move_like_apery << endl
-		<< "  random_multi_pv        = " << random_multi_pv << endl
-		<< "  random_multi_pv_diff   = " << random_multi_pv_diff << endl
-		<< "  random_multi_pv_depth  = " << random_multi_pv_depth << endl
-		<< "  write_minply           = " << write_minply << endl
-		<< "  write_maxply           = " << write_maxply << endl
-		<< "  output_file_name       = " << output_file_name << endl
-		<< "  use_eval_hash          = " << use_eval_hash << endl
-		<< "  save_every             = " << save_every << endl
-		<< "  random_file_name       = " << random_file_name << endl
-		<< "  write_out_draw_game_in_training_data_generation = " << write_out_draw_game_in_training_data_generation << endl
-		<< "  detect_draw_by_consecutive_low_score = " << detect_draw_by_consecutive_low_score << endl
-		<< "  detect_draw_by_insufficient_mating_material = " << detect_draw_by_insufficient_mating_material << endl;
-
-	// Show if the training data generator uses NNUE.
-	Eval::verify_NNUE();
-
-	// Create and execute threads as many as Options["Threads"].
-	{
-		SfenWriter sw(output_file_name, thread_num);
-		sw.save_every = save_every;
-
-		MultiThinkGenSfen multi_think(search_depth, search_depth2, sw);
-		multi_think.nodes = nodes;
-		multi_think.set_loop_max(loop_max);
-		multi_think.eval_limit = eval_limit;
-		multi_think.random_move_minply = random_move_minply;
-		multi_think.random_move_maxply = random_move_maxply;
-		multi_think.random_move_count = random_move_count;
-		multi_think.random_move_like_apery = random_move_like_apery;
-		multi_think.random_multi_pv = random_multi_pv;
-		multi_think.random_multi_pv_diff = random_multi_pv_diff;
-		multi_think.random_multi_pv_depth = random_multi_pv_depth;
-		multi_think.write_minply = write_minply;
-		multi_think.write_maxply = write_maxply;
-		multi_think.start_file_write_worker();
-		multi_think.go_think();
-
-		// Since we are joining with the destructor of SfenWriter, please give a message that it has finished after the join
-		// Enclose this in a block because it should be displayed.
-	}
-
-	std::cout << "gensfen finished." << endl;
-
-#if defined(USE_GLOBAL_OPTIONS)
-	// Restore Global Options.
-	GlobalOptions = oldGlobalOptions;
-#endif
-
-}
-
-// -----------------------------------
-// command to learn from the generated game (learn)
-// -----------------------------------
-
-// ordinary sigmoid function
-double sigmoid(double x)
-{
-	return 1.0 / (1.0 + std::exp(-x));
-}
-
-// A function that converts the evaluation value to the winning rate [0,1]
-double winning_percentage(double value)
-{
-	// 1/(1+10^(-Eval/4))
-	// = 1/(1+e^(-Eval/4*ln(10))
-	// = sigmoid(Eval/4*ln(10))
-	return sigmoid(value * winning_probability_coefficient);
-}
-
-// A function that converts the evaluation value to the winning rate [0,1]
-double winning_percentage_wdl(double value, int ply)
-{
-	double wdl_w = UCI::win_rate_model_double( value, ply);
-	double wdl_l = UCI::win_rate_model_double(-value, ply);
-	double wdl_d = 1000.0 - wdl_w - wdl_l;
-
-	return (wdl_w + wdl_d / 2.0) / 1000.0;
-}
-
-// A function that converts the evaluation value to the winning rate [0,1]
-double winning_percentage(double value, int ply)
-{
-	if (use_wdl) {
-		return winning_percentage_wdl(value, ply);
-	}
-	else {
-		return winning_percentage(value);
-	}
-}
-
-double calc_cross_entropy_of_winning_percentage(double deep_win_rate, double shallow_eval, int ply)
-{
-	double p = deep_win_rate;
-	double q = winning_percentage(shallow_eval, ply);
-	return -p * std::log(q) - (1 - p) * std::log(1 - q);
-}
-
-double calc_d_cross_entropy_of_winning_percentage(double deep_win_rate, double shallow_eval, int ply)
-{
-	constexpr double epsilon = 0.000001;
-	double y1 = calc_cross_entropy_of_winning_percentage(deep_win_rate, shallow_eval          , ply);
-	double y2 = calc_cross_entropy_of_winning_percentage(deep_win_rate, shallow_eval + epsilon, ply);
-
-	// Divide by the winning_probability_coefficient to match scale with the sigmoidal win rate
-	return ((y2 - y1) / epsilon) / winning_probability_coefficient;
-}
-
-double dsigmoid(double x)
-{
-	// Sigmoid function
-	// f(x) = 1/(1+exp(-x))
-	// the first derivative is
-	// f'(x) = df/dx = f(x)・{ 1-f(x)}
-	// becomes
-
-	return sigmoid(x) * (1.0 - sigmoid(x));
-}
-
-// When the objective function is the sum of squares of the difference in winning percentage
+    static bool use_draw_games_in_training = false;
+    static bool use_draw_games_in_validation = false;
+    static bool skip_duplicated_positions_in_training = true;
+    // 1.0 / PawnValueEg / 4.0 * log(10.0)
+    static double winning_probability_coefficient = 0.00276753015984861260098316280611;
+    // Score scale factors.  ex) If we set src_score_min_value = 0.0,
+    // src_score_max_value = 1.0, dest_score_min_value = 0.0,
+    // dest_score_max_value = 10000.0, [0.0, 1.0] will be scaled to [0, 10000].
+    static double src_score_min_value = 0.0;
+    static double src_score_max_value = 1.0;
+    static double dest_score_min_value = 0.0;
+    static double dest_score_max_value = 1.0;
+    // Assume teacher signals are the scores of deep searches, and convert them into winning
+    // probabilities in the trainer. Sometimes we want to use the winning probabilities in the training
+    // data directly. In those cases, we set false to this variable.
+    static bool convert_teacher_signal_to_winning_probability = true;
+    // Use raw NNUE eval value in the Eval::evaluate(). If hybrid eval is enabled, training data
+    // generation and training don't work well.
+    // https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
+    static bool use_raw_nnue_eval = true;
+    // Using WDL with win rate model instead of sigmoid
+    static bool use_wdl = false;
+
+    // -----------------------------------
+    // command to learn from the generated game (learn)
+    // -----------------------------------
+
+    // ordinary sigmoid function
+    double sigmoid(double x)
+    {
+        return 1.0 / (1.0 + std::exp(-x));
+    }
+
+    // A function that converts the evaluation value to the winning rate [0,1]
+    double winning_percentage(double value)
+    {
+        // 1/(1+10^(-Eval/4))
+        // = 1/(1+e^(-Eval/4*ln(10))
+        // = sigmoid(Eval/4*ln(10))
+        return sigmoid(value * winning_probability_coefficient);
+    }
+
+    // A function that converts the evaluation value to the winning rate [0,1]
+    double winning_percentage_wdl(double value, int ply)
+    {
+        double wdl_w = UCI::win_rate_model_double(value, ply);
+        double wdl_l = UCI::win_rate_model_double(-value, ply);
+        double wdl_d = 1000.0 - wdl_w - wdl_l;
+
+        return (wdl_w + wdl_d / 2.0) / 1000.0;
+    }
+
+    // A function that converts the evaluation value to the winning rate [0,1]
+    double winning_percentage(double value, int ply)
+    {
+        if (use_wdl) {
+            return winning_percentage_wdl(value, ply);
+        }
+        else {
+            return winning_percentage(value);
+        }
+    }
+
+    double calc_cross_entropy_of_winning_percentage(double deep_win_rate, double shallow_eval, int ply)
+    {
+        double p = deep_win_rate;
+        double q = winning_percentage(shallow_eval, ply);
+        return -p * std::log(q) - (1 - p) * std::log(1 - q);
+    }
+
+    double calc_d_cross_entropy_of_winning_percentage(double deep_win_rate, double shallow_eval, int ply)
+    {
+        constexpr double epsilon = 0.000001;
+        double y1 = calc_cross_entropy_of_winning_percentage(deep_win_rate, shallow_eval, ply);
+        double y2 = calc_cross_entropy_of_winning_percentage(deep_win_rate, shallow_eval + epsilon, ply);
+
+        // Divide by the winning_probability_coefficient to match scale with the sigmoidal win rate
+        return ((y2 - y1) / epsilon) / winning_probability_coefficient;
+    }
+
+    double dsigmoid(double x)
+    {
+        // Sigmoid function
+        // f(x) = 1/(1+exp(-x))
+        // the first derivative is
+        // f'(x) = df/dx = f(x)・{ 1-f(x)}
+        // becomes
+
+        return sigmoid(x) * (1.0 - sigmoid(x));
+    }
+
+    // When the objective function is the sum of squares of the difference in winning percentage
 #if defined (LOSS_FUNCTION_IS_WINNING_PERCENTAGE)
 // function to calculate the gradient
-double calc_grad(Value deep, Value shallow, PackedSfenValue& psv)
-{
-	// The square of the win rate difference minimizes it in the objective function.
-	// Objective function J = 1/2m Σ (win_rate(shallow)-win_rate(deep) )^2
-	// However, σ is a sigmoid function that converts the evaluation value into the difference in the winning percentage.
-	// m is the number of samples. shallow is the evaluation value for a shallow search (qsearch()). deep is the evaluation value for deep search.
-	// If W is the feature vector (parameter of the evaluation function) and Xi and Yi are teachers
-	// shallow = W*Xi // * is the Hadamard product, transposing W and meaning X
-	// f(Xi) = win_rate(W*Xi)
-	// If σ(i th deep) = Yi,
-	// J = m/2 Σ (f(Xi)-Yi )^2
-	// becomes a common expression.
-	// W is a vector, and if we write the jth element as Wj, from the chain rule
-	// ∂J/∂Wj = ∂J/∂f ・∂f/∂W ・∂W/∂Wj
-	// = 1/m Σ (f(Xi)-y) ・f'(Xi) ・ 1
+    double calc_grad(Value deep, Value shallow, PackedSfenValue& psv)
+    {
+        // The square of the win rate difference minimizes it in the objective function.
+        // Objective function J = 1/2m Σ (win_rate(shallow)-win_rate(deep) )^2
+        // However, σ is a sigmoid function that converts the evaluation value into the difference in the winning percentage.
+        // m is the number of samples. shallow is the evaluation value for a shallow search (qsearch()). deep is the evaluation value for deep search.
+        // If W is the feature vector (parameter of the evaluation function) and Xi and Yi are teachers
+        // shallow = W*Xi // * is the Hadamard product, transposing W and meaning X
+        // f(Xi) = win_rate(W*Xi)
+        // If σ(i th deep) = Yi,
+        // J = m/2 Σ (f(Xi)-Yi )^2
+        // becomes a common expression.
+        // W is a vector, and if we write the jth element as Wj, from the chain rule
+        // ∂J/∂Wj = ∂J/∂f ・∂f/∂W ・∂W/∂Wj
+        // = 1/m Σ (f(Xi)-y) ・f'(Xi) ・ 1
 
-	// 1/m will be multiplied later, but the contents of Σ can be retained in the array as the value of the gradient.
-	// f'(Xi) = win_rate'(shallow) = sigmoid'(shallow/600) = dsigmoid(shallow / 600) / 600
-	// This /600 at the end is adjusted by the learning rate, so do not write it..
-	// Also, the coefficient of 1/m is unnecessary if you use the update formula that has the automatic gradient adjustment function like Adam and AdaGrad.
-	// Therefore, it is not necessary to save it in memory.
+        // 1/m will be multiplied later, but the contents of Σ can be retained in the array as the value of the gradient.
+        // f'(Xi) = win_rate'(shallow) = sigmoid'(shallow/600) = dsigmoid(shallow / 600) / 600
+        // This /600 at the end is adjusted by the learning rate, so do not write it..
+        // Also, the coefficient of 1/m is unnecessary if you use the update formula that has the automatic gradient adjustment function like Adam and AdaGrad.
+        // Therefore, it is not necessary to save it in memory.
 
-	double p = winning_percentage(deep);
-	double q = winning_percentage(shallow);
-	return (q - p) * dsigmoid(double(shallow) / 600.0);
-}
+        double p = winning_percentage(deep);
+        double q = winning_percentage(shallow);
+        return (q - p) * dsigmoid(double(shallow) / 600.0);
+    }
 #endif
 
 #if defined (LOSS_FUNCTION_IS_CROSS_ENTOROPY)
-double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
-{
-	// Objective function with cross entropy
+    double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
+    {
+        // Objective function with cross entropy
 
-	// For the concept and nature of cross entropy,
-	// http://nnadl-ja.github.io/nnadl_site_ja/chap3.html#the_cross-entropy_cost_function
-	// http://postd.cc/visual-information-theory-3/
-	// Refer to etc.
+        // For the concept and nature of cross entropy,
+        // http://nnadl-ja.github.io/nnadl_site_ja/chap3.html#the_cross-entropy_cost_function
+        // http://postd.cc/visual-information-theory-3/
+        // Refer to etc.
 
-	// Objective function design)
-	// We want to make the distribution of p closer to the distribution of q → Think of it as the problem of minimizing the cross entropy between the probability distributions of p and q.
-	// J = H(p,q) =-Σ p(x) log(q(x)) = -p log q-(1-p) log(1-q)
-	// x
+        // Objective function design)
+        // We want to make the distribution of p closer to the distribution of q → Think of it as the problem of minimizing the cross entropy between the probability distributions of p and q.
+        // J = H(p,q) =-Σ p(x) log(q(x)) = -p log q-(1-p) log(1-q)
+        // x
 
-	// p is a constant and q is a Wi function (q = σ(W・Xi) ).
-	// ∂J/∂Wi = -p・q'/q-(1-p)(1-q)'/(1-q)
-	// = ...
-	// = q-p.
+        // p is a constant and q is a Wi function (q = σ(W・Xi) ).
+        // ∂J/∂Wi = -p・q'/q-(1-p)(1-q)'/(1-q)
+        // = ...
+        // = q-p.
 
-	double p = winning_percentage(deep);
-	double q = winning_percentage(shallow);
+        double p = winning_percentage(deep);
+        double q = winning_percentage(shallow);
 
-	return q - p;
-}
+        return q - p;
+    }
 #endif
 
 #if defined ( LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE )
-double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
-{
-	// Version that does not pass the winning percentage function
-	// This, unless EVAL_LIMIT is set low, trying to match the evaluation value with the shape of the end stage
-	// eval may exceed the range of eval.
-	return shallow - deep;
-}
+    double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
+    {
+        // Version that does not pass the winning percentage function
+        // This, unless EVAL_LIMIT is set low, trying to match the evaluation value with the shape of the end stage
+        // eval may exceed the range of eval.
+        return shallow - deep;
+    }
 #endif
 
 #if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
 
-// A constant used in elmo (WCSC27). Adjustment required.
-// Since elmo does not internally divide the expression, the value is different.
-// You can set this value with the learn command.
-// 0.33 is equivalent to the constant (0.5) used in elmo (WCSC27)
-double ELMO_LAMBDA = 0.33;
-double ELMO_LAMBDA2 = 0.33;
-double ELMO_LAMBDA_LIMIT = 32000;
+    // A constant used in elmo (WCSC27). Adjustment required.
+    // Since elmo does not internally divide the expression, the value is different.
+    // You can set this value with the learn command.
+    // 0.33 is equivalent to the constant (0.5) used in elmo (WCSC27)
+    double ELMO_LAMBDA = 0.33;
+    double ELMO_LAMBDA2 = 0.33;
+    double ELMO_LAMBDA_LIMIT = 32000;
 
-double calc_grad(Value teacher_signal, Value shallow , const PackedSfenValue& psv)
-{
-	// elmo (WCSC27) method
-	// Correct with the actual game wins and losses.
+    double calc_grad(Value teacher_signal, Value shallow, const PackedSfenValue& psv)
+    {
+        // elmo (WCSC27) method
+        // Correct with the actual game wins and losses.
 
-	// Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
-	double scaled_teacher_signal = teacher_signal;
-	// Normalize to [0.0, 1.0].
-	scaled_teacher_signal = (scaled_teacher_signal - src_score_min_value) / (src_score_max_value - src_score_min_value);
-	// Scale to [dest_score_min_value, dest_score_max_value].
-	scaled_teacher_signal = scaled_teacher_signal * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
+        // Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
+        double scaled_teacher_signal = teacher_signal;
+        // Normalize to [0.0, 1.0].
+        scaled_teacher_signal = (scaled_teacher_signal - src_score_min_value) / (src_score_max_value - src_score_min_value);
+        // Scale to [dest_score_min_value, dest_score_max_value].
+        scaled_teacher_signal = scaled_teacher_signal * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
 
-	const double q = winning_percentage(shallow, psv.gamePly);
-	// Teacher winning probability.
-	double p = scaled_teacher_signal;
-	if (convert_teacher_signal_to_winning_probability) {
-		p = winning_percentage(scaled_teacher_signal, psv.gamePly);
-	}
+        const double q = winning_percentage(shallow, psv.gamePly);
+        // Teacher winning probability.
+        double p = scaled_teacher_signal;
+        if (convert_teacher_signal_to_winning_probability) {
+            p = winning_percentage(scaled_teacher_signal, psv.gamePly);
+        }
 
-	// Use 1 as the correction term if the expected win rate is 1, 0 if you lose, and 0.5 if you draw.
-	// game_result = 1,0,-1 so add 1 and divide by 2.
-	const double t = double(psv.game_result + 1) / 2;
+        // Use 1 as the correction term if the expected win rate is 1, 0 if you lose, and 0.5 if you draw.
+        // game_result = 1,0,-1 so add 1 and divide by 2.
+        const double t = double(psv.game_result + 1) / 2;
 
-	// If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT, apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
-	const double lambda = (abs(teacher_signal) >= ELMO_LAMBDA_LIMIT) ? ELMO_LAMBDA2 : ELMO_LAMBDA;
+        // If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT, apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
+        const double lambda = (abs(teacher_signal) >= ELMO_LAMBDA_LIMIT) ? ELMO_LAMBDA2 : ELMO_LAMBDA;
 
-	double grad;
-	if (use_wdl) {
-		double dce_p = calc_d_cross_entropy_of_winning_percentage(p, shallow, psv.gamePly);
-		double dce_t = calc_d_cross_entropy_of_winning_percentage(t, shallow, psv.gamePly);
-		grad = lambda * dce_p + (1.0 - lambda) * dce_t;
-	}
-	else {
-		// Use the actual win rate as a correction term.
-		// This is the idea of ​​elmo (WCSC27), modern O-parts.
-		grad = lambda * (q - p) + (1.0 - lambda) * (q - t);
-	}
+        double grad;
+        if (use_wdl) {
+            double dce_p = calc_d_cross_entropy_of_winning_percentage(p, shallow, psv.gamePly);
+            double dce_t = calc_d_cross_entropy_of_winning_percentage(t, shallow, psv.gamePly);
+            grad = lambda * dce_p + (1.0 - lambda) * dce_t;
+        }
+        else {
+            // Use the actual win rate as a correction term.
+            // This is the idea of ​​elmo (WCSC27), modern O-parts.
+            grad = lambda * (q - p) + (1.0 - lambda) * (q - t);
+        }
 
-	return grad;
-}
+        return grad;
+    }
 
-// Calculate cross entropy during learning
-// The individual cross entropy of the win/loss term and win rate term of the elmo expression is returned to the arguments cross_entropy_eval and cross_entropy_win.
-void calc_cross_entropy(Value teacher_signal, Value shallow, const PackedSfenValue& psv,
-	double& cross_entropy_eval, double& cross_entropy_win, double& cross_entropy,
-	double& entropy_eval, double& entropy_win, double& entropy)
-{
-	// Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
-	double scaled_teacher_signal = teacher_signal;
-	// Normalize to [0.0, 1.0].
-	scaled_teacher_signal = (scaled_teacher_signal - src_score_min_value) / (src_score_max_value - src_score_min_value);
-	// Scale to [dest_score_min_value, dest_score_max_value].
-	scaled_teacher_signal = scaled_teacher_signal * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
+    // Calculate cross entropy during learning
+    // The individual cross entropy of the win/loss term and win rate term of the elmo expression is returned to the arguments cross_entropy_eval and cross_entropy_win.
+    void calc_cross_entropy(Value teacher_signal, Value shallow, const PackedSfenValue& psv,
+        double& cross_entropy_eval, double& cross_entropy_win, double& cross_entropy,
+        double& entropy_eval, double& entropy_win, double& entropy)
+    {
+        // Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
+        double scaled_teacher_signal = teacher_signal;
+        // Normalize to [0.0, 1.0].
+        scaled_teacher_signal = (scaled_teacher_signal - src_score_min_value) / (src_score_max_value - src_score_min_value);
+        // Scale to [dest_score_min_value, dest_score_max_value].
+        scaled_teacher_signal = scaled_teacher_signal * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
 
-	// Teacher winning probability.
-	double p = scaled_teacher_signal;
-	if (convert_teacher_signal_to_winning_probability) {
-		p = winning_percentage(scaled_teacher_signal);
-	}
-	const double q /* eval_winrate    */ = winning_percentage(shallow);
-	const double t = double(psv.game_result + 1) / 2;
+        // Teacher winning probability.
+        double p = scaled_teacher_signal;
+        if (convert_teacher_signal_to_winning_probability) {
+            p = winning_percentage(scaled_teacher_signal);
+        }
+        const double q /* eval_winrate    */ = winning_percentage(shallow);
+        const double t = double(psv.game_result + 1) / 2;
 
-	constexpr double epsilon = 0.000001;
+        constexpr double epsilon = 0.000001;
 
-	// If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT, apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
-	const double lambda = (abs(teacher_signal) >= ELMO_LAMBDA_LIMIT) ? ELMO_LAMBDA2 : ELMO_LAMBDA;
+        // If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT, apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
+        const double lambda = (abs(teacher_signal) >= ELMO_LAMBDA_LIMIT) ? ELMO_LAMBDA2 : ELMO_LAMBDA;
 
-	const double m = (1.0 - lambda) * t + lambda * p;
+        const double m = (1.0 - lambda) * t + lambda * p;
 
-	cross_entropy_eval =
-		(-p * std::log(q + epsilon) - (1.0 - p) * std::log(1.0 - q + epsilon));
-	cross_entropy_win =
-		(-t * std::log(q + epsilon) - (1.0 - t) * std::log(1.0 - q + epsilon));
-	entropy_eval =
-		(-p * std::log(p + epsilon) - (1.0 - p) * std::log(1.0 - p + epsilon));
-	entropy_win =
-		(-t * std::log(t + epsilon) - (1.0 - t) * std::log(1.0 - t + epsilon));
+        cross_entropy_eval =
+            (-p * std::log(q + epsilon) - (1.0 - p) * std::log(1.0 - q + epsilon));
+        cross_entropy_win =
+            (-t * std::log(q + epsilon) - (1.0 - t) * std::log(1.0 - q + epsilon));
+        entropy_eval =
+            (-p * std::log(p + epsilon) - (1.0 - p) * std::log(1.0 - p + epsilon));
+        entropy_win =
+            (-t * std::log(t + epsilon) - (1.0 - t) * std::log(1.0 - t + epsilon));
 
-	cross_entropy =
-		(-m * std::log(q + epsilon) - (1.0 - m) * std::log(1.0 - q + epsilon));
-	entropy =
-		(-m * std::log(m + epsilon) - (1.0 - m) * std::log(1.0 - m + epsilon));
-}
+        cross_entropy =
+            (-m * std::log(q + epsilon) - (1.0 - m) * std::log(1.0 - q + epsilon));
+        entropy =
+            (-m * std::log(m + epsilon) - (1.0 - m) * std::log(1.0 - m + epsilon));
+    }
 
 #endif
 
 
-// Other variations may be prepared as the objective function..
-
-
-double calc_grad(Value shallow, const PackedSfenValue& psv) {
-	return calc_grad((Value)psv.score, shallow, psv);
-}
-
-// Sfen reader
-struct SfenReader
-{
-	// Do not use std::random_device().  Because it always the same integers on MinGW.
-	SfenReader(int thread_num) : prng(std::chrono::system_clock::now().time_since_epoch().count())
-	{
-		packed_sfens.resize(thread_num);
-		total_read = 0;
-		total_done = 0;
-		last_done = 0;
-		next_update_weights = 0;
-		save_count = 0;
-		end_of_files = false;
-		no_shuffle = false;
-		stop_flag = false;
-
-		hash.resize(READ_SFEN_HASH_SIZE);
-	}
-
-	~SfenReader()
-	{
-		if (file_worker_thread.joinable())
-			file_worker_thread.join();
-
-		for (auto p : packed_sfens)
-			delete p;
-		for (auto p : packed_sfens_pool)
-			delete p;
-	}
-
-	// number of phases used for calculation such as mse
-	// mini-batch size = 1M is standard, so 0.2% of that should be negligible in terms of time.
-	//Since search() is performed with depth = 1 in calculation of move match rate, simple comparison is not possible...
-	const uint64_t sfen_for_mse_size = 2000;
-
-	// Load the phase for calculation such as mse.
-	void read_for_mse()
-	{
-		auto th = Threads.main();
-		Position& pos = th->rootPos;
-		for (uint64_t i = 0; i < sfen_for_mse_size; ++i)
-		{
-			PackedSfenValue ps;
-			if (!read_to_thread_buffer(0, ps))
-			{
-				cout << "Error! read packed sfen , failed." << endl;
-				break;
-			}
-			sfen_for_mse.push_back(ps);
-
-			// Get the hash key.
-			StateInfo si;
-			pos.set_from_packed_sfen(ps.sfen,&si,th);
-			sfen_for_mse_hash.insert(pos.key());
-		}
-	}
-
-	void read_validation_set(const string file_name, int eval_limit)
-	{
-		ifstream fs(file_name, ios::binary);
-
-		while (fs)
-		{
-			PackedSfenValue p;
-			if (fs.read((char*)&p, sizeof(PackedSfenValue)))
-			{
-				if (eval_limit < abs(p.score))
-					continue;
-				if (!use_draw_games_in_validation && p.game_result == 0)
-					continue;
-				sfen_for_mse.push_back(p);
-			} else {
-				break;
-			}
-		}
-	}
-
-	// Number of phases buffered by each thread 0.1M phases. 4M phase at 40HT
-	const size_t THREAD_BUFFER_SIZE = 10 * 1000;
-
-	// Buffer for reading files (If this is made larger, the shuffle becomes larger and the phases may vary.
-	// If it is too large, the memory consumption will increase.
-	// SFEN_READ_SIZE is a multiple of THREAD_BUFFER_SIZE.
-	const size_t SFEN_READ_SIZE = LEARN_SFEN_READ_SIZE;
-
-	// [ASYNC] Thread returns one aspect. Otherwise returns false.
-	bool read_to_thread_buffer(size_t thread_id, PackedSfenValue& ps)
-	{
-		// If there are any positions left in the thread buffer, retrieve one and return it.
-		auto& thread_ps = packed_sfens[thread_id];
-
-		// Fill the read buffer if there is no remaining buffer, but if it doesn't even exist, finish.
-		if ((thread_ps == nullptr || thread_ps->size() == 0) // If the buffer is empty, fill it.
-			&& !read_to_thread_buffer_impl(thread_id))
-			return false;
-
-		// read_to_thread_buffer_impl() returned true,
-		// Since the filling of the thread buffer with the phase has been completed successfully
-		// thread_ps->rbegin() is alive.
-
-		ps = *(thread_ps->rbegin());
-		thread_ps->pop_back();
-
-		// If you've run out of buffers, call delete yourself to free this buffer.
-		if (thread_ps->size() == 0)
-		{
-
-			delete thread_ps;
-			thread_ps = nullptr;
-		}
-
-		return true;
-	}
-
-	// [ASYNC] Read some aspects into thread buffer.
-	bool read_to_thread_buffer_impl(size_t thread_id)
-	{
-		while (true)
-		{
-			{
-				std::unique_lock<std::mutex> lk(mutex);
-				// If you can fill from the file buffer, that's fine.
-				if (packed_sfens_pool.size() != 0)
-				{
-					// It seems that filling is possible, so fill and finish.
-
-					packed_sfens[thread_id] = packed_sfens_pool.front();
-					packed_sfens_pool.pop_front();
-
-					total_read += THREAD_BUFFER_SIZE;
-
-					return true;
-				}
-			}
-
-			// The file to read is already gone. No more use.
-			if (end_of_files)
-				return false;
-
-			// Waiting for file worker to fill packed_sfens_pool.
-			// The mutex isn't locked, so it should fill up soon.
-			sleep(1);
-		}
-
-	}
-
-	// Start a thread that loads the phase file in the background.
-	void start_file_read_worker()
-	{
-		file_worker_thread = std::thread([&] { this->file_read_worker(); });
-	}
-
-	// for file read-only threads
-	void file_read_worker()
-	{
-		auto open_next_file = [&]()
-		{
-			if (fs.is_open())
-				fs.close();
-
-			// no more
-			if (filenames.size() == 0)
-				return false;
-
-			// Get the next file name.
-			string filename = *filenames.rbegin();
-			filenames.pop_back();
-
-			fs.open(filename, ios::in | ios::binary);
-			cout << "open filename = " << filename << endl;
-			assert(fs);
-
-			return true;
-		};
-
-		while (true)
-		{
-			// Wait for the buffer to run out.
-			// This size() is read only, so you don't need to lock it.
-			while (!stop_flag && packed_sfens_pool.size() >= SFEN_READ_SIZE / THREAD_BUFFER_SIZE)
-				sleep(100);
-			if (stop_flag)
-				return;
-
-			PSVector sfens;
-			sfens.reserve(SFEN_READ_SIZE);
-
-			// Read from the file into the file buffer.
-			while (sfens.size() < SFEN_READ_SIZE)
-			{
-				PackedSfenValue p;
-				if (fs.read((char*)&p, sizeof(PackedSfenValue)))
-				{
-					sfens.push_back(p);
-				} else
-				{
-					// read failure
-					if (!open_next_file())
-					{
-						// There was no next file. Abon.
-						cout << "..end of files." << endl;
-						end_of_files = true;
-						return;
-					}
-				}
-			}
-
-			// Shuffle the read phase data.
-			// random shuffle by Fisher-Yates algorithm
-
-			if (!no_shuffle)
-			{
-				auto size = sfens.size();
-				for (size_t i = 0; i < size; ++i)
-					swap(sfens[i], sfens[(size_t)(prng.rand((uint64_t)size - i) + i)]);
-			}
-
-			// Divide this by THREAD_BUFFER_SIZE. There should be size pieces.
-			// SFEN_READ_SIZE shall be a multiple of THREAD_BUFFER_SIZE.
-			assert((SFEN_READ_SIZE % THREAD_BUFFER_SIZE)==0);
-
-			auto size = size_t(SFEN_READ_SIZE / THREAD_BUFFER_SIZE);
-			std::vector<PSVector*> ptrs;
-			ptrs.reserve(size);
-
-			for (size_t i = 0; i < size; ++i)
-			{
-				// Delete this pointer on the receiving side.
-				PSVector* ptr = new PSVector();
-				ptr->resize(THREAD_BUFFER_SIZE);
-				memcpy(&((*ptr)[0]), &sfens[i * THREAD_BUFFER_SIZE], sizeof(PackedSfenValue) * THREAD_BUFFER_SIZE);
-
-				ptrs.push_back(ptr);
-			}
-
-			// Since sfens is ready, look at the occasion and copy
-			{
-				std::unique_lock<std::mutex> lk(mutex);
-
-				// You can ignore this time because you just copy the pointer...
-				// The mutex lock is required because the contents of packed_sfens_pool are changed.
-
-				for (size_t i = 0; i < size; ++i)
-					packed_sfens_pool.push_back(ptrs[i]);
-			}
-		}
-	}
-
-	// sfen files
-	vector<string> filenames;
-
-	// number of phases read (file to memory buffer)
-	atomic<uint64_t> total_read;
-
-	// number of processed phases
-	atomic<uint64_t> total_done;
-
-	// number of cases processed so far
-	uint64_t last_done;
-
-	// If total_read exceeds this value, update_weights() and calculate mse.
-	uint64_t next_update_weights;
-
-	uint64_t save_count;
-
-	// Do not shuffle when reading the phase.
-	bool no_shuffle;
-
-	bool stop_flag;
-
-	// Determine if it is a phase for calculating rmse.
-	// (The computational aspects of rmse should not be used for learning.)
-	bool is_for_rmse(Key key) const
-	{
-			return sfen_for_mse_hash.count(key) != 0;
-	}
-
-	// hash to limit the reading of the same situation
-	// Is there too many 64 million phases? Or Not really..
-	// It must be 2**N because it will be used as the mask to calculate hash_index.
-	static const uint64_t READ_SFEN_HASH_SIZE = 64 * 1024 * 1024;
-	vector<Key> hash; // 64MB*8 = 512MB
-
-	// test phase for mse calculation
-	PSVector sfen_for_mse;
-
-protected:
-
-	// worker thread reading file in background
-	std::thread file_worker_thread;
-
-	// Random number to shuffle when reading the phase
-	PRNG prng;
-
-	// Did you read the files and reached the end?
-	atomic<bool> end_of_files;
+    // Other variations may be prepared as the objective function..
+
+
+    double calc_grad(Value shallow, const PackedSfenValue& psv) {
+        return calc_grad((Value)psv.score, shallow, psv);
+    }
+
+    // Sfen reader
+    struct SfenReader
+    {
+        // number of phases used for calculation such as mse
+        // mini-batch size = 1M is standard, so 0.2% of that should be negligible in terms of time.
+        //Since search() is performed with depth = 1 in calculation of move match rate, simple comparison is not possible...
+        static constexpr uint64_t sfen_for_mse_size = 2000;
+
+        // Number of phases buffered by each thread 0.1M phases. 4M phase at 40HT
+        static constexpr size_t THREAD_BUFFER_SIZE = 10 * 1000;
+
+        // Buffer for reading files (If this is made larger, the shuffle becomes larger and the phases may vary.
+        // If it is too large, the memory consumption will increase.
+        // SFEN_READ_SIZE is a multiple of THREAD_BUFFER_SIZE.
+        static constexpr const size_t SFEN_READ_SIZE = LEARN_SFEN_READ_SIZE;
+
+        // Do not use std::random_device().  Because it always the same integers on MinGW.
+        SfenReader(int thread_num) : prng(std::chrono::system_clock::now().time_since_epoch().count())
+        {
+            packed_sfens.resize(thread_num);
+            total_read = 0;
+            total_done = 0;
+            last_done = 0;
+            next_update_weights = 0;
+            save_count = 0;
+            end_of_files = false;
+            no_shuffle = false;
+            stop_flag = false;
+
+            hash.resize(READ_SFEN_HASH_SIZE);
+        }
+
+        ~SfenReader()
+        {
+            if (file_worker_thread.joinable())
+                file_worker_thread.join();
+        }
+
+        // Load the phase for calculation such as mse.
+        void read_for_mse()
+        {
+            auto th = Threads.main();
+            Position& pos = th->rootPos;
+            for (uint64_t i = 0; i < sfen_for_mse_size; ++i)
+            {
+                PackedSfenValue ps;
+                if (!read_to_thread_buffer(0, ps))
+                {
+                    cout << "Error! read packed sfen , failed." << endl;
+                    break;
+                }
+                sfen_for_mse.push_back(ps);
+
+                // Get the hash key.
+                StateInfo si;
+                pos.set_from_packed_sfen(ps.sfen, &si, th);
+                sfen_for_mse_hash.insert(pos.key());
+            }
+        }
+
+        void read_validation_set(const string& file_name, int eval_limit)
+        {
+            ifstream input(file_name, ios::binary);
+
+            while (input)
+            {
+                PackedSfenValue p;
+                if (input.read(reinterpret_cast<char*>(&p), sizeof(PackedSfenValue)))
+                {
+                    if (eval_limit < abs(p.score))
+                        continue;
+                    if (!use_draw_games_in_validation && p.game_result == 0)
+                        continue;
+                    sfen_for_mse.push_back(p);
+                }
+                else 
+                {
+                    break;
+                }
+            }
+        }
+
+        // [ASYNC] Thread returns one aspect. Otherwise returns false.
+        bool read_to_thread_buffer(size_t thread_id, PackedSfenValue& ps)
+        {
+            // If there are any positions left in the thread buffer, retrieve one and return it.
+            auto& thread_ps = packed_sfens[thread_id];
+
+            // Fill the read buffer if there is no remaining buffer, but if it doesn't even exist, finish.
+            if ((thread_ps == nullptr || thread_ps->size() == 0) // If the buffer is empty, fill it.
+                && !read_to_thread_buffer_impl(thread_id))
+                return false;
+
+            // read_to_thread_buffer_impl() returned true,
+            // Since the filling of the thread buffer with the phase has been completed successfully
+            // thread_ps->rbegin() is alive.
+
+            ps = *(thread_ps->rbegin());
+            thread_ps->pop_back();
+
+            // If you've run out of buffers, call delete yourself to free this buffer.
+            if (thread_ps->size() == 0)
+            {
+                thread_ps.reset();
+            }
+
+            return true;
+        }
+
+        // [ASYNC] Read some aspects into thread buffer.
+        bool read_to_thread_buffer_impl(size_t thread_id)
+        {
+            while (true)
+            {
+                {
+                    std::unique_lock<std::mutex> lk(mutex);
+                    // If you can fill from the file buffer, that's fine.
+                    if (packed_sfens_pool.size() != 0)
+                    {
+                        // It seems that filling is possible, so fill and finish.
+
+                        packed_sfens[thread_id] = std::move(packed_sfens_pool.front());
+                        packed_sfens_pool.pop_front();
+
+                        total_read += THREAD_BUFFER_SIZE;
+
+                        return true;
+                    }
+                }
+
+                // The file to read is already gone. No more use.
+                if (end_of_files)
+                    return false;
+
+                // Waiting for file worker to fill packed_sfens_pool.
+                // The mutex isn't locked, so it should fill up soon.
+                sleep(1);
+            }
+
+        }
+
+        // Start a thread that loads the phase file in the background.
+        void start_file_read_worker()
+        {
+            file_worker_thread = std::thread([&] { this->file_read_worker(); });
+        }
+
+        // for file read-only threads
+        void file_read_worker()
+        {
+            auto open_next_file = [&]()
+            {
+                if (fs.is_open())
+                    fs.close();
+
+                // no more
+                if (filenames.size() == 0)
+                    return false;
+
+                // Get the next file name.
+                string filename = *filenames.rbegin();
+                filenames.pop_back();
+
+                fs.open(filename, ios::in | ios::binary);
+                cout << "open filename = " << filename << endl;
+                assert(fs);
+
+                return true;
+            };
+
+            while (true)
+            {
+                // Wait for the buffer to run out.
+                // This size() is read only, so you don't need to lock it.
+                while (!stop_flag && packed_sfens_pool.size() >= SFEN_READ_SIZE / THREAD_BUFFER_SIZE)
+                    sleep(100);
+                if (stop_flag)
+                    return;
+
+                PSVector sfens;
+                sfens.reserve(SFEN_READ_SIZE);
+
+                // Read from the file into the file buffer.
+                while (sfens.size() < SFEN_READ_SIZE)
+                {
+                    PackedSfenValue p;
+                    if (fs.read(reinterpret_cast<char*>(&p), sizeof(PackedSfenValue)))
+                    {
+                        sfens.push_back(p);
+                    }
+                    else
+                    {
+                        // read failure
+                        if (!open_next_file())
+                        {
+                            // There was no next file. Abon.
+                            cout << "..end of files." << endl;
+                            end_of_files = true;
+                            return;
+                        }
+                    }
+                }
+
+                // Shuffle the read phase data.
+                // random shuffle by Fisher-Yates algorithm
+
+                if (!no_shuffle)
+                {
+                    auto size = sfens.size();
+                    for (size_t i = 0; i < size; ++i)
+                        swap(sfens[i], sfens[(size_t)(prng.rand((uint64_t)size - i) + i)]);
+                }
+
+                // Divide this by THREAD_BUFFER_SIZE. There should be size pieces.
+                // SFEN_READ_SIZE shall be a multiple of THREAD_BUFFER_SIZE.
+                assert((SFEN_READ_SIZE % THREAD_BUFFER_SIZE) == 0);
+
+                auto size = size_t(SFEN_READ_SIZE / THREAD_BUFFER_SIZE);
+                std::vector<std::unique_ptr<PSVector>> buffers;
+                buffers.reserve(size);
+
+                for (size_t i = 0; i < size; ++i)
+                {
+                    // Delete this pointer on the receiving side.
+                    auto buf = std::make_unique<PSVector>();
+                    buf->resize(THREAD_BUFFER_SIZE);
+                    memcpy(buf->data(), &sfens[i * THREAD_BUFFER_SIZE], sizeof(PackedSfenValue) * THREAD_BUFFER_SIZE);
+
+                    buffers.emplace_back(std::move(buf));
+                }
+
+                // Since sfens is ready, look at the occasion and copy
+                {
+                    std::unique_lock<std::mutex> lk(mutex);
+
+                    // You can ignore this time because you just copy the pointer...
+                    // The mutex lock is required because the contents of packed_sfens_pool are changed.
+
+                    for (auto& buf : buffers)
+                        packed_sfens_pool.emplace_back(std::move(buf));
+                }
+            }
+        }
+
+        // sfen files
+        vector<string> filenames;
+
+        // number of phases read (file to memory buffer)
+        atomic<uint64_t> total_read;
+
+        // number of processed phases
+        atomic<uint64_t> total_done;
+
+        // number of cases processed so far
+        uint64_t last_done;
+
+        // If total_read exceeds this value, update_weights() and calculate mse.
+        uint64_t next_update_weights;
+
+        uint64_t save_count;
+
+        // Do not shuffle when reading the phase.
+        bool no_shuffle;
+
+        bool stop_flag;
+
+        // Determine if it is a phase for calculating rmse.
+        // (The computational aspects of rmse should not be used for learning.)
+        bool is_for_rmse(Key key) const
+        {
+            return sfen_for_mse_hash.count(key) != 0;
+        }
+
+        // hash to limit the reading of the same situation
+        // Is there too many 64 million phases? Or Not really..
+        // It must be 2**N because it will be used as the mask to calculate hash_index.
+        static const uint64_t READ_SFEN_HASH_SIZE = 64 * 1024 * 1024;
+        vector<Key> hash; // 64MB*8 = 512MB
+
+        // test phase for mse calculation
+        PSVector sfen_for_mse;
+
+    protected:
+
+        // worker thread reading file in background
+        std::thread file_worker_thread;
+
+        // Random number to shuffle when reading the phase
+        PRNG prng;
+
+        // Did you read the files and reached the end?
+        atomic<bool> end_of_files;
 
 
-	// handle of sfen file
-	std::fstream fs;
+        // handle of sfen file
+        std::fstream fs;
 
-	// sfen for each thread
-	// (When the thread is used up, the thread should call delete to release it.)
-	std::vector<PSVector*> packed_sfens;
+        // sfen for each thread
+        // (When the thread is used up, the thread should call delete to release it.)
+        std::vector<std::unique_ptr<PSVector>> packed_sfens;
 
-	// Mutex when accessing packed_sfens_pool
-	std::mutex mutex;
+        // Mutex when accessing packed_sfens_pool
+        std::mutex mutex;
 
-	// pool of sfen. The worker thread read from the file is added here.
-	// Each worker thread fills its own packed_sfens[thread_id] from here.
-	// * Lock and access the mutex.
-	std::list<PSVector*> packed_sfens_pool;
+        // pool of sfen. The worker thread read from the file is added here.
+        // Each worker thread fills its own packed_sfens[thread_id] from here.
+        // * Lock and access the mutex.
+        std::list<std::unique_ptr<PSVector>> packed_sfens_pool;
 
-	// Hold the hash key so that the mse calculation phase is not used for learning.
-	std::unordered_set<Key> sfen_for_mse_hash;
-};
+        // Hold the hash key so that the mse calculation phase is not used for learning.
+        std::unordered_set<Key> sfen_for_mse_hash;
+    };
 
-// Class to generate sfen with multiple threads
-struct LearnerThink: public MultiThink
-{
-	LearnerThink(SfenReader& sr_):sr(sr_),stop_flag(false), save_only_once(false)
-	{
+    // Class to generate sfen with multiple threads
+    struct LearnerThink : public MultiThink
+    {
+        LearnerThink(SfenReader& sr_) :sr(sr_), stop_flag(false), save_only_once(false)
+        {
 #if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
-		learn_sum_cross_entropy_eval = 0.0;
-		learn_sum_cross_entropy_win = 0.0;
-		learn_sum_cross_entropy = 0.0;
-		learn_sum_entropy_eval = 0.0;
-		learn_sum_entropy_win = 0.0;
-		learn_sum_entropy = 0.0;
+            learn_sum_cross_entropy_eval = 0.0;
+            learn_sum_cross_entropy_win = 0.0;
+            learn_sum_cross_entropy = 0.0;
+            learn_sum_entropy_eval = 0.0;
+            learn_sum_entropy_win = 0.0;
+            learn_sum_entropy = 0.0;
 #endif
 #if defined(EVAL_NNUE)
-		newbob_scale = 1.0;
-		newbob_decay = 1.0;
-		newbob_num_trials = 2;
-		best_loss = std::numeric_limits<double>::infinity();
-		latest_loss_sum = 0.0;
-		latest_loss_count = 0;
+            newbob_scale = 1.0;
+            newbob_decay = 1.0;
+            newbob_num_trials = 2;
+            best_loss = std::numeric_limits<double>::infinity();
+            latest_loss_sum = 0.0;
+            latest_loss_count = 0;
 #endif
-	}
+        }
 
-	virtual void thread_worker(size_t thread_id);
+        virtual void thread_worker(size_t thread_id);
 
-	// Start a thread that loads the phase file in the background.
-	void start_file_read_worker() { sr.start_file_read_worker(); }
+        // Start a thread that loads the phase file in the background.
+        void start_file_read_worker() { sr.start_file_read_worker(); }
 
-	// save merit function parameters to a file
-	bool save(bool is_final=false);
+        // save merit function parameters to a file
+        bool save(bool is_final = false);
 
-	// sfen reader
-	SfenReader& sr;
+        // sfen reader
+        SfenReader& sr;
 
-	// Learning iteration counter
-	uint64_t epoch = 0;
+        // Learning iteration counter
+        uint64_t epoch = 0;
 
-	// Mini batch size size. Be sure to set it on the side that uses this class.
-	uint64_t mini_batch_size = 1000*1000;
+        // Mini batch size size. Be sure to set it on the side that uses this class.
+        uint64_t mini_batch_size = 1000 * 1000;
 
-	bool stop_flag;
+        bool stop_flag;
 
-	// Discount rate
-	double discount_rate;
+        // Discount rate
+        double discount_rate;
 
-	// Option to exclude early stage from learning
-	int reduction_gameply;
+        // Option to exclude early stage from learning
+        int reduction_gameply;
 
-	// Option not to learn kk/kkp/kpp/kppp
-	std::array<bool,4> freeze;
+        // Option not to learn kk/kkp/kpp/kppp
+        std::array<bool, 4> freeze;
 
-	// If the absolute value of the evaluation value of the deep search of the teacher phase exceeds this value, discard the teacher phase.
-	int eval_limit;
+        // If the absolute value of the evaluation value of the deep search of the teacher phase exceeds this value, discard the teacher phase.
+        int eval_limit;
 
-	// Flag whether to dig a folder each time the evaluation function is saved.
-	// If true, do not dig the folder.
-	bool save_only_once;
+        // Flag whether to dig a folder each time the evaluation function is saved.
+        // If true, do not dig the folder.
+        bool save_only_once;
 
-	// --- loss calculation
+        // --- loss calculation
 
 #if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
-	// For calculation of learning data loss
-	atomic<double> learn_sum_cross_entropy_eval;
-	atomic<double> learn_sum_cross_entropy_win;
-	atomic<double> learn_sum_cross_entropy;
-	atomic<double> learn_sum_entropy_eval;
-	atomic<double> learn_sum_entropy_win;
-	atomic<double> learn_sum_entropy;
+    // For calculation of learning data loss
+        atomic<double> learn_sum_cross_entropy_eval;
+        atomic<double> learn_sum_cross_entropy_win;
+        atomic<double> learn_sum_cross_entropy;
+        atomic<double> learn_sum_entropy_eval;
+        atomic<double> learn_sum_entropy_win;
+        atomic<double> learn_sum_entropy;
 #endif
 
 #if defined(EVAL_NNUE)
-	shared_timed_mutex nn_mutex;
-	double newbob_scale;
-	double newbob_decay;
-	int newbob_num_trials;
-	double best_loss;
-	double latest_loss_sum;
-	uint64_t latest_loss_count;
-	std::string best_nn_directory;
+        shared_timed_mutex nn_mutex;
+        double newbob_scale;
+        double newbob_decay;
+        int newbob_num_trials;
+        double best_loss;
+        double latest_loss_sum;
+        uint64_t latest_loss_count;
+        std::string best_nn_directory;
 #endif
 
-	uint64_t eval_save_interval;
-	uint64_t loss_output_interval;
-	uint64_t mirror_percentage;
+        uint64_t eval_save_interval;
+        uint64_t loss_output_interval;
+        uint64_t mirror_percentage;
 
-	// Loss calculation.
-	// done: Number of phases targeted this time
-	void calc_loss(size_t thread_id , uint64_t done);
+        // Loss calculation.
+        // done: Number of phases targeted this time
+        void calc_loss(size_t thread_id, uint64_t done);
 
-	// Define the loss calculation in ↑ as a task and execute it
-	TaskDispatcher task_dispatcher;
-};
+        // Define the loss calculation in ↑ as a task and execute it
+        TaskDispatcher task_dispatcher;
+    };
 
-void LearnerThink::calc_loss(size_t thread_id, uint64_t done)
-{
-	// There is no point in hitting the replacement table, so at this timing the generation of the replacement table is updated.
-	// It doesn't matter if you have disabled the substitution table.
-	TT.new_search();
+    void LearnerThink::calc_loss(size_t thread_id, uint64_t done)
+    {
+        // There is no point in hitting the replacement table, so at this timing the generation of the replacement table is updated.
+        // It doesn't matter if you have disabled the substitution table.
+        TT.new_search();
 
 
 #if defined(EVAL_NNUE)
-	std::cout << "PROGRESS: " << now_string() << ", ";
-	std::cout << sr.total_done << " sfens";
-	std::cout << ", iteration " << epoch;
-	std::cout << ", eta = " << Eval::get_eta() << ", ";
+        std::cout << "PROGRESS: " << now_string() << ", ";
+        std::cout << sr.total_done << " sfens";
+        std::cout << ", iteration " << epoch;
+        std::cout << ", eta = " << Eval::get_eta() << ", ";
 #endif
 
 #if !defined(LOSS_FUNCTION_IS_ELMO_METHOD)
-	double sum_error = 0;
-	double sum_error2 = 0;
-	double sum_error3 = 0;
+        double sum_error = 0;
+        double sum_error2 = 0;
+        double sum_error3 = 0;
 #endif
 
 #if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
-	// For calculation of verification data loss
-	atomic<double> test_sum_cross_entropy_eval,test_sum_cross_entropy_win,test_sum_cross_entropy;
-	atomic<double> test_sum_entropy_eval,test_sum_entropy_win,test_sum_entropy;
-	test_sum_cross_entropy_eval = 0;
-	test_sum_cross_entropy_win = 0;
-	test_sum_cross_entropy = 0;
-	test_sum_entropy_eval = 0;
-	test_sum_entropy_win = 0;
-	test_sum_entropy = 0;
+        // For calculation of verification data loss
+        atomic<double> test_sum_cross_entropy_eval, test_sum_cross_entropy_win, test_sum_cross_entropy;
+        atomic<double> test_sum_entropy_eval, test_sum_entropy_win, test_sum_entropy;
+        test_sum_cross_entropy_eval = 0;
+        test_sum_cross_entropy_win = 0;
+        test_sum_cross_entropy = 0;
+        test_sum_entropy_eval = 0;
+        test_sum_entropy_win = 0;
+        test_sum_entropy = 0;
 
-	// norm for learning
-	atomic<double> sum_norm;
-	sum_norm = 0;
+        // norm for learning
+        atomic<double> sum_norm;
+        sum_norm = 0;
 #endif
 
-	// The number of times the pv first move of deep search matches the pv first move of search(1).
-	atomic<int> move_accord_count;
-	move_accord_count = 0;
+        // The number of times the pv first move of deep search matches the pv first move of search(1).
+        atomic<int> move_accord_count;
+        move_accord_count = 0;
 
-	// Display the value of eval() in the initial stage of Hirate and see the shaking.
-	auto th = Threads[thread_id];
-	auto& pos = th->rootPos;
-	StateInfo si;
-  pos.set(StartFEN, false, &si, th);
-  std::cout << "hirate eval = " << Eval::evaluate(pos);
+        // Display the value of eval() in the initial stage of Hirate and see the shaking.
+        auto th = Threads[thread_id];
+        auto& pos = th->rootPos;
+        StateInfo si;
+        pos.set(StartFEN, false, &si, th);
+        std::cout << "hirate eval = " << Eval::evaluate(pos);
 
-	//Eval::print_eval_stat(pos);
+        //Eval::print_eval_stat(pos);
 
-	// It's better to parallelize here, but it's a bit troublesome because the search before slave has not finished.
-	// I created a mechanism to call task, so I will use it.
+        // It's better to parallelize here, but it's a bit troublesome because the search before slave has not finished.
+        // I created a mechanism to call task, so I will use it.
 
-	// The number of tasks to do.
-	atomic<int> task_count;
-	task_count = (int)sr.sfen_for_mse.size();
-	task_dispatcher.task_reserve(task_count);
+        // The number of tasks to do.
+        atomic<int> task_count;
+        task_count = (int)sr.sfen_for_mse.size();
+        task_dispatcher.task_reserve(task_count);
 
-	// Create a task to search for the situation and give it to each thread.
-	for (const auto& ps : sr.sfen_for_mse)
-	{
-		// Assign work to each thread using TaskDispatcher.
-		// A task definition for that.
-		// It is not possible to capture pos used in ↑, so specify the variables you want to capture one by one.
-		auto task = [&ps,&test_sum_cross_entropy_eval,&test_sum_cross_entropy_win,&test_sum_cross_entropy,&test_sum_entropy_eval,&test_sum_entropy_win,&test_sum_entropy, &sum_norm,&task_count ,&move_accord_count](size_t thread_id)
-		{
-			// Does C++ properly capture a new ps instance for each loop?.
-			auto th = Threads[thread_id];
-			auto& pos = th->rootPos;
-			StateInfo si;
-			if (pos.set_from_packed_sfen(ps.sfen ,&si, th) != 0)
-			{
-				// Unfortunately, as an sfen for rmse calculation, an invalid sfen was drawn.
-				cout << "Error! : illegal packed sfen " << pos.fen() << endl;
-			}
+        // Create a task to search for the situation and give it to each thread.
+        for (const auto& ps : sr.sfen_for_mse)
+        {
+            // Assign work to each thread using TaskDispatcher.
+            // A task definition for that.
+            // It is not possible to capture pos used in ↑, so specify the variables you want to capture one by one.
+            auto task = 
+                [
+                    &ps, 
+                    &test_sum_cross_entropy_eval, 
+                    &test_sum_cross_entropy_win, 
+                    &test_sum_cross_entropy, 
+                    &test_sum_entropy_eval, 
+                    &test_sum_entropy_win, 
+                    &test_sum_entropy, 
+                    &sum_norm, 
+                    &task_count, 
+                    &move_accord_count
+                ](size_t task_thread_id)
+            {
+                // Does C++ properly capture a new ps instance for each loop?.
+                auto task_th = Threads[task_thread_id];
+                auto& task_pos = task_th->rootPos;
+                StateInfo task_si;
+                if (task_pos.set_from_packed_sfen(ps.sfen, &task_si, task_th) != 0)
+                {
+                    // Unfortunately, as an sfen for rmse calculation, an invalid sfen was drawn.
+                    cout << "Error! : illegal packed sfen " << task_pos.fen() << endl;
+                }
 
-			// Evaluation value for shallow search
-			// The value of evaluate() may be used, but when calculating loss, learn_cross_entropy and
-			// Use qsearch() because it is difficult to compare the values.
-			// EvalHash has been disabled in advance. (If not, the same value will be returned every time)
-			auto r = qsearch(pos);
+                // Evaluation value for shallow search
+                // The value of evaluate() may be used, but when calculating loss, learn_cross_entropy and
+                // Use qsearch() because it is difficult to compare the values.
+                // EvalHash has been disabled in advance. (If not, the same value will be returned every time)
+                auto task_search_result = qsearch(task_pos);
 
-			auto shallow_value = r.first;
-			{
-				const auto rootColor = pos.side_to_move();
-				const auto pv = r.second;
-				std::vector<StateInfo,AlignedAllocator<StateInfo>> states(pv.size());
-				for (size_t i = 0; i < pv.size(); ++i)
-				{
-					pos.do_move(pv[i], states[i]);
-					Eval::NNUE::update_eval(pos);
-				}
-				shallow_value = (rootColor == pos.side_to_move()) ? Eval::evaluate(pos) : -Eval::evaluate(pos);
-				for (auto it = pv.rbegin(); it != pv.rend(); ++it)
-					pos.undo_move(*it);
-			}
+                auto shallow_value = task_search_result.first;
+                {
+                    const auto rootColor = task_pos.side_to_move();
+                    const auto pv = task_search_result.second;
+                    std::vector<StateInfo, AlignedAllocator<StateInfo>> states(pv.size());
+                    for (size_t i = 0; i < pv.size(); ++i)
+                    {
+                        task_pos.do_move(pv[i], states[i]);
+                        Eval::NNUE::update_eval(task_pos);
+                    }
+                    shallow_value = (rootColor == task_pos.side_to_move()) ? Eval::evaluate(task_pos) : -Eval::evaluate(task_pos);
+                    for (auto it = pv.rbegin(); it != pv.rend(); ++it)
+                        task_pos.undo_move(*it);
+                }
 
-			// Evaluation value of deep search
-			auto deep_value = (Value)ps.score;
+                // Evaluation value of deep search
+                auto deep_value = (Value)ps.score;
 
-			// Note) This code does not consider when eval_limit is specified in the learn command.
+                // Note) This code does not consider when eval_limit is specified in the learn command.
 
-			// --- error calculation
+                // --- error calculation
 
 #if !defined(LOSS_FUNCTION_IS_ELMO_METHOD)
-			auto grad = calc_grad(deep_value, shallow_value, ps);
+                auto grad = calc_grad(deep_value, shallow_value, ps);
 
-			// something like rmse
-			sum_error += grad*grad;
-			// Add the absolute value of the gradient
-			sum_error2 += abs(grad);
-			// Add the absolute value of the difference between the evaluation values
-			sum_error3 += abs(shallow_value - deep_value);
+                // something like rmse
+                sum_error += grad * grad;
+                // Add the absolute value of the gradient
+                sum_error2 += abs(grad);
+                // Add the absolute value of the difference between the evaluation values
+                sum_error3 += abs(shallow_value - deep_value);
 #endif
 
-			// --- calculation of cross entropy
+                // --- calculation of cross entropy
 
-			// For the time being, regarding the win rate and loss terms only in the elmo method
-			// Calculate and display the cross entropy.
+                // For the time being, regarding the win rate and loss terms only in the elmo method
+                // Calculate and display the cross entropy.
 
 #if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
-			double test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy;
-			double test_entropy_eval, test_entropy_win, test_entropy;
-			calc_cross_entropy(deep_value, shallow_value, ps, test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy, test_entropy_eval, test_entropy_win, test_entropy);
-			// The total cross entropy need not be abs() by definition.
-			test_sum_cross_entropy_eval += test_cross_entropy_eval;
-			test_sum_cross_entropy_win += test_cross_entropy_win;
-			test_sum_cross_entropy += test_cross_entropy;
-			test_sum_entropy_eval += test_entropy_eval;
-			test_sum_entropy_win += test_entropy_win;
-			test_sum_entropy += test_entropy;
-			sum_norm += (double)abs(shallow_value);
+                double test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy;
+                double test_entropy_eval, test_entropy_win, test_entropy;
+                calc_cross_entropy(deep_value, shallow_value, ps, test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy, test_entropy_eval, test_entropy_win, test_entropy);
+                // The total cross entropy need not be abs() by definition.
+                test_sum_cross_entropy_eval += test_cross_entropy_eval;
+                test_sum_cross_entropy_win += test_cross_entropy_win;
+                test_sum_cross_entropy += test_cross_entropy;
+                test_sum_entropy_eval += test_entropy_eval;
+                test_sum_entropy_win += test_entropy_win;
+                test_sum_entropy += test_entropy;
+                sum_norm += (double)abs(shallow_value);
 #endif
 
-			// Determine if the teacher's move and the score of the shallow search match
-			{
-				auto r = search(pos,1);
-				if ((uint16_t)r.second[0] == ps.move)
-					move_accord_count.fetch_add(1, std::memory_order_relaxed);
-			}
+                // Determine if the teacher's move and the score of the shallow search match
+                {
+                    auto r = search(task_pos, 1);
+                    if ((uint16_t)r.second[0] == ps.move)
+                        move_accord_count.fetch_add(1, std::memory_order_relaxed);
+                }
 
-			// Reduced one task because I did it
-			--task_count;
-		};
+                // Reduced one task because I did it
+                --task_count;
+            };
 
-		// Throw the defined task to slave.
-		task_dispatcher.push_task_async(task);
-	}
+            // Throw the defined task to slave.
+            task_dispatcher.push_task_async(task);
+        }
 
-	// join yourself as a slave
-	task_dispatcher.on_idle(thread_id);
+        // join yourself as a slave
+        task_dispatcher.on_idle(thread_id);
 
-	// wait for all tasks to complete
-	while (task_count)
-		sleep(1);
+        // wait for all tasks to complete
+        while (task_count)
+            sleep(1);
 
 #if !defined(LOSS_FUNCTION_IS_ELMO_METHOD)
-	// rmse = root mean square error: mean square error
-	// mae = mean absolute error: mean absolute error
-	auto dsig_rmse = std::sqrt(sum_error / (sfen_for_mse.size() + epsilon));
-	auto dsig_mae = sum_error2 / (sfen_for_mse.size() + epsilon);
-	auto eval_mae = sum_error3 / (sfen_for_mse.size() + epsilon);
-	cout << " , dsig rmse = " << dsig_rmse << " , dsig mae = " << dsig_mae
-		<< " , eval mae = " << eval_mae;
+        // rmse = root mean square error: mean square error
+        // mae = mean absolute error: mean absolute error
+        auto dsig_rmse = std::sqrt(sum_error / (sfen_for_mse.size() + epsilon));
+        auto dsig_mae = sum_error2 / (sfen_for_mse.size() + epsilon);
+        auto eval_mae = sum_error3 / (sfen_for_mse.size() + epsilon);
+        cout << " , dsig rmse = " << dsig_rmse << " , dsig mae = " << dsig_mae
+            << " , eval mae = " << eval_mae;
 #endif
 
 #if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
 #if defined(EVAL_NNUE)
-	latest_loss_sum += test_sum_cross_entropy - test_sum_entropy;
-	latest_loss_count += sr.sfen_for_mse.size();
+        latest_loss_sum += test_sum_cross_entropy - test_sum_entropy;
+        latest_loss_count += sr.sfen_for_mse.size();
 #endif
 
-// learn_cross_entropy may be called train cross entropy in the world of machine learning,
-// When omitting the acronym, it is nice to be able to distinguish it from test cross entropy(tce) by writing it as lce.
+        // learn_cross_entropy may be called train cross entropy in the world of machine learning,
+        // When omitting the acronym, it is nice to be able to distinguish it from test cross entropy(tce) by writing it as lce.
 
-	if (sr.sfen_for_mse.size() && done)
-	{
-		cout
-			<< " , test_cross_entropy_eval = "  << test_sum_cross_entropy_eval / sr.sfen_for_mse.size()
-			<< " , test_cross_entropy_win = "   << test_sum_cross_entropy_win / sr.sfen_for_mse.size()
-			<< " , test_entropy_eval = "        << test_sum_entropy_eval / sr.sfen_for_mse.size()
-			<< " , test_entropy_win = "         << test_sum_entropy_win / sr.sfen_for_mse.size()
-			<< " , test_cross_entropy = "       << test_sum_cross_entropy / sr.sfen_for_mse.size()
-			<< " , test_entropy = "             << test_sum_entropy / sr.sfen_for_mse.size()
-			<< " , norm = "						<< sum_norm
-			<< " , move accuracy = "			<< (move_accord_count * 100.0 / sr.sfen_for_mse.size()) << "%";
-		if (done != static_cast<uint64_t>(-1))
-		{
-			cout
-				<< " , learn_cross_entropy_eval = " << learn_sum_cross_entropy_eval / done
-				<< " , learn_cross_entropy_win = "  << learn_sum_cross_entropy_win / done
-				<< " , learn_entropy_eval = "       << learn_sum_entropy_eval / done
-				<< " , learn_entropy_win = "        << learn_sum_entropy_win / done
-				<< " , learn_cross_entropy = "      << learn_sum_cross_entropy / done
-				<< " , learn_entropy = "            << learn_sum_entropy / done;
-		}
-		cout << endl;
-	}
-	else {
-		cout << "Error! : sr.sfen_for_mse.size() = " << sr.sfen_for_mse.size() << " ,  done = " << done << endl;
-	}
+        if (sr.sfen_for_mse.size() && done)
+        {
+            cout
+                << " , test_cross_entropy_eval = " << test_sum_cross_entropy_eval / sr.sfen_for_mse.size()
+                << " , test_cross_entropy_win = " << test_sum_cross_entropy_win / sr.sfen_for_mse.size()
+                << " , test_entropy_eval = " << test_sum_entropy_eval / sr.sfen_for_mse.size()
+                << " , test_entropy_win = " << test_sum_entropy_win / sr.sfen_for_mse.size()
+                << " , test_cross_entropy = " << test_sum_cross_entropy / sr.sfen_for_mse.size()
+                << " , test_entropy = " << test_sum_entropy / sr.sfen_for_mse.size()
+                << " , norm = " << sum_norm
+                << " , move accuracy = " << (move_accord_count * 100.0 / sr.sfen_for_mse.size()) << "%";
+            if (done != static_cast<uint64_t>(-1))
+            {
+                cout
+                    << " , learn_cross_entropy_eval = " << learn_sum_cross_entropy_eval / done
+                    << " , learn_cross_entropy_win = " << learn_sum_cross_entropy_win / done
+                    << " , learn_entropy_eval = " << learn_sum_entropy_eval / done
+                    << " , learn_entropy_win = " << learn_sum_entropy_win / done
+                    << " , learn_cross_entropy = " << learn_sum_cross_entropy / done
+                    << " , learn_entropy = " << learn_sum_entropy / done;
+            }
+            cout << endl;
+        }
+        else {
+            cout << "Error! : sr.sfen_for_mse.size() = " << sr.sfen_for_mse.size() << " ,  done = " << done << endl;
+        }
 
-	// Clear 0 for next time.
-	learn_sum_cross_entropy_eval = 0.0;
-	learn_sum_cross_entropy_win = 0.0;
-	learn_sum_cross_entropy = 0.0;
-	learn_sum_entropy_eval = 0.0;
-	learn_sum_entropy_win = 0.0;
-	learn_sum_entropy = 0.0;
+        // Clear 0 for next time.
+        learn_sum_cross_entropy_eval = 0.0;
+        learn_sum_cross_entropy_win = 0.0;
+        learn_sum_cross_entropy = 0.0;
+        learn_sum_entropy_eval = 0.0;
+        learn_sum_entropy_win = 0.0;
+        learn_sum_entropy = 0.0;
 #else
-	<< endl;
+        << endl;
 #endif
-}
+    }
 
 
-void LearnerThink::thread_worker(size_t thread_id)
-{
+    void LearnerThink::thread_worker(size_t thread_id)
+    {
 #if defined(_OPENMP)
-	omp_set_num_threads((int)Options["Threads"]);
+        omp_set_num_threads((int)Options["Threads"]);
 #endif
 
-	auto th = Threads[thread_id];
-	auto& pos = th->rootPos;
+        auto th = Threads[thread_id];
+        auto& pos = th->rootPos;
 
-	while (true)
-	{
-	// display mse (this is sometimes done only for thread 0)
-	// Immediately after being read from the file...
+        while (true)
+        {
+            // display mse (this is sometimes done only for thread 0)
+            // Immediately after being read from the file...
 
 #if defined(EVAL_NNUE)
-		// Lock the evaluation function so that it is not used during updating.
-		shared_lock<shared_timed_mutex> read_lock(nn_mutex, defer_lock);
-		if (sr.next_update_weights <= sr.total_done ||
-		    (thread_id != 0 && !read_lock.try_lock()))
+        // Lock the evaluation function so that it is not used during updating.
+            shared_lock<shared_timed_mutex> read_lock(nn_mutex, defer_lock);
+            if (sr.next_update_weights <= sr.total_done ||
+                (thread_id != 0 && !read_lock.try_lock()))
 #else
-		if (sr.next_update_weights <= sr.total_done)
+            if (sr.next_update_weights <= sr.total_done)
 #endif
-		{
-			if (thread_id != 0)
-			{
-				// Wait except thread_id == 0.
+            {
+                if (thread_id != 0)
+                {
+                    // Wait except thread_id == 0.
 
-				if (stop_flag)
-					break;
+                    if (stop_flag)
+                        break;
 
-				// I want to parallelize rmse calculation etc., so if task() is loaded, process it.
-				task_dispatcher.on_idle(thread_id);
-				continue;
-			}
-			else
-			{
-				// Only thread_id == 0 performs the following update process.
+                    // I want to parallelize rmse calculation etc., so if task() is loaded, process it.
+                    task_dispatcher.on_idle(thread_id);
+                    continue;
+                }
+                else
+                {
+                    // Only thread_id == 0 performs the following update process.
 
-				// The weight array is not updated for the first time.
-				if (sr.next_update_weights == 0)
-				{
-					sr.next_update_weights += mini_batch_size;
-					continue;
-				}
+                    // The weight array is not updated for the first time.
+                    if (sr.next_update_weights == 0)
+                    {
+                        sr.next_update_weights += mini_batch_size;
+                        continue;
+                    }
 
 #if !defined(EVAL_NNUE)
-				// Output the current time. Output every time.
-				std::cout << sr.total_done << " sfens , at " << now_string() << std::endl;
+                    // Output the current time. Output every time.
+                    std::cout << sr.total_done << " sfens , at " << now_string() << std::endl;
 
-				// Reflect the gradient in the weight array at this timing. The calculation of the gradient is just right for each 1M phase in terms of mini-batch.
-				Eval::update_weights(epoch , freeze);
+                    // Reflect the gradient in the weight array at this timing. The calculation of the gradient is just right for each 1M phase in terms of mini-batch.
+                    Eval::update_weights(epoch, freeze);
 
-				// Display epoch and current eta for debugging.
-				std::cout << "epoch = " << epoch << " , eta = " << Eval::get_eta() << std::endl;
+                    // Display epoch and current eta for debugging.
+                    std::cout << "epoch = " << epoch << " , eta = " << Eval::get_eta() << std::endl;
 #else
-				{
-					// update parameters
+                    {
+                        // update parameters
 
-					// Lock the evaluation function so that it is not used during updating.
-					lock_guard<shared_timed_mutex> write_lock(nn_mutex);
-					Eval::NNUE::UpdateParameters(epoch);
-				}
+                        // Lock the evaluation function so that it is not used during updating.
+                        lock_guard<shared_timed_mutex> write_lock(nn_mutex);
+                        Eval::NNUE::UpdateParameters(epoch);
+                    }
 #endif
-				++epoch;
+                    ++epoch;
 
-				// Save once every 1 billion phases.
+                    // Save once every 1 billion phases.
 
-				// However, the elapsed time during update_weights() and calc_rmse() is ignored.
-				if (++sr.save_count * mini_batch_size >= eval_save_interval)
-				{
-					sr.save_count = 0;
+                    // However, the elapsed time during update_weights() and calc_rmse() is ignored.
+                    if (++sr.save_count * mini_batch_size >= eval_save_interval)
+                    {
+                        sr.save_count = 0;
 
-					// During this time, as the gradient calculation proceeds, the value becomes too large and I feel annoyed, so stop other threads.
-					const bool converged = save();
-					if (converged)
-					{
-						stop_flag = true;
-						sr.stop_flag = true;
-						break;
-					}
-				}
+                        // During this time, as the gradient calculation proceeds, the value becomes too large and I feel annoyed, so stop other threads.
+                        const bool converged = save();
+                        if (converged)
+                        {
+                            stop_flag = true;
+                            sr.stop_flag = true;
+                            break;
+                        }
+                    }
 
-				// Calculate rmse. This is done for samples of 10,000 phases.
-				// If you do with 40 cores, update_weights every 1 million phases
-				// I don't think it's so good to be tiring.
-				static uint64_t loss_output_count = 0;
-				if (++loss_output_count * mini_batch_size >= loss_output_interval)
-				{
-					loss_output_count = 0;
+                    // Calculate rmse. This is done for samples of 10,000 phases.
+                    // If you do with 40 cores, update_weights every 1 million phases
+                    // I don't think it's so good to be tiring.
+                    static uint64_t loss_output_count = 0;
+                    if (++loss_output_count * mini_batch_size >= loss_output_interval)
+                    {
+                        loss_output_count = 0;
 
-					// Number of cases processed this time
-					uint64_t done = sr.total_done - sr.last_done;
+                        // Number of cases processed this time
+                        uint64_t done = sr.total_done - sr.last_done;
 
-					// loss calculation
-					calc_loss(thread_id , done);
+                        // loss calculation
+                        calc_loss(thread_id, done);
 
 #if defined(EVAL_NNUE)
-					Eval::NNUE::CheckHealth();
+                        Eval::NNUE::CheckHealth();
 #endif
 
-					// Make a note of how far you have totaled.
-					sr.last_done = sr.total_done;
-				}
+                        // Make a note of how far you have totaled.
+                        sr.last_done = sr.total_done;
+                    }
 
-				// Next time, I want you to do this series of processing again when you process only mini_batch_size.
-				sr.next_update_weights += mini_batch_size;
+                    // Next time, I want you to do this series of processing again when you process only mini_batch_size.
+                    sr.next_update_weights += mini_batch_size;
 
-				// Since I was waiting for the update of this sr.next_update_weights except the main thread,
-				// Once this value is updated, it will start moving again.
-			}
-		}
+                    // Since I was waiting for the update of this sr.next_update_weights except the main thread,
+                    // Once this value is updated, it will start moving again.
+                }
+            }
 
-		PackedSfenValue ps;
-	RetryRead:;
-		if (!sr.read_to_thread_buffer(thread_id, ps))
-		{
-			// ran out of thread pool for my thread.
-			// Because there are almost no phases left,
-			// Terminate all other threads.
+            PackedSfenValue ps;
+        RetryRead:;
+            if (!sr.read_to_thread_buffer(thread_id, ps))
+            {
+                // ran out of thread pool for my thread.
+                // Because there are almost no phases left,
+                // Terminate all other threads.
 
-			stop_flag = true;
-			break;
-		}
+                stop_flag = true;
+                break;
+            }
 
-		// The evaluation value exceeds the learning target value.
-		// Ignore this aspect information.
-		if (eval_limit <abs(ps.score))
-			goto RetryRead;
+            // The evaluation value exceeds the learning target value.
+            // Ignore this aspect information.
+            if (eval_limit < abs(ps.score))
+                goto RetryRead;
 
 
-		if (!use_draw_games_in_training && ps.game_result == 0)
-			goto RetryRead;
+            if (!use_draw_games_in_training && ps.game_result == 0)
+                goto RetryRead;
 
 
-		// Skip over the opening phase
-		if (ps.gamePly < prng.rand(reduction_gameply))
-			goto RetryRead;
+            // Skip over the opening phase
+            if (ps.gamePly < prng.rand(reduction_gameply))
+                goto RetryRead;
 
 #if 0
-		auto sfen = pos.sfen_unpack(ps.data);
-		pos.set(sfen);
+            auto sfen = pos.sfen_unpack(ps.data);
+            pos.set(sfen);
 #endif
-		// ↑ Since it is slow when passing through sfen, I made a dedicated function.
-		StateInfo si;
-		const bool mirror = prng.rand(100) < mirror_percentage;
-		if (pos.set_from_packed_sfen(ps.sfen,&si,th,mirror) != 0)
-		{
-			// I got a strange sfen. Should be debugged!
-			// Since it is an illegal sfen, it may not be displayed with pos.sfen(), but it is better than not.
-			cout << "Error! : illigal packed sfen = " << pos.fen() << endl;
-			goto RetryRead;
-		}
+            // ↑ Since it is slow when passing through sfen, I made a dedicated function.
+            StateInfo si;
+            const bool mirror = prng.rand(100) < mirror_percentage;
+            if (pos.set_from_packed_sfen(ps.sfen, &si, th, mirror) != 0)
+            {
+                // I got a strange sfen. Should be debugged!
+                // Since it is an illegal sfen, it may not be displayed with pos.sfen(), but it is better than not.
+                cout << "Error! : illigal packed sfen = " << pos.fen() << endl;
+                goto RetryRead;
+            }
 #if !defined(EVAL_NNUE)
-		{
-			auto key = pos.key();
-			// Exclude the phase used for rmse calculation.
-			if (sr.is_for_rmse(key) && skip_duplicated_positions_in_training)
-				goto RetryRead;
+            {
+                auto key = pos.key();
+                // Exclude the phase used for rmse calculation.
+                if (sr.is_for_rmse(key) && skip_duplicated_positions_in_training)
+                    goto RetryRead;
 
-			// Exclude the most recently used aspect.
-			auto hash_index = size_t(key & (sr.READ_SFEN_HASH_SIZE - 1));
-			auto key2 = sr.hash[hash_index];
-			if (key == key2 && skip_duplicated_positions_in_training)
-				goto RetryRead;
-			sr.hash[hash_index] = key; // Replace with the current key.
-		}
+                // Exclude the most recently used aspect.
+                auto hash_index = size_t(key & (sr.READ_SFEN_HASH_SIZE - 1));
+                auto key2 = sr.hash[hash_index];
+                if (key == key2 && skip_duplicated_positions_in_training)
+                    goto RetryRead;
+                sr.hash[hash_index] = key; // Replace with the current key.
+            }
 #endif
 
-		// There is a possibility that all the pieces are blocked and stuck.
-		// Also, the declaration win phase is excluded from learning because you cannot go to leaf with PV moves.
-		// (shouldn't write out such teacher aspect itself, but may have written it out with an old generation routine)
-	// Skip the position if there are no legal moves (=checkmated or stalemate).
-		if (MoveList<LEGAL>(pos).size() == 0)
-			goto RetryRead;
+            // There is a possibility that all the pieces are blocked and stuck.
+            // Also, the declaration win phase is excluded from learning because you cannot go to leaf with PV moves.
+            // (shouldn't write out such teacher aspect itself, but may have written it out with an old generation routine)
+        // Skip the position if there are no legal moves (=checkmated or stalemate).
+            if (MoveList<LEGAL>(pos).size() == 0)
+                goto RetryRead;
 
-		// I can read it, so try displaying it.
-		//		cout << pos << value << endl;
+            // I can read it, so try displaying it.
+            //      cout << pos << value << endl;
 
-		// Evaluation value of shallow search (qsearch)
-		auto r = qsearch(pos);
-		auto pv = r.second;
+            // Evaluation value of shallow search (qsearch)
+            auto r = qsearch(pos);
+            auto pv = r.second;
 
-		// Evaluation value of deep search
-		auto deep_value = (Value)ps.score;
+            // Evaluation value of deep search
+            auto deep_value = (Value)ps.score;
 
-		// I feel that the mini batch has a better gradient.
-		// Go to the leaf node as it is, add only to the gradient array, and later try AdaGrad at the time of rmse aggregation.
+            // I feel that the mini batch has a better gradient.
+            // Go to the leaf node as it is, add only to the gradient array, and later try AdaGrad at the time of rmse aggregation.
 
-		auto rootColor = pos.side_to_move();
+            auto rootColor = pos.side_to_move();
 
-		// If the initial PV is different, it is better not to use it for learning.
-		// If it is the result of searching a completely different place, it may become noise.
-		// It may be better not to study where the difference in evaluation values ​​is too large.
+            // If the initial PV is different, it is better not to use it for learning.
+            // If it is the result of searching a completely different place, it may become noise.
+            // It may be better not to study where the difference in evaluation values ​​is too large.
 
 #if 0
-		// If you do this, about 13% of the phases will be excluded from the learning target. Good and bad are subtle.
-		if (pv.size() >= 1 && (uint16_t)pv[0] != ps.move)
-		{
-			// dbg_hit_on(false);
-			continue;
-		}
+        // If you do this, about 13% of the phases will be excluded from the learning target. Good and bad are subtle.
+            if (pv.size() >= 1 && (uint16_t)pv[0] != ps.move)
+            {
+                // dbg_hit_on(false);
+                continue;
+            }
 #endif
 
 #if 0
-		// It may be better not to study where the difference in evaluation values ​​is too large.
-		// → It's okay because it passes the win rate function... About 30% of the phases are out of the scope of learning...
-		if (abs((int16_t)r.first - ps.score) >= Eval::PawnValue * 4)
-		{
-//			dbg_hit_on(false);
-			continue;
-		}
-		//		dbg_hit_on(true);
+            // It may be better not to study where the difference in evaluation values ​​is too large.
+            // → It's okay because it passes the win rate function... About 30% of the phases are out of the scope of learning...
+            if (abs((int16_t)r.first - ps.score) >= Eval::PawnValue * 4)
+            {
+                //          dbg_hit_on(false);
+                continue;
+            }
+            //      dbg_hit_on(true);
 #endif
 
-		int ply = 0;
+            int ply = 0;
 
-		// A helper function that adds the gradient to the current phase.
-		auto pos_add_grad = [&]() {
-			// Use the value of evaluate in leaf as shallow_value.
-			// Using the return value of qsearch() as shallow_value,
-			// If PV is interrupted in the middle, the phase where evaluate() is called to calculate the gradient, and
-			// I don't think this is a very desirable property, as the aspect that gives that gradient will be different.
-			// I have turned off the substitution table, but since the pv array has not been updated due to one stumbling block etc...
+            // A helper function that adds the gradient to the current phase.
+            auto pos_add_grad = [&]() {
+                // Use the value of evaluate in leaf as shallow_value.
+                // Using the return value of qsearch() as shallow_value,
+                // If PV is interrupted in the middle, the phase where evaluate() is called to calculate the gradient, and
+                // I don't think this is a very desirable property, as the aspect that gives that gradient will be different.
+                // I have turned off the substitution table, but since the pv array has not been updated due to one stumbling block etc...
 
-			Value shallow_value = (rootColor == pos.side_to_move()) ? Eval::evaluate(pos) : -Eval::evaluate(pos);
+                Value shallow_value = (rootColor == pos.side_to_move()) ? Eval::evaluate(pos) : -Eval::evaluate(pos);
 
 #if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
-			// Calculate loss for training data
-			double learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy;
-			double learn_entropy_eval, learn_entropy_win, learn_entropy;
-			calc_cross_entropy(deep_value, shallow_value, ps, learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy, learn_entropy_eval, learn_entropy_win, learn_entropy);
-			learn_sum_cross_entropy_eval += learn_cross_entropy_eval;
-			learn_sum_cross_entropy_win += learn_cross_entropy_win;
-			learn_sum_cross_entropy += learn_cross_entropy;
-			learn_sum_entropy_eval += learn_entropy_eval;
-			learn_sum_entropy_win += learn_entropy_win;
-			learn_sum_entropy += learn_entropy;
+                // Calculate loss for training data
+                double learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy;
+                double learn_entropy_eval, learn_entropy_win, learn_entropy;
+                calc_cross_entropy(deep_value, shallow_value, ps, learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy, learn_entropy_eval, learn_entropy_win, learn_entropy);
+                learn_sum_cross_entropy_eval += learn_cross_entropy_eval;
+                learn_sum_cross_entropy_win += learn_cross_entropy_win;
+                learn_sum_cross_entropy += learn_cross_entropy;
+                learn_sum_entropy_eval += learn_entropy_eval;
+                learn_sum_entropy_win += learn_entropy_win;
+                learn_sum_entropy += learn_entropy;
 #endif
 
 #if !defined(EVAL_NNUE)
-			// Slope
-			double dj_dw = calc_grad(deep_value, shallow_value, ps);
+                // Slope
+                double dj_dw = calc_grad(deep_value, shallow_value, ps);
 
-			// Add jd_dw as the gradient (∂J/∂Wj) for the feature vector currently appearing in the leaf node.
+                // Add jd_dw as the gradient (∂J/∂Wj) for the feature vector currently appearing in the leaf node.
 
-			// If it is not PV termination, apply a discount rate.
-			if (discount_rate != 0 && ply != (int)pv.size())
-				dj_dw *= discount_rate;
+                // If it is not PV termination, apply a discount rate.
+                if (discount_rate != 0 && ply != (int)pv.size())
+                    dj_dw *= discount_rate;
 
-			// Since we have reached leaf, add the gradient to the features that appear in this phase.
-			// Update based on gradient later.
-			Eval::add_grad(pos, rootColor, dj_dw, freeze);
+                // Since we have reached leaf, add the gradient to the features that appear in this phase.
+                // Update based on gradient later.
+                Eval::add_grad(pos, rootColor, dj_dw, freeze);
 #else
-			const double example_weight =
-			    (discount_rate != 0 && ply != (int)pv.size()) ? discount_rate : 1.0;
-			Eval::NNUE::AddExample(pos, rootColor, ps, example_weight);
+                const double example_weight =
+                    (discount_rate != 0 && ply != (int)pv.size()) ? discount_rate : 1.0;
+                Eval::NNUE::AddExample(pos, rootColor, ps, example_weight);
 #endif
 
-			// Since the processing is completed, the counter of the processed number is incremented
-			sr.total_done++;
-		};
+                // Since the processing is completed, the counter of the processed number is incremented
+                sr.total_done++;
+            };
 
-		StateInfo state[MAX_PLY]; // PV of qsearch cannot be so long.
-		bool illegal_move = false;
-		for (auto m : pv)
-		{
-			// I shouldn't be an illegal player.
-			// An illegal move sometimes comes here...
-			if (!pos.pseudo_legal(m) || !pos.legal(m))
-			{
-				//cout << pos << m << endl;
-				//assert(false);
-				illegal_move = true;
-				break;
-			}
+            StateInfo state[MAX_PLY]; // PV of qsearch cannot be so long.
+            bool illegal_move = false;
+            for (auto m : pv)
+            {
+                // I shouldn't be an illegal player.
+                // An illegal move sometimes comes here...
+                if (!pos.pseudo_legal(m) || !pos.legal(m))
+                {
+                    //cout << pos << m << endl;
+                    //assert(false);
+                    illegal_move = true;
+                    break;
+                }
 
-			// Processing when adding the gradient to the node on each PV.
-			//If discount_rate is 0, this process is not performed.
-			if (discount_rate != 0)
-				pos_add_grad();
+                // Processing when adding the gradient to the node on each PV.
+                //If discount_rate is 0, this process is not performed.
+                if (discount_rate != 0)
+                    pos_add_grad();
 
-			pos.do_move(m, state[ply++]);
+                pos.do_move(m, state[ply++]);
 
-			// Since the value of evaluate in leaf is used, the difference is updated.
-			Eval::NNUE::update_eval(pos);
-		}
+                // Since the value of evaluate in leaf is used, the difference is updated.
+                Eval::NNUE::update_eval(pos);
+            }
 
-		if (illegal_move) {
-			sync_cout << "An illical move was detected... Excluded the position from the learning data..." << sync_endl;
-			continue;
-		}
+            if (illegal_move) {
+                sync_cout << "An illical move was detected... Excluded the position from the learning data..." << sync_endl;
+                continue;
+            }
 
-		// Since we have reached the end phase of PV, add the slope here.
-		pos_add_grad();
+            // Since we have reached the end phase of PV, add the slope here.
+            pos_add_grad();
 
-		// rewind the phase
-		for (auto it = pv.rbegin(); it != pv.rend(); ++it)
-			pos.undo_move(*it);
+            // rewind the phase
+            for (auto it = pv.rbegin(); it != pv.rend(); ++it)
+                pos.undo_move(*it);
 
 #if 0
-		// When adding the gradient to the root phase
-		shallow_value = (rootColor == pos.side_to_move()) ? Eval::evaluate(pos) : -Eval::evaluate(pos);
-		dj_dw = calc_grad(deep_value, shallow_value, ps);
-		Eval::add_grad(pos, rootColor, dj_dw , without_kpp);
+            // When adding the gradient to the root phase
+            shallow_value = (rootColor == pos.side_to_move()) ? Eval::evaluate(pos) : -Eval::evaluate(pos);
+            dj_dw = calc_grad(deep_value, shallow_value, ps);
+            Eval::add_grad(pos, rootColor, dj_dw, without_kpp);
 #endif
 
-	}
+        }
 
-}
+    }
 
-// Write evaluation function file.
-bool LearnerThink::save(bool is_final)
-{
-	// Each time you save, change the extension part of the file name like "0","1","2",..
-	// (Because I want to compare the winning rate for each evaluation function parameter later)
+    // Write evaluation function file.
+    bool LearnerThink::save(bool is_final)
+    {
+        // Each time you save, change the extension part of the file name like "0","1","2",..
+        // (Because I want to compare the winning rate for each evaluation function parameter later)
 
-	if (save_only_once)
-	{
-		// When EVAL_SAVE_ONLY_ONCE is defined,
-		// Do not dig a subfolder because I want to save it only once.
-		Eval::save_eval("");
-	}
-	else if (is_final) {
-		Eval::save_eval("final");
-		return true;
-	}
-	else {
-		static int dir_number = 0;
-		const std::string dir_name = std::to_string(dir_number++);
-		Eval::save_eval(dir_name);
+        if (save_only_once)
+        {
+            // When EVAL_SAVE_ONLY_ONCE is defined,
+            // Do not dig a subfolder because I want to save it only once.
+            Eval::save_eval("");
+        }
+        else if (is_final) {
+            Eval::save_eval("final");
+            return true;
+        }
+        else {
+            static int dir_number = 0;
+            const std::string dir_name = std::to_string(dir_number++);
+            Eval::save_eval(dir_name);
 #if defined(EVAL_NNUE)
-		if (newbob_decay != 1.0 && latest_loss_count > 0) {
-			static int trials = newbob_num_trials;
-			const double latest_loss = latest_loss_sum / latest_loss_count;
-			latest_loss_sum = 0.0;
-			latest_loss_count = 0;
-			cout << "loss: " << latest_loss;
-			if (latest_loss < best_loss) {
-				cout << " < best (" << best_loss << "), accepted" << endl;
-				best_loss = latest_loss;
-				best_nn_directory = Path::Combine((std::string)Options["EvalSaveDir"], dir_name);
-				trials = newbob_num_trials;
-			} else {
-				cout << " >= best (" << best_loss << "), rejected" << endl;
-				if (best_nn_directory.empty()) {
-					cout << "WARNING: no improvement from initial model" << endl;
-				} else {
-					cout << "restoring parameters from " << best_nn_directory << endl;
-					Eval::NNUE::RestoreParameters(best_nn_directory);
-				}
-				if (--trials > 0 && !is_final) {
-					cout << "reducing learning rate scale from " << newbob_scale
-					     << " to " << (newbob_scale * newbob_decay)
-					     << " (" << trials << " more trials)" << endl;
-					newbob_scale *= newbob_decay;
-					Eval::NNUE::SetGlobalLearningRateScale(newbob_scale);
-				}
-			}
-			if (trials == 0) {
-				cout << "converged" << endl;
-				return true;
-			}
-		}
+            if (newbob_decay != 1.0 && latest_loss_count > 0) {
+                static int trials = newbob_num_trials;
+                const double latest_loss = latest_loss_sum / latest_loss_count;
+                latest_loss_sum = 0.0;
+                latest_loss_count = 0;
+                cout << "loss: " << latest_loss;
+                if (latest_loss < best_loss) {
+                    cout << " < best (" << best_loss << "), accepted" << endl;
+                    best_loss = latest_loss;
+                    best_nn_directory = Path::Combine((std::string)Options["EvalSaveDir"], dir_name);
+                    trials = newbob_num_trials;
+                }
+                else {
+                    cout << " >= best (" << best_loss << "), rejected" << endl;
+                    if (best_nn_directory.empty()) {
+                        cout << "WARNING: no improvement from initial model" << endl;
+                    }
+                    else {
+                        cout << "restoring parameters from " << best_nn_directory << endl;
+                        Eval::NNUE::RestoreParameters(best_nn_directory);
+                    }
+                    if (--trials > 0 && !is_final) {
+                        cout << "reducing learning rate scale from " << newbob_scale
+                            << " to " << (newbob_scale * newbob_decay)
+                            << " (" << trials << " more trials)" << endl;
+                        newbob_scale *= newbob_decay;
+                        Eval::NNUE::SetGlobalLearningRateScale(newbob_scale);
+                    }
+                }
+                if (trials == 0) {
+                    cout << "converged" << endl;
+                    return true;
+                }
+            }
 #endif
-	}
-	return false;
-}
-
-// Shuffle_files(), shuffle_files_quick() subcontracting, writing part.
-// output_file_name: Name of the file to write
-// prng: random number
-// afs: fstream of each teacher phase file
-// a_count: The number of teacher positions inherent in each file.
-void shuffle_write(const string& output_file_name , PRNG& prng , vector<fstream>& afs , vector<uint64_t>& a_count)
-{
-	uint64_t total_sfen_count = 0;
-	for (auto c : a_count)
-		total_sfen_count += c;
-
-	// number of exported phases
-	uint64_t write_sfen_count = 0;
-
-	// Output the progress on the screen for each phase.
-	const uint64_t buffer_size = 10000000;
-
-	auto print_status = [&]()
-	{
-		// Output progress every 10M phase or when all writing is completed
-		if (((write_sfen_count % buffer_size) == 0) ||
-			(write_sfen_count == total_sfen_count))
-			cout << write_sfen_count << " / " << total_sfen_count << endl;
-	};
-
-
-	cout << endl <<  "write : " << output_file_name << endl;
-
-	fstream fs(output_file_name, ios::out | ios::binary);
-
-	// total teacher positions
-	uint64_t sum = 0;
-	for (auto c : a_count)
-		sum += c;
-
-	while (sum != 0)
-	{
-		auto r = prng.rand(sum);
-
-		// Aspects stored in fs[0] file ... Aspects stored in fs[1] file ...
-		//Think of it as a series like, and determine in which file r is pointing.
-		// The contents of the file are shuffled, so you can take the next element from that file.
-		// Each file has a_count[x] phases, so this process can be written as follows.
-
-		uint64_t n = 0;
-		while (a_count[n] <= r)
-			r -= a_count[n++];
-
-		// This confirms n. Before you forget it, reduce the remaining number.
-
-		--a_count[n];
-		--sum;
-
-		PackedSfenValue psv;
-		// It's better to read and write all at once until the performance is not so good...
-		if (afs[n].read((char*)&psv, sizeof(PackedSfenValue)))
-		{
-			fs.write((char*)&psv, sizeof(PackedSfenValue));
-			++write_sfen_count;
-			print_status();
-		}
-	}
-	print_status();
-	fs.close();
-	cout << "done!" << endl;
-}
-
-// Subcontracting the teacher shuffle "learn shuffle" command.
-// output_file_name: name of the output file where the shuffled teacher positions will be written
-void shuffle_files(const vector<string>& filenames , const string& output_file_name , uint64_t buffer_size )
-{
-	// The destination folder is
-	// tmp/ for temporary writing
-
-	// Temporary file is written to tmp/ folder for each buffer_size phase.
-	// For example, if buffer_size = 20M, you need a buffer of 20M*40bytes = 800MB.
-	// In a PC with a small memory, it would be better to reduce this.
-	// However, if the number of files increases too much, it will not be possible to open at the same time due to OS restrictions.
-	// There should have been a limit of 512 per process on Windows, so you can open here as 500,
-	// The current setting is 500 files x 20M = 10G = 10 billion phases.
-
-	PSVector buf;
-	buf.resize(buffer_size);
-	// ↑ buffer, a marker that indicates how much you have used
-	uint64_t buf_write_marker = 0;
-
-	// File name to write (incremental counter because it is a serial number)
-	uint64_t write_file_count = 0;
-
-	// random number to shuffle
-	// Do not use std::random_device().  Because it always the same integers on MinGW.
-	PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
-
-	// generate the name of the temporary file
-	auto make_filename = [](uint64_t i)
-	{
-		return "tmp/" + to_string(i) + ".bin";
-	};
-
-	// Exported files in tmp/ folder, number of teacher positions stored in each
-	vector<uint64_t> a_count;
-
-	auto write_buffer = [&](uint64_t size)
-	{
-		// shuffle from buf[0] to buf[size-1]
-		for (uint64_t i = 0; i < size; ++i)
-			swap(buf[i], buf[(uint64_t)(prng.rand(size - i) + i)]);
-
-		// write to a file
-		fstream fs;
-		fs.open(make_filename(write_file_count++), ios::out | ios::binary);
-		fs.write((char*)&buf[0], size * sizeof(PackedSfenValue));
-		fs.close();
-		a_count.push_back(size);
-
-		buf_write_marker = 0;
-		cout << ".";
-	};
-
-	Dependency::mkdir("tmp");
-
-	// Shuffle and export as a 10M phase shredded file.
-	for (auto filename : filenames)
-	{
-		fstream fs(filename, ios::in | ios::binary);
-		cout << endl << "open file = " << filename;
-		while (fs.read((char*)&buf[buf_write_marker], sizeof(PackedSfenValue)))
-			if (++buf_write_marker == buffer_size)
-				write_buffer(buffer_size);
-
-		// Read in units of sizeof(PackedSfenValue),
-		// Ignore the last remaining fraction. (Fails in fs.read, so exit while)
-		// (The remaining fraction seems to be half-finished data that was created because it was stopped halfway during teacher generation.)
-
-	}
-
-	if (buf_write_marker != 0)
-		write_buffer(buf_write_marker);
-
-	// Only shuffled files have been written write_file_count.
-	// As a second pass, if you open all of them at the same time, select one at random and load one phase at a time
-	// Now you have shuffled.
-
-	// Original file for shirt full + tmp file + file to write requires 3 times the storage capacity of the original file.
-	// 1 billion SSD is not enough for shuffling because it is 400GB for 10 billion phases.
-	// If you want to delete (or delete by hand) the original file at this point after writing to tmp,
-	// The storage capacity is about twice that of the original file.
-	// So, maybe we should have an option to delete the original file.
-
-	// Files are opened at the same time. It is highly possible that this will exceed FOPEN_MAX.
-	// In that case, rather than adjusting buffer_size to reduce the number of files.
-
-	vector<fstream> afs;
-	for (uint64_t i = 0; i < write_file_count; ++i)
-		afs.emplace_back(fstream(make_filename(i),ios::in | ios::binary));
-
-	// Throw to the subcontract function and end.
-	shuffle_write(output_file_name, prng, afs, a_count);
-}
-
-// Subcontracting the teacher shuffle "learn shuffleq" command.
-// This is written in 1 pass.
-// output_file_name: name of the output file where the shuffled teacher positions will be written
-void shuffle_files_quick(const vector<string>& filenames, const string& output_file_name)
-{
-	// number of phases read
-	uint64_t read_sfen_count = 0;
-
-	// random number to shuffle
-	// Do not use std::random_device().  Because it always the same integers on MinGW.
-	PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
-
-	// number of files
-	size_t file_count = filenames.size();
-
-	// Number of teacher positions stored in each file in filenames
-	vector<uint64_t> a_count(file_count);
-
-	// Count the number of teacher aspects in each file.
-	vector<fstream> afs(file_count);
-
-	for (size_t i = 0; i <file_count ;++i)
-	{
-		auto filename = filenames[i];
-		auto& fs = afs[i];
-
-		fs.open(filename, ios::in | ios::binary);
-		fs.seekg(0, fstream::end);
-		uint64_t eofPos = (uint64_t)fs.tellg();
-		fs.clear(); // Otherwise, the next seek may fail.
-		fs.seekg(0, fstream::beg);
-		uint64_t begPos = (uint64_t)fs.tellg();
-		uint64_t file_size = eofPos - begPos;
-		uint64_t sfen_count = file_size / sizeof(PackedSfenValue);
-		a_count[i] = sfen_count;
-
-		// Output the number of sfen stored in each file.
-		cout << filename << " = " << sfen_count << " sfens." << endl;
-	}
-
-	// Since we know the file size of each file,
-	// open them all at once (already open),
-	// Select one at a time and load one phase at a time
-	// Now you have shuffled.
-
-	// Throw to the subcontract function and end.
-	shuffle_write(output_file_name, prng, afs, a_count);
-}
-
-// Subcontracting the teacher shuffle "learn shufflem" command.
-// Read the whole memory and write it out with the specified file name.
-void shuffle_files_on_memory(const vector<string>& filenames,const string output_file_name)
-{
-	PSVector buf;
-
-	for (auto filename : filenames)
-	{
-		std::cout << "read : " << filename << std::endl;
-		read_file_to_memory(filename, [&buf](uint64_t size) {
-			assert((size % sizeof(PackedSfenValue)) == 0);
-			// Expand the buffer and read after the last end.
-			uint64_t last = buf.size();
-			buf.resize(last + size / sizeof(PackedSfenValue));
-			return (void*)&buf[last];
-		});
-	}
-
-	// shuffle from buf[0] to buf[size-1]
-	// Do not use std::random_device().  Because it always the same integers on MinGW.
-	PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
-	uint64_t size = (uint64_t)buf.size();
-	std::cout << "shuffle buf.size() = " << size << std::endl;
-	for (uint64_t i = 0; i < size; ++i)
-		swap(buf[i], buf[(uint64_t)(prng.rand(size - i) + i)]);
-
-	std::cout << "write : " << output_file_name << endl;
-
-	// If the file to be written exceeds 2GB, it cannot be written in one shot with fstream::write, so use wrapper.
-	write_memory_to_file(output_file_name, (void*)&buf[0], (uint64_t)sizeof(PackedSfenValue)*(uint64_t)buf.size());
-
-	std::cout << "..shuffle_on_memory done." << std::endl;
-}
-
-bool fen_is_ok(Position& pos, std::string input_fen) {
-	std::string pos_fen = pos.fen();
-	std::istringstream ss_input(input_fen);
-	std::istringstream ss_pos(pos_fen);
-
-	// example : "2r4r/4kpp1/nb1np3/p2p3p/B2P1BP1/PP6/4NPKP/2R1R3 w - h6 0 24"
-	//       --> "2r4r/4kpp1/nb1np3/p2p3p/B2P1BP1/PP6/4NPKP/2R1R3"
-	std::string str_input, str_pos;
-	ss_input >> str_input;
-	ss_pos >> str_pos;
-
-	// Only compare "Piece placement field" between input_fen and pos.fen().
-	return str_input == str_pos;
-}
-
-void convert_bin(const vector<string>& filenames, const string& output_file_name, const int ply_minimum, const int ply_maximum, const int interpolate_eval, const bool check_invalid_fen, const bool check_illegal_move)
-{
-	std::cout << "check_invalid_fen=" << check_invalid_fen << std::endl;
-	std::cout << "check_illegal_move=" << check_illegal_move << std::endl;
-
-	std::fstream fs;
-	uint64_t data_size=0;
-	uint64_t filtered_size = 0;
-	uint64_t filtered_size_fen = 0;
-	uint64_t filtered_size_move = 0;
-	uint64_t filtered_size_ply = 0;
-	auto th = Threads.main();
-	auto &tpos = th->rootPos;
-	// convert plain rag to packed sfenvalue for Yaneura king
-	fs.open(output_file_name, ios::app | ios::binary);
-	StateListPtr states;
-	for (auto filename : filenames) {
-		std::cout << "convert " << filename << " ... ";
-		std::string line;
-		ifstream ifs;
-		ifs.open(filename);
-		PackedSfenValue p;
-		data_size = 0;
-		filtered_size = 0;
-		filtered_size_fen = 0;
-		filtered_size_move = 0;
-		filtered_size_ply = 0;
-		p.gamePly = 1; // Not included in apery format. Should be initialized
-		bool ignore_flag_fen = false;
-		bool ignore_flag_move = false;
-		bool ignore_flag_ply = false;
-		while (std::getline(ifs, line)) {
-			std::stringstream ss(line);
-			std::string token;
-			std::string value;
-			ss >> token;
-			if (token == "fen") {
-				states = StateListPtr(new std::deque<StateInfo>(1)); // Drop old and create a new one
-				std::string input_fen = line.substr(4);
-				tpos.set(input_fen, false, &states->back(), Threads.main());
-				if (check_invalid_fen && !fen_is_ok(tpos, input_fen)) {
-					ignore_flag_fen = true;
-					filtered_size_fen++;
-				}
-				else {
-					tpos.sfen_pack(p.sfen);
-				}
-			}
-			else if (token == "move") {
-				ss >> value;
-				Move move = UCI::to_move(tpos, value);
-				if (check_illegal_move && move == MOVE_NONE) {
-					ignore_flag_move = true;
-					filtered_size_move++;
-				}
-				else {
-					p.move = move;
-				}
-			}
-			else if (token == "score") {
-				double score;
-				ss >> score;
-				// Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
-				// Normalize to [0.0, 1.0].
-				score = (score - src_score_min_value) / (src_score_max_value - src_score_min_value);
-				// Scale to [dest_score_min_value, dest_score_max_value].
-				score = score * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
-				p.score = Math::clamp((int32_t)std::round(score) , -(int32_t)VALUE_MATE , (int32_t)VALUE_MATE);
-			}
-			else if (token == "ply") {
-				int temp;
-				ss >> temp;
-				if(temp < ply_minimum || temp > ply_maximum){
-					ignore_flag_ply = true;
-					filtered_size_ply++;
-				}
-				p.gamePly = uint16_t(temp); // No cast here?
-				if (interpolate_eval != 0){
-					p.score = min(3000, interpolate_eval * temp);
-				}
-			}
-			else if (token == "result") {
-				int temp;
-				ss >> temp;
-				p.game_result = int8_t(temp); // Do you need a cast here?
-				if (interpolate_eval){
-					p.score = p.score * p.game_result;
-				}
-			}
-			else if (token == "e") {
-				if (!(ignore_flag_fen || ignore_flag_move || ignore_flag_ply)) {
-					fs.write((char*)&p, sizeof(PackedSfenValue));
-					data_size+=1;
-					// debug
-					// std::cout<<tpos<<std::endl;
-					// std::cout<<p.score<<","<<int(p.gamePly)<<","<<int(p.game_result)<<std::endl;
-				}
-				else {
-					filtered_size++;
-				}
-				ignore_flag_fen = false;
-				ignore_flag_move = false;
-				ignore_flag_ply = false;
-			}
-		}
-		std::cout << "done " << data_size << " parsed " << filtered_size << " is filtered"
-				  << " (invalid fen:" << filtered_size_fen << ", illegal move:" << filtered_size_move << ", invalid ply:" << filtered_size_ply << ")" << std::endl;
-		ifs.close();
-	}
-	std::cout << "all done" << std::endl;
-	fs.close();
-}
-
-static inline void ltrim(std::string &s) {
-	s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](int ch) {
-		return !std::isspace(ch);
-	}));
-}
-
-static inline void rtrim(std::string &s) {
-	s.erase(std::find_if(s.rbegin(), s.rend(), [](int ch) {
-		return !std::isspace(ch);
-	}).base(), s.end());
-}
-
-static inline void trim(std::string &s) {
-	ltrim(s);
-	rtrim(s);
-}
-
-int parse_game_result_from_pgn_extract(std::string result) {
-	// White Win
-	if (result == "\"1-0\"") {
-		return 1;
-	}
-	// Black Win
-	else if (result == "\"0-1\"") {
-		return -1;
-	}
-	// Draw
-	else {
-		return 0;
-	}
-}
-
-// 0.25 -->  0.25 * PawnValueEg
-// #-4  --> -mate_in(4)
-// #3   -->  mate_in(3)
-// -M4  --> -mate_in(4)
-// +M3  -->  mate_in(3)
-Value parse_score_from_pgn_extract(std::string eval, bool& success) {
-	success = true;
-
-	if (eval.substr(0, 1) == "#") {
-		if (eval.substr(1, 1) == "-") {
-			return -mate_in(stoi(eval.substr(2, eval.length() - 2)));
-		}
-		else {
-			return mate_in(stoi(eval.substr(1, eval.length() - 1)));
-		}
-	}
-	else if (eval.substr(0, 2) == "-M") {
-		//std::cout << "eval=" << eval << std::endl;
-		return -mate_in(stoi(eval.substr(2, eval.length() - 2)));
-	}
-	else if (eval.substr(0, 2) == "+M") {
-		//std::cout << "eval=" << eval << std::endl;
-		return mate_in(stoi(eval.substr(2, eval.length() - 2)));
-	}
-	else {
-		char *endptr;
-		double value = strtod(eval.c_str(), &endptr);
-
-		if (*endptr != '\0') {
-			success = false;
-			return VALUE_ZERO;
-		}
-		else {
-			return Value(value * static_cast<double>(PawnValueEg));
-		}
-	}
-}
-
-// for Debug
-//#define DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT
-
-bool is_like_fen(std::string fen) {
-	int count_space = std::count(fen.cbegin(), fen.cend(), ' ');
-	int count_slash = std::count(fen.cbegin(), fen.cend(), '/');
-
-#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
-	//std::cout << "count_space=" << count_space << std::endl;
-	//std::cout << "count_slash=" << count_slash << std::endl;
-#endif
-
-	return count_space == 5 && count_slash == 7;
-}
-
-void convert_bin_from_pgn_extract(const vector<string>& filenames, const string& output_file_name, const bool pgn_eval_side_to_move, const bool convert_no_eval_fens_as_score_zero)
-{
-	std::cout << "pgn_eval_side_to_move=" << pgn_eval_side_to_move << std::endl;
-	std::cout << "convert_no_eval_fens_as_score_zero=" << convert_no_eval_fens_as_score_zero << std::endl;
-
-	auto th = Threads.main();
-	auto &pos = th->rootPos;
-
-	std::fstream ofs;
-	ofs.open(output_file_name, ios::out | ios::binary);
-
-	int game_count = 0;
-	int fen_count = 0;
-
-	for (auto filename : filenames) {
-		std::cout << now_string() << " convert " << filename << std::endl;
-		ifstream ifs;
-		ifs.open(filename);
-
-		int game_result = 0;
-
-		std::string line;
-		while (std::getline(ifs, line)) {
-
-			if (line.empty()) {
-				continue;
-			}
-
-			else if (line.substr(0, 1) == "[") {
-				std::regex pattern_result(R"(\[Result (.+?)\])");
-				std::smatch match;
-
-				// example: [Result "1-0"]
-				if (std::regex_search(line, match, pattern_result)) {
-					game_result = parse_game_result_from_pgn_extract(match.str(1));
-#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
-					std::cout << "game_result=" << game_result << std::endl;
-#endif
-					game_count++;
-					if (game_count % 10000 == 0) {
-						std::cout << now_string() << " game_count=" << game_count << ", fen_count=" << fen_count << std::endl;
-					}
-				}
-
-				continue;
-			}
-
-			else {
-				int gamePly = 1;
-				auto itr = line.cbegin();
-
-				while (true) {
-					gamePly++;
-
-					PackedSfenValue psv;
-					memset((char*)&psv, 0, sizeof(PackedSfenValue));
-
-					// fen
-					{
-						bool fen_found = false;
-
-						while (!fen_found) {
-							std::regex pattern_bracket(R"(\{(.+?)\})");
-							std::smatch match;
-							if (!std::regex_search(itr, line.cend(), match, pattern_bracket)) {
-								break;
-							}
-
-							itr += match.position(0) + match.length(0) - 1;
-							std::string str_fen = match.str(1);
-							trim(str_fen);
-
-							if (is_like_fen(str_fen)) {
-								fen_found = true;
-
-								StateInfo si;
-								pos.set(str_fen, false, &si, th);
-								pos.sfen_pack(psv.sfen);
-							}
-
-#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
-							std::cout << "str_fen=" << str_fen << std::endl;
-							std::cout << "fen_found=" << fen_found << std::endl;
-#endif
-						}
-
-						if (!fen_found) {
-							break;
-						}
-					}
-
-					// move
-					{
-						std::regex pattern_move(R"(\}(.+?)\{)");
-						std::smatch match;
-						if (!std::regex_search(itr, line.cend(), match, pattern_move)) {
-							break;
-						}
-
-						itr += match.position(0) + match.length(0) - 1;
-						std::string str_move = match.str(1);
-						trim(str_move);
-#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
-						std::cout << "str_move=" << str_move << std::endl;
-#endif
-						psv.move = UCI::to_move(pos, str_move);
-					}
-
-					// eval
-					bool eval_found = false;
-					{
-						std::regex pattern_bracket(R"(\{(.+?)\})");
-						std::smatch match;
-						if (!std::regex_search(itr, line.cend(), match, pattern_bracket)) {
-							break;
-						}
-
-						std::string str_eval_clk = match.str(1);
-						trim(str_eval_clk);
-#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
-						std::cout << "str_eval_clk=" << str_eval_clk << std::endl;
-#endif
-
-						// example: { [%eval 0.25] [%clk 0:10:00] }
-						// example: { [%eval #-4] [%clk 0:10:00] }
-						// example: { [%eval #3] [%clk 0:10:00] }
-						// example: { +0.71/22 1.2s }
-						// example: { -M4/7 0.003s }
-						// example: { M3/245 0.017s }
-						// example: { +M1/245 0.010s, White mates }
-						// example: { 0.60 }
-						// example: { book }
-						// example: { rnbqkb1r/pp3ppp/2p1pn2/3p4/2PP4/2N2N2/PP2PPPP/R1BQKB1R w KQkq - 0 5 }
-
-						// Considering the absence of eval
-						if (!is_like_fen(str_eval_clk)) {
-							itr += match.position(0) + match.length(0) - 1;
-
-							if (str_eval_clk != "book") {
-								std::regex pattern_eval1(R"(\[\%eval (.+?)\])");
-								std::regex pattern_eval2(R"((.+?)\/)");
-
-								std::string str_eval;
-								if (std::regex_search(str_eval_clk, match, pattern_eval1) ||
-									std::regex_search(str_eval_clk, match, pattern_eval2)) {
-									str_eval = match.str(1);
-									trim(str_eval);
-								}
-								else {
-									str_eval = str_eval_clk;
-								}
-
-								bool success = false;
-								Value value = parse_score_from_pgn_extract(str_eval, success);
-								if (success) {
-									eval_found = true;
-									psv.score = Math::clamp(value, -VALUE_MATE , VALUE_MATE);
-								}
-
-#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
-								std::cout << "str_eval=" << str_eval << std::endl;
-								std::cout << "success=" << success << ", psv.score=" << psv.score << std::endl;
-#endif
-							}
-						}
-					}
-
-					// write
-					if (eval_found || convert_no_eval_fens_as_score_zero) {
-						if (!eval_found && convert_no_eval_fens_as_score_zero) {
-							psv.score = 0;
-						}
-
-						psv.gamePly = gamePly;
-						psv.game_result = game_result;
-
-						if (pos.side_to_move() == BLACK) {
-							if (!pgn_eval_side_to_move) {
-								psv.score *= -1;
-							}
-							psv.game_result *= -1;
-						}
-
-						ofs.write((char*)&psv, sizeof(PackedSfenValue));
-
-						fen_count++;
-					}
-				}
-
-				game_result = 0;
-			}
-		}
-	}
-
-	std::cout << now_string() << " game_count=" << game_count << ", fen_count=" << fen_count << std::endl;
-	std::cout << now_string() << " all done" << std::endl;
-	ofs.close();
-}
-
-void convert_plain(const vector<string>& filenames, const string& output_file_name)
-{
-	Position tpos;
-	std::ofstream ofs;
-	ofs.open(output_file_name, ios::app);
-	auto th = Threads.main();
-	for (auto filename : filenames) {
-		std::cout << "convert " << filename << " ... ";
-
-		// Just convert packedsfenvalue to text
-		std::fstream fs;
-		fs.open(filename, ios::in | ios::binary);
-		PackedSfenValue p;
-		while (true)
-		{
-			if (fs.read((char*)&p, sizeof(PackedSfenValue))) {
-				StateInfo si;
-				tpos.set_from_packed_sfen(p.sfen, &si, th, false);
-
-				// write as plain text
-				ofs << "fen " << tpos.fen() << std::endl;
-				ofs << "move " << UCI::move(Move(p.move), false) << std::endl;
-				ofs << "score " << p.score << std::endl;
-				ofs << "ply " << int(p.gamePly) << std::endl;
-				ofs << "result " << int(p.game_result) << std::endl;
-				ofs << "e" << std::endl;
-			}
-			else {
-				break;
-			}
-		}
-		fs.close();
-		std::cout << "done" << std::endl;
-	}
-	ofs.close();
-	std::cout << "all done" << std::endl;
-}
-
-// Learning from the generated game record
-void learn(Position&, istringstream& is)
-{
-	auto thread_num = (int)Options["Threads"];
-	SfenReader sr(thread_num);
-
-	LearnerThink learn_think(sr);
-	vector<string> filenames;
-
-	// mini_batch_size 1M aspect by default. This can be increased.
-	auto mini_batch_size = LEARN_MINI_BATCH_SIZE;
-
-	// Number of loops (read the game record file this number of times)
-	int loop = 1;
-
-	// Game file storage folder (get game file with relative path from here)
-	string base_dir;
-
-	string target_dir;
-
-	// If 0, it will be the default value.
-	double eta1 = 0.0;
-	double eta2 = 0.0;
-	double eta3 = 0.0;
-	uint64_t eta1_epoch = 0; // eta2 is not applied by default
-	uint64_t eta2_epoch = 0; // eta3 is not applied by default
+        }
+        return false;
+    }
+
+    // Shuffle_files(), shuffle_files_quick() subcontracting, writing part.
+    // output_file_name: Name of the file to write
+    // prng: random number
+    // afs: fstream of each teacher phase file
+    // a_count: The number of teacher positions inherent in each file.
+    void shuffle_write(const string& output_file_name, PRNG& prng, vector<fstream>& afs, vector<uint64_t>& a_count)
+    {
+        uint64_t total_sfen_count = 0;
+        for (auto c : a_count)
+            total_sfen_count += c;
+
+        // number of exported phases
+        uint64_t write_sfen_count = 0;
+
+        // Output the progress on the screen for each phase.
+        const uint64_t buffer_size = 10000000;
+
+        auto print_status = [&]()
+        {
+            // Output progress every 10M phase or when all writing is completed
+            if (((write_sfen_count % buffer_size) == 0) ||
+                (write_sfen_count == total_sfen_count))
+                cout << write_sfen_count << " / " << total_sfen_count << endl;
+        };
+
+
+        cout << endl << "write : " << output_file_name << endl;
+
+        fstream fs(output_file_name, ios::out | ios::binary);
+
+        // total teacher positions
+        uint64_t sum = 0;
+        for (auto c : a_count)
+            sum += c;
+
+        while (sum != 0)
+        {
+            auto r = prng.rand(sum);
+
+            // Aspects stored in fs[0] file ... Aspects stored in fs[1] file ...
+            //Think of it as a series like, and determine in which file r is pointing.
+            // The contents of the file are shuffled, so you can take the next element from that file.
+            // Each file has a_count[x] phases, so this process can be written as follows.
+
+            uint64_t n = 0;
+            while (a_count[n] <= r)
+                r -= a_count[n++];
+
+            // This confirms n. Before you forget it, reduce the remaining number.
+
+            --a_count[n];
+            --sum;
+
+            PackedSfenValue psv;
+            // It's better to read and write all at once until the performance is not so good...
+            if (afs[n].read((char*)&psv, sizeof(PackedSfenValue)))
+            {
+                fs.write((char*)&psv, sizeof(PackedSfenValue));
+                ++write_sfen_count;
+                print_status();
+            }
+        }
+        print_status();
+        fs.close();
+        cout << "done!" << endl;
+    }
+
+    // Subcontracting the teacher shuffle "learn shuffle" command.
+    // output_file_name: name of the output file where the shuffled teacher positions will be written
+    void shuffle_files(const vector<string>& filenames, const string& output_file_name, uint64_t buffer_size)
+    {
+        // The destination folder is
+        // tmp/ for temporary writing
+
+        // Temporary file is written to tmp/ folder for each buffer_size phase.
+        // For example, if buffer_size = 20M, you need a buffer of 20M*40bytes = 800MB.
+        // In a PC with a small memory, it would be better to reduce this.
+        // However, if the number of files increases too much, it will not be possible to open at the same time due to OS restrictions.
+        // There should have been a limit of 512 per process on Windows, so you can open here as 500,
+        // The current setting is 500 files x 20M = 10G = 10 billion phases.
+
+        PSVector buf;
+        buf.resize(buffer_size);
+        // ↑ buffer, a marker that indicates how much you have used
+        uint64_t buf_write_marker = 0;
+
+        // File name to write (incremental counter because it is a serial number)
+        uint64_t write_file_count = 0;
+
+        // random number to shuffle
+        // Do not use std::random_device().  Because it always the same integers on MinGW.
+        PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
+
+        // generate the name of the temporary file
+        auto make_filename = [](uint64_t i)
+        {
+            return "tmp/" + to_string(i) + ".bin";
+        };
+
+        // Exported files in tmp/ folder, number of teacher positions stored in each
+        vector<uint64_t> a_count;
+
+        auto write_buffer = [&](uint64_t size)
+        {
+            // shuffle from buf[0] to buf[size-1]
+            for (uint64_t i = 0; i < size; ++i)
+                swap(buf[i], buf[(uint64_t)(prng.rand(size - i) + i)]);
+
+            // write to a file
+            fstream fs;
+            fs.open(make_filename(write_file_count++), ios::out | ios::binary);
+            fs.write((char*)&buf[0], size * sizeof(PackedSfenValue));
+            fs.close();
+            a_count.push_back(size);
+
+            buf_write_marker = 0;
+            cout << ".";
+        };
+
+        Dependency::mkdir("tmp");
+
+        // Shuffle and export as a 10M phase shredded file.
+        for (auto filename : filenames)
+        {
+            fstream fs(filename, ios::in | ios::binary);
+            cout << endl << "open file = " << filename;
+            while (fs.read((char*)&buf[buf_write_marker], sizeof(PackedSfenValue)))
+                if (++buf_write_marker == buffer_size)
+                    write_buffer(buffer_size);
+
+            // Read in units of sizeof(PackedSfenValue),
+            // Ignore the last remaining fraction. (Fails in fs.read, so exit while)
+            // (The remaining fraction seems to be half-finished data that was created because it was stopped halfway during teacher generation.)
+
+        }
+
+        if (buf_write_marker != 0)
+            write_buffer(buf_write_marker);
+
+        // Only shuffled files have been written write_file_count.
+        // As a second pass, if you open all of them at the same time, select one at random and load one phase at a time
+        // Now you have shuffled.
+
+        // Original file for shirt full + tmp file + file to write requires 3 times the storage capacity of the original file.
+        // 1 billion SSD is not enough for shuffling because it is 400GB for 10 billion phases.
+        // If you want to delete (or delete by hand) the original file at this point after writing to tmp,
+        // The storage capacity is about twice that of the original file.
+        // So, maybe we should have an option to delete the original file.
+
+        // Files are opened at the same time. It is highly possible that this will exceed FOPEN_MAX.
+        // In that case, rather than adjusting buffer_size to reduce the number of files.
+
+        vector<fstream> afs;
+        for (uint64_t i = 0; i < write_file_count; ++i)
+            afs.emplace_back(fstream(make_filename(i), ios::in | ios::binary));
+
+        // Throw to the subcontract function and end.
+        shuffle_write(output_file_name, prng, afs, a_count);
+    }
+
+    // Subcontracting the teacher shuffle "learn shuffleq" command.
+    // This is written in 1 pass.
+    // output_file_name: name of the output file where the shuffled teacher positions will be written
+    void shuffle_files_quick(const vector<string>& filenames, const string& output_file_name)
+    {
+        // random number to shuffle
+        // Do not use std::random_device().  Because it always the same integers on MinGW.
+        PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
+
+        // number of files
+        size_t file_count = filenames.size();
+
+        // Number of teacher positions stored in each file in filenames
+        vector<uint64_t> a_count(file_count);
+
+        // Count the number of teacher aspects in each file.
+        vector<fstream> afs(file_count);
+
+        for (size_t i = 0; i < file_count; ++i)
+        {
+            auto filename = filenames[i];
+            auto& fs = afs[i];
+
+            fs.open(filename, ios::in | ios::binary);
+            fs.seekg(0, fstream::end);
+            uint64_t eofPos = (uint64_t)fs.tellg();
+            fs.clear(); // Otherwise, the next seek may fail.
+            fs.seekg(0, fstream::beg);
+            uint64_t begPos = (uint64_t)fs.tellg();
+            uint64_t file_size = eofPos - begPos;
+            uint64_t sfen_count = file_size / sizeof(PackedSfenValue);
+            a_count[i] = sfen_count;
+
+            // Output the number of sfen stored in each file.
+            cout << filename << " = " << sfen_count << " sfens." << endl;
+        }
+
+        // Since we know the file size of each file,
+        // open them all at once (already open),
+        // Select one at a time and load one phase at a time
+        // Now you have shuffled.
+
+        // Throw to the subcontract function and end.
+        shuffle_write(output_file_name, prng, afs, a_count);
+    }
+
+    // Subcontracting the teacher shuffle "learn shufflem" command.
+    // Read the whole memory and write it out with the specified file name.
+    void shuffle_files_on_memory(const vector<string>& filenames, const string output_file_name)
+    {
+        PSVector buf;
+
+        for (auto filename : filenames)
+        {
+            std::cout << "read : " << filename << std::endl;
+            read_file_to_memory(filename, [&buf](uint64_t size) {
+                assert((size % sizeof(PackedSfenValue)) == 0);
+                // Expand the buffer and read after the last end.
+                uint64_t last = buf.size();
+                buf.resize(last + size / sizeof(PackedSfenValue));
+                return (void*)&buf[last];
+                });
+        }
+
+        // shuffle from buf[0] to buf[size-1]
+        // Do not use std::random_device().  Because it always the same integers on MinGW.
+        PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
+        uint64_t size = (uint64_t)buf.size();
+        std::cout << "shuffle buf.size() = " << size << std::endl;
+        for (uint64_t i = 0; i < size; ++i)
+            swap(buf[i], buf[(uint64_t)(prng.rand(size - i) + i)]);
+
+        std::cout << "write : " << output_file_name << endl;
+
+        // If the file to be written exceeds 2GB, it cannot be written in one shot with fstream::write, so use wrapper.
+        write_memory_to_file(output_file_name, (void*)&buf[0], (uint64_t)sizeof(PackedSfenValue) * (uint64_t)buf.size());
+
+        std::cout << "..shuffle_on_memory done." << std::endl;
+    }
+
+    // Learning from the generated game record
+    void learn(Position&, istringstream& is)
+    {
+        auto thread_num = (int)Options["Threads"];
+        SfenReader sr(thread_num);
+
+        LearnerThink learn_think(sr);
+        vector<string> filenames;
+
+        // mini_batch_size 1M aspect by default. This can be increased.
+        auto mini_batch_size = LEARN_MINI_BATCH_SIZE;
+
+        // Number of loops (read the game record file this number of times)
+        int loop = 1;
+
+        // Game file storage folder (get game file with relative path from here)
+        string base_dir;
+
+        string target_dir;
+
+        // If 0, it will be the default value.
+        double eta1 = 0.0;
+        double eta2 = 0.0;
+        double eta3 = 0.0;
+        uint64_t eta1_epoch = 0; // eta2 is not applied by default
+        uint64_t eta2_epoch = 0; // eta3 is not applied by default
 
 #if defined(USE_GLOBAL_OPTIONS)
-	// Save it for later restore.
-	auto oldGlobalOptions = GlobalOptions;
-	// If you hit the eval hash, you can not calculate rmse etc. so turn it off.
-	GlobalOptions.use_eval_hash = false;
-	// If you hit the replacement table, pruning may occur at the previous evaluation value, so turn it off.
-	GlobalOptions.use_hash_probe = false;
+    // Save it for later restore.
+        auto oldGlobalOptions = GlobalOptions;
+        // If you hit the eval hash, you can not calculate rmse etc. so turn it off.
+        GlobalOptions.use_eval_hash = false;
+        // If you hit the replacement table, pruning may occur at the previous evaluation value, so turn it off.
+        GlobalOptions.use_hash_probe = false;
 #endif
 
-	// --- Function that only shuffles the teacher aspect
+        // --- Function that only shuffles the teacher aspect
 
-	// normal shuffle
-	bool shuffle_normal = false;
-	uint64_t buffer_size = 20000000;
-	// fast shuffling assuming each file is shuffled
-	bool shuffle_quick = false;
-	// A function to read the entire file in memory and shuffle it. (Requires file size memory)
-	bool shuffle_on_memory = false;
-	// Conversion of packed sfen. In plain, it consists of sfen(string), evaluation value (integer), move (eg 7g7f, string), result (loss-1, win 1, draw 0)
-	bool use_convert_plain = false;
-	// convert plain format teacher to Yaneura King's bin
-	bool use_convert_bin = false;
-	int ply_minimum = 0;
-	int ply_maximum = 114514;
-	bool interpolate_eval = 0;
-	bool check_invalid_fen = false;
-	bool check_illegal_move = false;
-	// convert teacher in pgn-extract format to Yaneura King's bin
-	bool use_convert_bin_from_pgn_extract = false;
-	bool pgn_eval_side_to_move = false;
-	bool convert_no_eval_fens_as_score_zero = false;
-	// File name to write in those cases (default is "shuffled_sfen.bin")
-	string output_file_name = "shuffled_sfen.bin";
+        // normal shuffle
+        bool shuffle_normal = false;
+        uint64_t buffer_size = 20000000;
+        // fast shuffling assuming each file is shuffled
+        bool shuffle_quick = false;
+        // A function to read the entire file in memory and shuffle it. (Requires file size memory)
+        bool shuffle_on_memory = false;
+        // Conversion of packed sfen. In plain, it consists of sfen(string), evaluation value (integer), move (eg 7g7f, string), result (loss-1, win 1, draw 0)
+        bool use_convert_plain = false;
+        // convert plain format teacher to Yaneura King's bin
+        bool use_convert_bin = false;
+        int ply_minimum = 0;
+        int ply_maximum = 114514;
+        bool interpolate_eval = 0;
+        bool check_invalid_fen = false;
+        bool check_illegal_move = false;
+        // convert teacher in pgn-extract format to Yaneura King's bin
+        bool use_convert_bin_from_pgn_extract = false;
+        bool pgn_eval_side_to_move = false;
+        bool convert_no_eval_fens_as_score_zero = false;
+        // File name to write in those cases (default is "shuffled_sfen.bin")
+        string output_file_name = "shuffled_sfen.bin";
 
-	// If the absolute value of the evaluation value in the deep search of the teacher phase exceeds this value, that phase is discarded.
-	int eval_limit = 32000;
+        // If the absolute value of the evaluation value in the deep search of the teacher phase exceeds this value, that phase is discarded.
+        int eval_limit = 32000;
 
-	// Flag to save the evaluation function file only once near the end.
-	bool save_only_once = false;
+        // Flag to save the evaluation function file only once near the end.
+        bool save_only_once = false;
 
-	// Shuffle about what you are pre-reading on the teacher aspect. (Shuffle of about 10 million phases)
-	// Turn on if you want to pass a pre-shuffled file.
-	bool no_shuffle = false;
+        // Shuffle about what you are pre-reading on the teacher aspect. (Shuffle of about 10 million phases)
+        // Turn on if you want to pass a pre-shuffled file.
+        bool no_shuffle = false;
 
 #if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
-	// elmo lambda
-	ELMO_LAMBDA = 0.33;
-	ELMO_LAMBDA2 = 0.33;
-	ELMO_LAMBDA_LIMIT = 32000;
+        // elmo lambda
+        ELMO_LAMBDA = 0.33;
+        ELMO_LAMBDA2 = 0.33;
+        ELMO_LAMBDA_LIMIT = 32000;
 #endif
 
-	// Discount rate. If this is set to a value other than 0, the slope will be added even at other than the PV termination. (At that time, apply this discount rate)
-	double discount_rate = 0;
+        // Discount rate. If this is set to a value other than 0, the slope will be added even at other than the PV termination. (At that time, apply this discount rate)
+        double discount_rate = 0;
 
-	// if (gamePly <rand(reduction_gameply)) continue;
-	// An option to exclude the early stage from the learning target moderately like
-	// If set to 1, rand(1)==0, so nothing is excluded.
-	int reduction_gameply = 1;
+        // if (gamePly <rand(reduction_gameply)) continue;
+        // An option to exclude the early stage from the learning target moderately like
+        // If set to 1, rand(1)==0, so nothing is excluded.
+        int reduction_gameply = 1;
 
-	// Optional item that does not let you learn KK/KKP/KPP/KPPP
-	array<bool,4> freeze = {};
+        // Optional item that does not let you learn KK/KKP/KPP/KPPP
+        array<bool, 4> freeze = {};
 
 #if defined(EVAL_NNUE)
-	uint64_t nn_batch_size = 1000;
-	double newbob_decay = 1.0;
-	int newbob_num_trials = 2;
-	string nn_options;
+        uint64_t nn_batch_size = 1000;
+        double newbob_decay = 1.0;
+        int newbob_num_trials = 2;
+        string nn_options;
 #endif
 
-	uint64_t eval_save_interval = LEARN_EVAL_SAVE_INTERVAL;
-	uint64_t loss_output_interval = 0;
-	uint64_t mirror_percentage = 0;
+        uint64_t eval_save_interval = LEARN_EVAL_SAVE_INTERVAL;
+        uint64_t loss_output_interval = 0;
+        uint64_t mirror_percentage = 0;
 
-	string validation_set_file_name;
+        string validation_set_file_name;
 
-	// Assume the filenames are staggered.
-	while (true)
-	{
-		string option;
-		is >> option;
+        // Assume the filenames are staggered.
+        while (true)
+        {
+            string option;
+            is >> option;
 
-		if (option == "")
-			break;
+            if (option == "")
+                break;
 
-		// specify the number of phases of mini-batch
-		if (option == "bat")
-		{
-			is >> mini_batch_size;
-			mini_batch_size *= 10000; // Unit is ten thousand
-		}
+            // specify the number of phases of mini-batch
+            if (option == "bat")
+            {
+                is >> mini_batch_size;
+                mini_batch_size *= 10000; // Unit is ten thousand
+            }
 
-		// Specify the folder in which the game record is stored and make it the rooting target.
-		else if (option == "targetdir") is >> target_dir;
+            // Specify the folder in which the game record is stored and make it the rooting target.
+            else if (option == "targetdir") is >> target_dir;
 
-		// Specify the number of loops
-		else if (option == "loop")      is >> loop;
+            // Specify the number of loops
+            else if (option == "loop")      is >> loop;
 
-		// Game file storage folder (get game file with relative path from here)
-		else if (option == "basedir")   is >> base_dir;
+            // Game file storage folder (get game file with relative path from here)
+            else if (option == "basedir")   is >> base_dir;
 
-		// Mini batch size
-		else if (option == "batchsize") is >> mini_batch_size;
+            // Mini batch size
+            else if (option == "batchsize") is >> mini_batch_size;
 
-		// learning rate
-		else if (option == "eta")        is >> eta1;
-		else if (option == "eta1")       is >> eta1; // alias
-		else if (option == "eta2")       is >> eta2;
-		else if (option == "eta3")       is >> eta3;
-		else if (option == "eta1_epoch") is >> eta1_epoch;
-		else if (option == "eta2_epoch") is >> eta2_epoch;
-		// Accept also the old option name.
-		else if (option == "use_draw_in_training" || option == "use_draw_games_in_training") is >> use_draw_games_in_training;
-		// Accept also the old option name.
-		else if (option == "use_draw_in_validation" || option == "use_draw_games_in_validation") is >> use_draw_games_in_validation;
-		// Accept also the old option name.
-		else if (option == "use_hash_in_training" || option == "skip_duplicated_positions_in_training") is >> skip_duplicated_positions_in_training;
-		else if (option == "winning_probability_coefficient") is >> winning_probability_coefficient;
-		// Discount rate
-		else if (option == "discount_rate") is >> discount_rate;
-		// Using WDL with win rate model instead of sigmoid
-		else if (option == "use_wdl") is >> use_wdl;
+            // learning rate
+            else if (option == "eta")        is >> eta1;
+            else if (option == "eta1")       is >> eta1; // alias
+            else if (option == "eta2")       is >> eta2;
+            else if (option == "eta3")       is >> eta3;
+            else if (option == "eta1_epoch") is >> eta1_epoch;
+            else if (option == "eta2_epoch") is >> eta2_epoch;
+            // Accept also the old option name.
+            else if (option == "use_draw_in_training" || option == "use_draw_games_in_training") is >> use_draw_games_in_training;
+            // Accept also the old option name.
+            else if (option == "use_draw_in_validation" || option == "use_draw_games_in_validation") is >> use_draw_games_in_validation;
+            // Accept also the old option name.
+            else if (option == "use_hash_in_training" || option == "skip_duplicated_positions_in_training") is >> skip_duplicated_positions_in_training;
+            else if (option == "winning_probability_coefficient") is >> winning_probability_coefficient;
+            // Discount rate
+            else if (option == "discount_rate") is >> discount_rate;
+            // Using WDL with win rate model instead of sigmoid
+            else if (option == "use_wdl") is >> use_wdl;
 
-		// No learning of KK/KKP/KPP/KPPP.
-		else if (option == "freeze_kk")    is >> freeze[0];
-		else if (option == "freeze_kkp")   is >> freeze[1];
-		else if (option == "freeze_kpp")   is >> freeze[2];
+            // No learning of KK/KKP/KPP/KPPP.
+            else if (option == "freeze_kk")    is >> freeze[0];
+            else if (option == "freeze_kkp")   is >> freeze[1];
+            else if (option == "freeze_kpp")   is >> freeze[2];
 
 #if defined(EVAL_KPPT) || defined(EVAL_KPP_KKPT) || defined(EVAL_KPP_KKPT_FV_VAR) || defined(EVAL_NABLA)
 
 #elif defined(EVAL_KPPPT) || defined(EVAL_KPPP_KKPT) || defined(EVAL_HELICES)
-		else if (option == "freeze_kppp")  is >> freeze[3];
+            else if (option == "freeze_kppp")  is >> freeze[3];
 #elif defined(EVAL_KKPP_KKPT) || defined(EVAL_KKPPT)
-		else if (option == "freeze_kkpp")  is >> freeze[3];
+            else if (option == "freeze_kkpp")  is >> freeze[3];
 #endif
 
 #if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
-		// LAMBDA
-		else if (option == "lambda")       is >> ELMO_LAMBDA;
-		else if (option == "lambda2")      is >> ELMO_LAMBDA2;
-		else if (option == "lambda_limit") is >> ELMO_LAMBDA_LIMIT;
+            // LAMBDA
+            else if (option == "lambda")       is >> ELMO_LAMBDA;
+            else if (option == "lambda2")      is >> ELMO_LAMBDA2;
+            else if (option == "lambda_limit") is >> ELMO_LAMBDA_LIMIT;
 
 #endif
-		else if (option == "reduction_gameply") is >> reduction_gameply;
+            else if (option == "reduction_gameply") is >> reduction_gameply;
 
-		// shuffle related
-		else if (option == "shuffle")	shuffle_normal = true;
-		else if (option == "buffer_size") is >> buffer_size;
-		else if (option == "shuffleq")	shuffle_quick = true;
-		else if (option == "shufflem")	shuffle_on_memory = true;
-		else if (option == "output_file_name") is >> output_file_name;
+            // shuffle related
+            else if (option == "shuffle")   shuffle_normal = true;
+            else if (option == "buffer_size") is >> buffer_size;
+            else if (option == "shuffleq")  shuffle_quick = true;
+            else if (option == "shufflem")  shuffle_on_memory = true;
+            else if (option == "output_file_name") is >> output_file_name;
 
-		else if (option == "eval_limit") is >> eval_limit;
-		else if (option == "save_only_once") save_only_once = true;
-		else if (option == "no_shuffle") no_shuffle = true;
+            else if (option == "eval_limit") is >> eval_limit;
+            else if (option == "save_only_once") save_only_once = true;
+            else if (option == "no_shuffle") no_shuffle = true;
 
 #if defined(EVAL_NNUE)
-		else if (option == "nn_batch_size") is >> nn_batch_size;
-		else if (option == "newbob_decay") is >> newbob_decay;
-		else if (option == "newbob_num_trials") is >> newbob_num_trials;
-		else if (option == "nn_options") is >> nn_options;
+            else if (option == "nn_batch_size") is >> nn_batch_size;
+            else if (option == "newbob_decay") is >> newbob_decay;
+            else if (option == "newbob_num_trials") is >> newbob_num_trials;
+            else if (option == "nn_options") is >> nn_options;
 #endif
-		else if (option == "eval_save_interval") is >> eval_save_interval;
-		else if (option == "loss_output_interval") is >> loss_output_interval;
-		else if (option == "mirror_percentage") is >> mirror_percentage;
-		else if (option == "validation_set_file_name") is >> validation_set_file_name;
+            else if (option == "eval_save_interval") is >> eval_save_interval;
+            else if (option == "loss_output_interval") is >> loss_output_interval;
+            else if (option == "mirror_percentage") is >> mirror_percentage;
+            else if (option == "validation_set_file_name") is >> validation_set_file_name;
 
-		// Rabbit convert related
-		else if (option == "convert_plain") use_convert_plain = true;
-		else if (option == "convert_bin") use_convert_bin = true;
-		else if (option == "interpolate_eval") is >> interpolate_eval;
-		else if (option == "check_invalid_fen") is >> check_invalid_fen;
-		else if (option == "check_illegal_move") is >> check_illegal_move;
-		else if (option == "convert_bin_from_pgn-extract") use_convert_bin_from_pgn_extract = true;
-		else if (option == "pgn_eval_side_to_move") is >> pgn_eval_side_to_move;
-		else if (option == "convert_no_eval_fens_as_score_zero") is >> convert_no_eval_fens_as_score_zero;
-		else if (option == "src_score_min_value") is >> src_score_min_value;
-		else if (option == "src_score_max_value") is >> src_score_max_value;
-		else if (option == "dest_score_min_value") is >> dest_score_min_value;
-		else if (option == "dest_score_max_value") is >> dest_score_max_value;
-		else if (option == "convert_teacher_signal_to_winning_probability") is >> convert_teacher_signal_to_winning_probability;
-		else if (option == "use_raw_nnue_eval") is >> use_raw_nnue_eval;
+            // Rabbit convert related
+            else if (option == "convert_plain") use_convert_plain = true;
+            else if (option == "convert_bin") use_convert_bin = true;
+            else if (option == "interpolate_eval") is >> interpolate_eval;
+            else if (option == "check_invalid_fen") is >> check_invalid_fen;
+            else if (option == "check_illegal_move") is >> check_illegal_move;
+            else if (option == "convert_bin_from_pgn-extract") use_convert_bin_from_pgn_extract = true;
+            else if (option == "pgn_eval_side_to_move") is >> pgn_eval_side_to_move;
+            else if (option == "convert_no_eval_fens_as_score_zero") is >> convert_no_eval_fens_as_score_zero;
+            else if (option == "src_score_min_value") is >> src_score_min_value;
+            else if (option == "src_score_max_value") is >> src_score_max_value;
+            else if (option == "dest_score_min_value") is >> dest_score_min_value;
+            else if (option == "dest_score_max_value") is >> dest_score_max_value;
+            else if (option == "convert_teacher_signal_to_winning_probability") is >> convert_teacher_signal_to_winning_probability;
+            else if (option == "use_raw_nnue_eval") is >> use_raw_nnue_eval;
 
-		// Otherwise, it's a filename.
-		else
-			filenames.push_back(option);
-	}
-	if (loss_output_interval == 0)
-		loss_output_interval = LEARN_RMSE_OUTPUT_INTERVAL * mini_batch_size;
+            // Otherwise, it's a filename.
+            else
+                filenames.push_back(option);
+        }
+        if (loss_output_interval == 0)
+            loss_output_interval = LEARN_RMSE_OUTPUT_INTERVAL * mini_batch_size;
 
-	cout << "learn command , ";
+        cout << "learn command , ";
 
-	// Issue a warning if OpenMP is disabled.
+        // Issue a warning if OpenMP is disabled.
 #if !defined(_OPENMP)
-	cout << "Warning! OpenMP disabled." << endl;
+        cout << "Warning! OpenMP disabled." << endl;
 #endif
 
-	// Display learning game file
-	if (target_dir != "")
-	{
-		string kif_base_dir = Path::Combine(base_dir, target_dir);
+        // Display learning game file
+        if (target_dir != "")
+        {
+            string kif_base_dir = Path::Combine(base_dir, target_dir);
 
-		// Remove this folder. Keep it relative to base_dir.
+            // Remove this folder. Keep it relative to base_dir.
 #if defined(_MSC_VER)
-		// If you use std::tr2, warning C4996 will appear, so suppress it.
-		// * std::tr2 issued a deprecation warning by default under std:c++14, and was deleted by default in /std:c++17.
-		#pragma warning(push)
-		#pragma warning(disable:4996)
+        // If you use std::tr2, warning C4996 will appear, so suppress it.
+        // * std::tr2 issued a deprecation warning by default under std:c++14, and was deleted by default in /std:c++17.
+#pragma warning(push)
+#pragma warning(disable:4996)
 
-		namespace sys = std::filesystem;
-		sys::path p(kif_base_dir); // Origin of enumeration
-		std::for_each(sys::directory_iterator(p), sys::directory_iterator(),
-			[&](const sys::path& p) {
-			if (sys::is_regular_file(p))
-				filenames.push_back(Path::Combine(target_dir, p.filename().generic_string()));
-		});
-		#pragma warning(pop)
+            namespace sys = std::filesystem;
+            sys::path p(kif_base_dir); // Origin of enumeration
+            std::for_each(sys::directory_iterator(p), sys::directory_iterator(),
+                [&](const sys::path& p) {
+                    if (sys::is_regular_file(p))
+                        filenames.push_back(Path::Combine(target_dir, p.filename().generic_string()));
+                });
+#pragma warning(pop)
 
 #elif defined(__GNUC__)
 
-		auto ends_with = [](std::string const & value, std::string const & ending)
-		{
-			if (ending.size() > value.size()) return false;
-			return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
-		};
+            auto ends_with = [](std::string const& value, std::string const& ending)
+            {
+                if (ending.size() > value.size()) return false;
+                return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
+            };
 
-		// It can't be helped, so read it using dirent.h.
-		DIR *dp; // pointer to directory
-		dirent* entry; // entry point returned by readdir()
+            // It can't be helped, so read it using dirent.h.
+            DIR* dp; // pointer to directory
+            dirent* entry; // entry point returned by readdir()
 
-		dp = opendir(kif_base_dir.c_str());
-		if (dp != NULL)
-		{
-			do {
-				entry = readdir(dp);
-				// Only list files ending with ".bin"
-				// →I hate this restriction when generating files with serial numbers...
-				if (entry != NULL  && ends_with(entry->d_name, ".bin")  )
-				{
-					//cout << entry->d_name << endl;
-					filenames.push_back(Path::Combine(target_dir, entry->d_name));
-				}
-			} while (entry != NULL);
-			closedir(dp);
-		}
+            dp = opendir(kif_base_dir.c_str());
+            if (dp != NULL)
+            {
+                do {
+                    entry = readdir(dp);
+                    // Only list files ending with ".bin"
+                    // →I hate this restriction when generating files with serial numbers...
+                    if (entry != NULL && ends_with(entry->d_name, ".bin"))
+                    {
+                        //cout << entry->d_name << endl;
+                        filenames.push_back(Path::Combine(target_dir, entry->d_name));
+                    }
+                } while (entry != NULL);
+                closedir(dp);
+            }
 #endif
-	}
+        }
 
-	cout << "learn from ";
-	for (auto s : filenames)
-		cout << s << " , ";
-	cout << endl;
-	if (!validation_set_file_name.empty())
-	{
-		cout << "validation set  : " << validation_set_file_name << endl;
-	}
+        cout << "learn from ";
+        for (auto s : filenames)
+            cout << s << " , ";
+        cout << endl;
+        if (!validation_set_file_name.empty())
+        {
+            cout << "validation set  : " << validation_set_file_name << endl;
+        }
 
-	cout << "base dir        : " << base_dir   << endl;
-	cout << "target dir      : " << target_dir << endl;
+        cout << "base dir        : " << base_dir << endl;
+        cout << "target dir      : " << target_dir << endl;
 
-	// shuffle mode
-	if (shuffle_normal)
-	{
-		cout << "buffer_size     : " << buffer_size << endl;
-		cout << "shuffle mode.." << endl;
-		shuffle_files(filenames,output_file_name , buffer_size);
-		return;
-	}
-	if (shuffle_quick)
-	{
-		cout << "quick shuffle mode.." << endl;
-		shuffle_files_quick(filenames, output_file_name);
-		return;
-	}
-	if (shuffle_on_memory)
-	{
-		cout << "shuffle on memory.." << endl;
-		shuffle_files_on_memory(filenames,output_file_name);
-		return;
-	}
-	if (use_convert_plain)
-	{
-		Eval::init_NNUE();
-		cout << "convert_plain.." << endl;
-		convert_plain(filenames, output_file_name);
-		return;
-	}
-	if (use_convert_bin)
-	{
-		Eval::init_NNUE();
-		cout << "convert_bin.." << endl;
-		convert_bin(filenames,output_file_name, ply_minimum, ply_maximum, interpolate_eval, check_invalid_fen, check_illegal_move);
-		return;
+        // shuffle mode
+        if (shuffle_normal)
+        {
+            cout << "buffer_size     : " << buffer_size << endl;
+            cout << "shuffle mode.." << endl;
+            shuffle_files(filenames, output_file_name, buffer_size);
+            return;
+        }
+        if (shuffle_quick)
+        {
+            cout << "quick shuffle mode.." << endl;
+            shuffle_files_quick(filenames, output_file_name);
+            return;
+        }
+        if (shuffle_on_memory)
+        {
+            cout << "shuffle on memory.." << endl;
+            shuffle_files_on_memory(filenames, output_file_name);
+            return;
+        }
+        if (use_convert_plain)
+        {
+            Eval::init_NNUE();
+            cout << "convert_plain.." << endl;
+            convert_plain(filenames, output_file_name);
+            return;
+        }
+        if (use_convert_bin)
+        {
+            Eval::init_NNUE();
+            cout << "convert_bin.." << endl;
+            convert_bin(
+                filenames, 
+                output_file_name, 
+                ply_minimum, 
+                ply_maximum, 
+                interpolate_eval, 
+                src_score_min_value,
+                src_score_max_value,
+                dest_score_min_value,
+                dest_score_max_value,
+                check_invalid_fen, 
+                check_illegal_move);
+            return;
 
-	}
-	if (use_convert_bin_from_pgn_extract)
-	{
-		Eval::init_NNUE();
-		cout << "convert_bin_from_pgn-extract.." << endl;
-		convert_bin_from_pgn_extract(filenames, output_file_name, pgn_eval_side_to_move, convert_no_eval_fens_as_score_zero);
-		return;
-	}
+        }
+        if (use_convert_bin_from_pgn_extract)
+        {
+            Eval::init_NNUE();
+            cout << "convert_bin_from_pgn-extract.." << endl;
+            convert_bin_from_pgn_extract(filenames, output_file_name, pgn_eval_side_to_move, convert_no_eval_fens_as_score_zero);
+            return;
+        }
 
-	cout << "loop              : " << loop << endl;
-	cout << "eval_limit        : " << eval_limit << endl;
-	cout << "save_only_once    : " << (save_only_once ? "true" : "false") << endl;
-	cout << "no_shuffle        : " << (no_shuffle ? "true" : "false") << endl;
+        cout << "loop              : " << loop << endl;
+        cout << "eval_limit        : " << eval_limit << endl;
+        cout << "save_only_once    : " << (save_only_once ? "true" : "false") << endl;
+        cout << "no_shuffle        : " << (no_shuffle ? "true" : "false") << endl;
 
-	// Insert the file name for the number of loops.
-	for (int i = 0; i < loop; ++i)
-		// sfen reader, I'll read it in reverse order so I'll reverse it here. I'm sorry.
-		for (auto it = filenames.rbegin(); it != filenames.rend(); ++it)
-			sr.filenames.push_back(Path::Combine(base_dir, *it));
+        // Insert the file name for the number of loops.
+        for (int i = 0; i < loop; ++i)
+            // sfen reader, I'll read it in reverse order so I'll reverse it here. I'm sorry.
+            for (auto it = filenames.rbegin(); it != filenames.rend(); ++it)
+                sr.filenames.push_back(Path::Combine(base_dir, *it));
 
 #if !defined(EVAL_NNUE)
-	cout << "Gradient Method   : " << LEARN_UPDATE      << endl;
+        cout << "Gradient Method   : " << LEARN_UPDATE << endl;
 #endif
-	cout << "Loss Function     : " << LOSS_FUNCTION     << endl;
-	cout << "mini-batch size   : " << mini_batch_size   << endl;
+        cout << "Loss Function     : " << LOSS_FUNCTION << endl;
+        cout << "mini-batch size   : " << mini_batch_size << endl;
 #if defined(EVAL_NNUE)
-	cout << "nn_batch_size     : " << nn_batch_size     << endl;
-	cout << "nn_options        : " << nn_options        << endl;
+        cout << "nn_batch_size     : " << nn_batch_size << endl;
+        cout << "nn_options        : " << nn_options << endl;
 #endif
-	cout << "learning rate     : " << eta1 << " , " << eta2 << " , " << eta3 << endl;
-	cout << "eta_epoch         : " << eta1_epoch << " , " << eta2_epoch << endl;
-	cout << "use_draw_games_in_training : " << use_draw_games_in_training << endl;
-	cout << "use_draw_games_in_validation : " << use_draw_games_in_validation << endl;
-	cout << "skip_duplicated_positions_in_training : " << skip_duplicated_positions_in_training << endl;
+        cout << "learning rate     : " << eta1 << " , " << eta2 << " , " << eta3 << endl;
+        cout << "eta_epoch         : " << eta1_epoch << " , " << eta2_epoch << endl;
+        cout << "use_draw_games_in_training : " << use_draw_games_in_training << endl;
+        cout << "use_draw_games_in_validation : " << use_draw_games_in_validation << endl;
+        cout << "skip_duplicated_positions_in_training : " << skip_duplicated_positions_in_training << endl;
 #if defined(EVAL_NNUE)
-	if (newbob_decay != 1.0) {
-		cout << "scheduling        : newbob with decay = " << newbob_decay
-		     << ", " << newbob_num_trials << " trials" << endl;
-	} else {
-		cout << "scheduling        : default" << endl;
-	}
+        if (newbob_decay != 1.0) {
+            cout << "scheduling        : newbob with decay = " << newbob_decay
+                << ", " << newbob_num_trials << " trials" << endl;
+        }
+        else {
+            cout << "scheduling        : default" << endl;
+        }
 #endif
-	cout << "discount rate     : " << discount_rate     << endl;
+        cout << "discount rate     : " << discount_rate << endl;
 
-	// If reduction_gameply is set to 0, rand(0) will be divided by 0, so correct it to 1.
-	reduction_gameply = max(reduction_gameply, 1);
-	cout << "reduction_gameply : " << reduction_gameply << endl;
+        // If reduction_gameply is set to 0, rand(0) will be divided by 0, so correct it to 1.
+        reduction_gameply = max(reduction_gameply, 1);
+        cout << "reduction_gameply : " << reduction_gameply << endl;
 
 #if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
-	cout << "LAMBDA            : " << ELMO_LAMBDA       << endl;
-	cout << "LAMBDA2           : " << ELMO_LAMBDA2      << endl;
-	cout << "LAMBDA_LIMIT      : " << ELMO_LAMBDA_LIMIT << endl;
+        cout << "LAMBDA            : " << ELMO_LAMBDA << endl;
+        cout << "LAMBDA2           : " << ELMO_LAMBDA2 << endl;
+        cout << "LAMBDA_LIMIT      : " << ELMO_LAMBDA_LIMIT << endl;
 #endif
-	cout << "mirror_percentage : " << mirror_percentage << endl;
-	cout << "eval_save_interval  : " << eval_save_interval << " sfens" << endl;
-	cout << "loss_output_interval: " << loss_output_interval << " sfens" << endl;
+        cout << "mirror_percentage : " << mirror_percentage << endl;
+        cout << "eval_save_interval  : " << eval_save_interval << " sfens" << endl;
+        cout << "loss_output_interval: " << loss_output_interval << " sfens" << endl;
 
 #if defined(EVAL_KPPT) || defined(EVAL_KPP_KKPT) || defined(EVAL_KPP_KKPT_FV_VAR) || defined(EVAL_NABLA)
-	cout << "freeze_kk/kkp/kpp      : " << freeze[0] << " , " << freeze[1] << " , " << freeze[2] << endl;
+        cout << "freeze_kk/kkp/kpp      : " << freeze[0] << " , " << freeze[1] << " , " << freeze[2] << endl;
 #elif defined(EVAL_KPPPT) || defined(EVAL_KPPP_KKPT) || defined(EVAL_HELICES)
-	cout << "freeze_kk/kkp/kpp/kppp : " << freeze[0] << " , " << freeze[1] << " , " << freeze[2] << " , " << freeze[3] << endl;
+        cout << "freeze_kk/kkp/kpp/kppp : " << freeze[0] << " , " << freeze[1] << " , " << freeze[2] << " , " << freeze[3] << endl;
 #elif defined(EVAL_KKPP_KKPT) || defined(EVAL_KKPPT)
-	cout << "freeze_kk/kkp/kpp/kkpp : " << freeze[0] << " , " << freeze[1] << " , " << freeze[2] << " , " << freeze[3] << endl;
+        cout << "freeze_kk/kkp/kpp/kkpp : " << freeze[0] << " , " << freeze[1] << " , " << freeze[2] << " , " << freeze[3] << endl;
 #endif
 
-	// -----------------------------------
-	// various initialization
-	// -----------------------------------
+        // -----------------------------------
+        // various initialization
+        // -----------------------------------
 
-	cout << "init.." << endl;
+        cout << "init.." << endl;
 
-	// Read evaluation function parameters
-	Eval::init_NNUE();
+        // Read evaluation function parameters
+        Eval::init_NNUE();
 
 #if !defined(EVAL_NNUE)
-	cout << "init_grad.." << endl;
+        cout << "init_grad.." << endl;
 
-	// Initialize gradient array of merit function parameters
-	Eval::init_grad(eta1,eta1_epoch,eta2,eta2_epoch,eta3);
+        // Initialize gradient array of merit function parameters
+        Eval::init_grad(eta1, eta1_epoch, eta2, eta2_epoch, eta3);
 #else
-	cout << "init_training.." << endl;
-	Eval::NNUE::InitializeTraining(eta1,eta1_epoch,eta2,eta2_epoch,eta3);
-	Eval::NNUE::SetBatchSize(nn_batch_size);
-	Eval::NNUE::SetOptions(nn_options);
-	if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
-		learn_think.best_nn_directory = std::string(Options["EvalDir"]);
-	}
+        cout << "init_training.." << endl;
+        Eval::NNUE::InitializeTraining(eta1, eta1_epoch, eta2, eta2_epoch, eta3);
+        Eval::NNUE::SetBatchSize(nn_batch_size);
+        Eval::NNUE::SetOptions(nn_options);
+        if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
+            learn_think.best_nn_directory = std::string(Options["EvalDir"]);
+        }
 #endif
 
 #if 0
-	// A test to give a gradient of 1.0 to the initial stage of Hirate.
-	pos.set_hirate();
-	cout << Eval::evaluate(pos) << endl;
-	//Eval::print_eval_stat(pos);
-	Eval::add_grad(pos, BLACK, 32.0 , false);
-	Eval::update_weights(1);
-	pos.state()->sum.p[2][0] = VALUE_NOT_EVALUATED;
-	cout << Eval::evaluate(pos) << endl;
-	//Eval::print_eval_stat(pos);
+        // A test to give a gradient of 1.0 to the initial stage of Hirate.
+        pos.set_hirate();
+        cout << Eval::evaluate(pos) << endl;
+        //Eval::print_eval_stat(pos);
+        Eval::add_grad(pos, BLACK, 32.0, false);
+        Eval::update_weights(1);
+        pos.state()->sum.p[2][0] = VALUE_NOT_EVALUATED;
+        cout << Eval::evaluate(pos) << endl;
+        //Eval::print_eval_stat(pos);
 #endif
 
-	cout << "init done." << endl;
+        cout << "init done." << endl;
 
-	// Reflect other option settings.
-	learn_think.discount_rate = discount_rate;
-	learn_think.eval_limit = eval_limit;
-	learn_think.save_only_once = save_only_once;
-	learn_think.sr.no_shuffle = no_shuffle;
-	learn_think.freeze = freeze;
-	learn_think.reduction_gameply = reduction_gameply;
+        // Reflect other option settings.
+        learn_think.discount_rate = discount_rate;
+        learn_think.eval_limit = eval_limit;
+        learn_think.save_only_once = save_only_once;
+        learn_think.sr.no_shuffle = no_shuffle;
+        learn_think.freeze = freeze;
+        learn_think.reduction_gameply = reduction_gameply;
 #if defined(EVAL_NNUE)
-	learn_think.newbob_scale = 1.0;
-	learn_think.newbob_decay = newbob_decay;
-	learn_think.newbob_num_trials = newbob_num_trials;
+        learn_think.newbob_scale = 1.0;
+        learn_think.newbob_decay = newbob_decay;
+        learn_think.newbob_num_trials = newbob_num_trials;
 #endif
-	learn_think.eval_save_interval = eval_save_interval;
-	learn_think.loss_output_interval = loss_output_interval;
-	learn_think.mirror_percentage = mirror_percentage;
+        learn_think.eval_save_interval = eval_save_interval;
+        learn_think.loss_output_interval = loss_output_interval;
+        learn_think.mirror_percentage = mirror_percentage;
 
-	// Start a thread that loads the phase file in the background
-	// (If this is not started, mse cannot be calculated.)
-	learn_think.start_file_read_worker();
+        // Start a thread that loads the phase file in the background
+        // (If this is not started, mse cannot be calculated.)
+        learn_think.start_file_read_worker();
 
-	learn_think.mini_batch_size = mini_batch_size;
+        learn_think.mini_batch_size = mini_batch_size;
 
-	if (validation_set_file_name.empty()) {
-	// Get about 10,000 data for mse calculation.
-		sr.read_for_mse();
-	} else {
-		sr.read_validation_set(validation_set_file_name, eval_limit);
-	}
+        if (validation_set_file_name.empty()) {
+            // Get about 10,000 data for mse calculation.
+            sr.read_for_mse();
+        }
+        else {
+            sr.read_validation_set(validation_set_file_name, eval_limit);
+        }
 
-	// Calculate rmse once at this point (timing of 0 sfen)
-	// sr.calc_rmse();
+        // Calculate rmse once at this point (timing of 0 sfen)
+        // sr.calc_rmse();
 #if defined(EVAL_NNUE)
-	if (newbob_decay != 1.0) {
-		learn_think.calc_loss(0, -1);
-		learn_think.best_loss = learn_think.latest_loss_sum / learn_think.latest_loss_count;
-		learn_think.latest_loss_sum = 0.0;
-		learn_think.latest_loss_count = 0;
-		cout << "initial loss: " << learn_think.best_loss << endl;
-	}
+        if (newbob_decay != 1.0) {
+            learn_think.calc_loss(0, -1);
+            learn_think.best_loss = learn_think.latest_loss_sum / learn_think.latest_loss_count;
+            learn_think.latest_loss_sum = 0.0;
+            learn_think.latest_loss_count = 0;
+            cout << "initial loss: " << learn_think.best_loss << endl;
+        }
 #endif
 
-	// -----------------------------------
-	// start learning evaluation function parameters
-	// -----------------------------------
+        // -----------------------------------
+        // start learning evaluation function parameters
+        // -----------------------------------
 
-	// Start learning.
-	learn_think.go_think();
+        // Start learning.
+        learn_think.go_think();
 
-	// Save once at the end.
-	learn_think.save(true);
+        // Save once at the end.
+        learn_think.save(true);
 
 #if defined(USE_GLOBAL_OPTIONS)
-	// Restore Global Options.
-	GlobalOptions = oldGlobalOptions;
+        // Restore Global Options.
+        GlobalOptions = oldGlobalOptions;
 #endif
-}
+    }
 
 
 } // namespace Learner
diff --git a/src/learn/multi_think.h b/src/learn/multi_think.h
index 6e6c695c..6225144c 100644
--- a/src/learn/multi_think.h
+++ b/src/learn/multi_think.h
@@ -11,12 +11,15 @@
 #include "../thread_win32_osx.h"
 
 #include <atomic>
+#include <limits>
 
 // Learning from a game record, when making yourself think and generating a fixed track, etc.
 // Helper class used when multiple threads want to call Search::think() individually.
 // Derive and use this class.
 struct MultiThink
 {
+	static constexpr std::uint64_t LOOP_COUNT_FINISHED = std::numeric_limits<std::uint64_t>::max();
+
 	MultiThink() : prng(std::chrono::system_clock::now().time_since_epoch().count())
 	{
 		loop_count = 0;
@@ -62,7 +65,7 @@ struct MultiThink
 	uint64_t get_next_loop_count() {
 		std::unique_lock<std::mutex> lk(loop_mutex);
 		if (loop_count >= loop_max)
-			return UINT64_MAX;
+			return LOOP_COUNT_FINISHED;
 		return loop_count++;
 	}
 

From aa2de712302a2379d8aa26127d86455ad276f512 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Wed, 2 Sep 2020 08:05:08 +0200
Subject: [PATCH 014/398] Update CPU contributors list

with fishtest data of Sept. 2 2020

closes https://github.com/official-stockfish/Stockfish/pull/3095

No functional change
---
 Top CPU Contributors.txt | 319 +++++++++++++++++++++------------------
 1 file changed, 169 insertions(+), 150 deletions(-)

diff --git a/Top CPU Contributors.txt b/Top CPU Contributors.txt
index 0ea5ac72..482e9000 100644
--- a/Top CPU Contributors.txt	
+++ b/Top CPU Contributors.txt	
@@ -1,154 +1,173 @@
-Contributors with >10,000 CPU hours as of January 7, 2020
+Contributors with >10,000 CPU hours as of Sept 2, 2020
 Thank you!
 
 Username                  CPU Hours   Games played
 --------------------------------------------------
-noobpwnftw                  9305707      695548021
-mlang                        780050       61648867
-dew                          621626       43921547
-mibere                       524702       42238645
-crunchy                      354587       27344275
-cw                           354495       27274181
-fastgm                       332801       22804359
-JojoM                        295750       20437451
-CSU_Dynasty                  262015       21828122
-Fisherman                    232181       18939229
-ctoks                        218866       17622052
-glinscott                    201989       13780820
-tvijlbrief                   201204       15337115
-velislav                     188630       14348485
-gvreuls                      187164       15149976
-bking_US                     180289       11876016
-nordlandia                   172076       13467830
-leszek                       157152       11443978
-Thanar                       148021       12365359
-spams                        141975       10319326
-drabel                       138073       11121749
-vdv                          137850        9394330
-mgrabiak                     133578       10454324
-TueRens                      132485       10878471
-bcross                       129683       11557084
-marrco                       126078        9356740
-sqrt2                        125830        9724586
-robal                        122873        9593418
-vdbergh                      120766        8926915
-malala                       115926        8002293
-CoffeeOne                    114241        5004100
-dsmith                       113189        7570238
-BrunoBanani                  104644        7436849
-Data                          92328        8220352
-mhoram                        89333        6695109
-davar                         87924        7009424
-xoto                          81094        6869316
-ElbertoOne                    80899        7023771
-grandphish2                   78067        6160199
-brabos                        77212        6186135
-psk                           75733        5984901
-BRAVONE                       73875        5054681
-sunu                          70771        5597972
-sterni1971                    70605        5590573
-MaZePallas                    66886        5188978
-Vizvezdenec                   63708        4967313
-nssy                          63462        5259388
-jromang                       61634        4940891
-teddybaer                     61231        5407666
-Pking_cda                     60099        5293873
-solarlight                    57469        5028306
-dv8silencer                   56913        3883992
-tinker                        54936        4086118
-renouve                       49732        3501516
-Freja                         49543        3733019
-robnjr                        46972        4053117
-rap                           46563        3219146
-Bobo1239                      46036        3817196
-ttruscott                     45304        3649765
-racerschmacer                 44881        3975413
-finfish                       44764        3370515
-eva42                         41783        3599691
-biffhero                      40263        3111352
-bigpen0r                      39817        3291647
-mhunt                         38871        2691355
-ronaldjerum                   38820        3240695
-Antihistamine                 38785        2761312
-pb00067                       38038        3086320
-speedycpu                     37591        3003273
-rkl                           37207        3289580
-VoyagerOne                    37050        3441673
-jbwiebe                       35320        2805433
-cuistot                       34191        2146279
-homyur                        33927        2850481
-manap                         32873        2327384
-gri                           32538        2515779
-oryx                          31267        2899051
-EthanOConnor                  30959        2090311
-SC                            30832        2730764
-csnodgrass                    29505        2688994
-jmdana                        29458        2205261
-strelock                      28219        2067805
-jkiiski                       27832        1904470
-Pyafue                        27533        1902349
-Garf                          27515        2747562
-eastorwest                    27421        2317535
-slakovv                       26903        2021889
-Prcuvu                        24835        2170122
-anst                          24714        2190091
-hyperbolic.tom                24319        2017394
-Patrick_G                     23687        1801617
-Sharaf_DG                     22896        1786697
-nabildanial                   22195        1519409
-chriswk                       21931        1868317
-achambord                     21665        1767323
-Zirie                         20887        1472937
-team-oh                       20217        1636708
-Isidor                        20096        1680691
-ncfish1                       19931        1520927
-nesoneg                       19875        1463031
-Spprtr                        19853        1548165
-JanErik                       19849        1703875
-agg177                        19478        1395014
-SFTUser                       19231        1567999
-xor12                         19017        1680165
-sg4032                        18431        1641865
-rstoesser                     18118        1293588
-MazeOfGalious                 17917        1629593
-j3corre                       17743         941444
-cisco2015                     17725        1690126
-ianh2105                      17706        1632562
-dex                           17678        1467203
-jundery                       17194        1115855
-iisiraider                    17019        1101015
-horst.prack                   17012        1465656
-Adrian.Schmidt123             16563        1281436
-purplefishies                 16342        1092533
-wei                           16274        1745989
-ville                         16144        1384026
-eudhan                        15712        1283717
-OuaisBla                      15581         972000
-DragonLord                    15559        1162790
-dju                           14716         875569
-chris                         14479        1487385
-0xB00B1ES                     14079        1001120
-OssumOpossum                  13776        1007129
-enedene                       13460         905279
-bpfliegel                     13346         884523
-Ente                          13198        1156722
-IgorLeMasson                  13087        1147232
-jpulman                       13000         870599
-ako027ako                     12775        1173203
-Nikolay.IT                    12352        1068349
-Andrew Grant                  12327         895539
-joster                        12008         950160
-AdrianSA                      11996         804972
-Nesa92                        11455        1111993
-fatmurphy                     11345         853210
-Dark_wizzie                   11108        1007152
-modolief                      10869         896470
-mschmidt                      10757         803401
-infinity                      10594         727027
-mabichito                     10524         749391
-Thomas A. Anderson            10474         732094
-thijsk                        10431         719357
-Flopzee                       10339         894821
-crocogoat                     10104        1013854
-SapphireBrand                 10104         969604
-stocky                        10017         699440
+noobpwnftw                 19352969     1231459677
+mlang                        957168       61657446
+dew                          949885       56893432
+mibere                       703817       46865007
+crunchy                      427035       27344275
+cw                           416006       27521077
+JojoM                        415904       24479564
+fastgm                       404873       23953472
+CSU_Dynasty                  335774       22850550
+tvijlbrief                   335199       21871270
+Fisherman                    325053       21786603
+gvreuls                      311480       20751516
+ctoks                        275877       18710423
+velislav                     241267       15596372
+glinscott                    217799       13780820
+nordlandia                   211692       13484886
+bcross                       206213       14934233
+bking_US                     198894       11876016
+leszek                       189170       11446821
+mgrabiak                     183896       11778092
+drabel                       181408       12489478
+TueRens                      181349       12192000
+Thanar                       179852       12365359
+vdv                          175171        9881246
+robal                        166948       10702862
+spams                        157128       10319326
+marrco                       149947        9376421
+sqrt2                        147963        9724586
+vdbergh                      137041        8926915
+CoffeeOne                    136294        5004100
+malala                       136182        8002293
+mhoram                       128934        8177193
+davar                        122092        7960001
+dsmith                       122059        7570238
+xoto                         119696        8222144
+grandphish2                  116481        7582197
+Data                         113305        8220352
+BrunoBanani                  112960        7436849
+ElbertoOne                    99028        7023771
+MaZePallas                    98571        6362619
+brabos                        92118        6186135
+psk                           89957        5984901
+sunu                          88463        6007033
+sterni1971                    86948        5613788
+Vizvezdenec                   83752        5343724
+BRAVONE                       81239        5054681
+nssy                          76497        5259388
+teddybaer                     75125        5407666
+Pking_cda                     73776        5293873
+jromang                       70695        4940891
+solarlight                    70517        5028306
+dv8silencer                   70287        3883992
+Bobo1239                      68515        4652287
+racerschmacer                 67468        4935996
+manap                         66273        4121774
+tinker                        63458        4213726
+linrock                       59082        4516053
+robnjr                        57262        4053117
+Freja                         56938        3733019
+ttruscott                     56005        3679485
+renouve                       53811        3501516
+cuistot                       52532        3014920
+finfish                       51360        3370515
+eva42                         51272        3599691
+rkl                           50759        3840947
+rap                           49985        3219146
+pb00067                       49727        3298270
+ronaldjerum                   47654        3240695
+bigpen0r                      47278        3291647
+biffhero                      46564        3111352
+VoyagerOne                    45386        3445881
+speedycpu                     43842        3003273
+jbwiebe                       43305        2805433
+Antihistamine                 41788        2761312
+mhunt                         41735        2691355
+eastorwest                    40387        2812173
+homyur                        39893        2850481
+gri                           39871        2515779
+oryx                          38228        2941656
+0x3C33                        37773        2529097
+SC                            37290        2731014
+csnodgrass                    36207        2688994
+jmdana                        36108        2205261
+strelock                      34716        2074055
+Garf                          33800        2747562
+EthanOConnor                  33370        2090311
+slakovv                       32915        2021889
+Spprtr                        32591        2139601
+Prcuvu                        30377        2170122
+anst                          30301        2190091
+jkiiski                       30136        1904470
+hyperbolic.tom                29840        2017394
+Pyafue                        29650        1902349
+OuaisBla                      27629        1578000
+chriswk                       26902        1868317
+achambord                     26582        1767323
+Patrick_G                     26276        1801617
+yorkman                       26193        1992080
+SFTUser                       25182        1675689
+nabildanial                   24942        1519409
+Sharaf_DG                     24765        1786697
+ncfish1                       24411        1520927
+agg177                        23890        1395014
+JanErik                       23408        1703875
+Isidor                        23388        1680691
+Norabor                       22976        1587862
+cisco2015                     22880        1759669
+Zirie                         22542        1472937
+team-oh                       22272        1636708
+MazeOfGalious                 21978        1629593
+sg4032                        21945        1643065
+ianh2105                      21725        1632562
+xor12                         21628        1680365
+dex                           21612        1467203
+nesoneg                       21494        1463031
+horst.prack                   20878        1465656
+0xB00B1ES                     20590        1208666
+j3corre                       20405         941444
+Adrian.Schmidt123             20316        1281436
+wei                           19973        1745989
+rstoesser                     19569        1293588
+eudhan                        19274        1283717
+Ente                          19070        1373058
+jundery                       18445        1115855
+iisiraider                    18247        1101015
+ville                         17883        1384026
+chris                         17698        1487385
+purplefishies                 17595        1092533
+DragonLord                    17014        1162790
+dju                           16515         929427
+IgorLeMasson                  16064        1147232
+ako027ako                     15671        1173203
+Nikolay.IT                    15154        1068349
+Andrew Grant                  15114         895539
+yurikvelo                     15027        1165616
+OssumOpossum                  14857        1007129
+enedene                       14476         905279
+bpfliegel                     14298         884523
+jpulman                       13982         870599
+joster                        13794         950160
+Nesa92                        13786        1114691
+Dark_wizzie                   13422        1007152
+Hjax                          13350         900887
+Fifis                         13313         965473
+mabichito                     12903         749391
+thijsk                        12886         722107
+crocogoat                     12876        1048802
+AdrianSA                      12860         804972
+Flopzee                       12698         894821
+fatmurphy                     12547         853210
+SapphireBrand                 12416         969604
+modolief                      12386         896470
+scuzzi                        12362         833465
+pgontarz                      12151         848794
+stocky                        11954         699440
+mschmidt                      11941         803401
+infinity                      11470         727027
+torbjo                        11387         728873
+Thomas A. Anderson            11372         732094
+snicolet                      11106         869170
+amicic                        10779         733593
+rpngn                         10712         688203
+d64                           10680         771144
+basepi                        10637         744851
+jjoshua2                      10559         670905
+dzjp                          10343         732529
+ols                           10259         570669
+lbraesch                      10252         647825

From c306d838697011da0a960758dde3f7ede6849060 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Wed, 2 Sep 2020 09:12:04 +0200
Subject: [PATCH 015/398] Stockfish 12

Official release version of Stockfish 12

Bench: 3624569

-----------------------

It is our pleasure to release Stockfish 12 to users world-wide

Downloads will be freely available at

https://stockfishchess.org/download/

This version 12 of Stockfish plays significantly stronger than
any of its predecessors. In a match against Stockfish 11,
Stockfish 12 will typically win at least ten times more game pairs
than it loses.

This jump in strength, visible in regular progression tests during
development[1], results from the introduction of an efficiently
updatable neural network (NNUE) for the evaluation in Stockfish[2],
and associated tuning of the engine as a whole. The concept of the
NNUE evaluation was first introduced in shogi, and ported to
Stockfish afterward. Stockfish remains a CPU-only engine, since the
NNUE networks can be very efficiently evaluated on CPUs. The
recommended parameters of the NNUE network are embedded in
distributed binaries, and Stockfish will use NNUE by default.

Both the NNUE and the classical evaluations are available, and
can be used to assign values to positions that are later used in
alpha-beta (PVS) search to find the best move. The classical
evaluation computes this value as a function of various chess
concepts, handcrafted by experts, tested and tuned using fishtest.
The NNUE evaluation computes this value with a neural network based
on basic inputs. The network is optimized and trained on the
evaluations of millions of positions.

The Stockfish project builds on a thriving community of enthusiasts
that contribute their expertise, time, and resources to build a free
and open source chess engine that is robust, widely available, and
very strong. We invite chess fans to join the fishtest testing
framework and programmers to contribute on github[3].

Stay safe and enjoy chess!

The Stockfish team

[1] https://github.com/glinscott/fishtest/wiki/Regression-Tests
[2] https://github.com/official-stockfish/Stockfish/commit/84f3e867903f62480c33243dd0ecbffd342796fc
[3] https://stockfishchess.org/get-involved/
---
 src/misc.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/misc.cpp b/src/misc.cpp
index 3fbdea35..22070f0e 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -65,7 +65,7 @@ namespace {
 
 /// Version number. If Version is left empty, then compile date in the format
 /// DD-MM-YY and show in engine_info.
-const string Version = "";
+const string Version = "12";
 
 /// Our fancy logging facility. The trick here is to replace cin.rdbuf() and
 /// cout.rdbuf() with two Tie objects that tie cin and cout to a file stream. We

From 9d5dc3d33f5774284a2854d5bf223fc55f91af51 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 3 Sep 2020 11:45:14 +0200
Subject: [PATCH 016/398] Fix compilation issues.

---
 src/learn/convert.cpp |  2 --
 src/learn/gensfen.cpp | 18 +++++++++---------
 src/learn/learner.cpp |  7 ++++---
 3 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
index ebee8a96..387ac39b 100644
--- a/src/learn/convert.cpp
+++ b/src/learn/convert.cpp
@@ -1,5 +1,3 @@
-#define EVAL_LEARN
-
 #if defined(EVAL_LEARN)
 
 // evaluate header for learning
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 38bed2d5..e69528ac 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -1,6 +1,4 @@
-﻿#define EVAL_LEARN
-
-#if defined(EVAL_LEARN)
+﻿#if defined(EVAL_LEARN)
 
 #include "../eval/evaluate_common.h"
 
@@ -319,6 +317,7 @@ namespace Learner
             Position& pos,
             std::vector<StateInfo, AlignedAllocator<StateInfo>>& states,
             int ply,
+            int depth,
             vector<Move>& pv);
 
         // Min and max depths for search during gensfen
@@ -662,9 +661,10 @@ namespace Learner
     }
 
     Value MultiThinkGenSfen::evaluate_leaf(
-        Position& pos, 
+        Position& pos,
         std::vector<StateInfo, AlignedAllocator<StateInfo>>& states,
         int ply,
+        int depth,
         vector<Move>& pv)
     {
         auto rootColor = pos.side_to_move();
@@ -899,16 +899,16 @@ namespace Learner
                         // Result is added after the whole game is done.
                         pos.sfen_pack(psv.sfen);
 
-                        // Get the value of evaluate() as seen from the 
+                        // Get the value of evaluate() as seen from the
                         // root color on the leaf node of the PV line.
-                        // I don't know the goodness and badness of using the 
+                        // I don't know the goodness and badness of using the
                         // return value of search() as it is.
                         // TODO: Consider using search value instead of evaluate_leaf.
                         //       Maybe give it as an option.
-                        
-                        // Use PV moves to reach the leaf node and use the value 
+
+                        // Use PV moves to reach the leaf node and use the value
                         // that evaluated() is called on that leaf node.
-                        const auto leaf_value = evaluate_leaf(pos, states, ply, search_pv);
+                        const auto leaf_value = evaluate_leaf(pos, states, ply, depth, search_pv);
 
                         // If for some reason the leaf node couldn't yield an eval
                         // we fallback to search value.
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index c897dd93..2cf9d9f5 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -13,8 +13,6 @@
 // → I will not be involved in the engine because it is a problem that the GUI should assist.
 // etc..
 
-#define EVAL_LEARN
-
 #if defined(EVAL_LEARN)
 
 #include "../eval/evaluate_common.h"
@@ -98,10 +96,13 @@ namespace Learner
     // probabilities in the trainer. Sometimes we want to use the winning probabilities in the training
     // data directly. In those cases, we set false to this variable.
     static bool convert_teacher_signal_to_winning_probability = true;
+
     // Use raw NNUE eval value in the Eval::evaluate(). If hybrid eval is enabled, training data
     // generation and training don't work well.
     // https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
-    static bool use_raw_nnue_eval = true;
+    // This CANNOT be static since it's used elsewhere.
+    bool use_raw_nnue_eval = true;
+
     // Using WDL with win rate model instead of sigmoid
     static bool use_wdl = false;
 

From 2688194d44b9971ca3755d3dc7eba984b8c13350 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 3 Sep 2020 11:47:36 +0200
Subject: [PATCH 017/398] Fix #91

---
 src/learn/gensfen.cpp | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index e69528ac..8b6bf951 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -506,6 +506,8 @@ namespace Learner
 
         // From the final stage (one step before) to the first stage, give information on the outcome of the game for each stage.
         // The phases stored in sfens are assumed to be continuous (in order).
+        bool quit = false;
+        int num_sfens_to_commit = 0;
         for (auto it = sfens.rbegin(); it != sfens.rend(); ++it)
         {
             // If is_win == 0 (draw), multiply by -1 and it will remain 0 (draw)
@@ -517,19 +519,25 @@ namespace Learner
             auto now_loop_count = get_next_loop_count();
             if (now_loop_count == LOOP_COUNT_FINISHED)
             {
-                return true;
+                quit = true;
+                break;
             }
 
+            ++num_sfens_to_commit;
+        }
+
+        // Write sfens in move order to make potential compression easier
+        for (auto it = sfens.end() - num_sfens_to_commit; it != sfens.end(); ++it)
+        {
             // Write out one sfen.
             sfen_writer.write(thread_id, *it);
-
 #if 0
             pos.set_from_packed_sfen(it->sfen);
             cout << pos << "Win : " << it->is_win << " , " << it->score << endl;
 #endif
         }
 
-        return false;
+        return quit;
     }
 
     optional<Move> MultiThinkGenSfen::choose_random_move(

From 327e92aefe3eae316af9400b3e5d106c7e1de09c Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 3 Sep 2020 11:47:58 +0200
Subject: [PATCH 018/398] Remove trailing whitespaces.

---
 src/learn/convert.cpp |  20 ++++----
 src/learn/gensfen.cpp | 108 +++++++++++++++++++++---------------------
 src/learn/learner.cpp |  34 ++++++-------
 3 files changed, 81 insertions(+), 81 deletions(-)

diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
index 387ac39b..b84dc2f8 100644
--- a/src/learn/convert.cpp
+++ b/src/learn/convert.cpp
@@ -59,16 +59,16 @@ namespace Learner
     }
 
     void convert_bin(
-        const vector<string>& filenames, 
-        const string& output_file_name, 
-        const int ply_minimum, 
-        const int ply_maximum, 
-        const int interpolate_eval, 
+        const vector<string>& filenames,
+        const string& output_file_name,
+        const int ply_minimum,
+        const int ply_maximum,
+        const int interpolate_eval,
         const int src_score_min_value,
         const int src_score_max_value,
         const int dest_score_min_value,
         const int dest_score_max_value,
-        const bool check_invalid_fen, 
+        const bool check_invalid_fen,
         const bool check_illegal_move)
     {
         std::cout << "check_invalid_fen=" << check_invalid_fen << std::endl;
@@ -268,9 +268,9 @@ namespace Learner
     }
 
     void convert_bin_from_pgn_extract(
-        const vector<string>& filenames, 
-        const string& output_file_name, 
-        const bool pgn_eval_side_to_move, 
+        const vector<string>& filenames,
+        const string& output_file_name,
+        const bool pgn_eval_side_to_move,
         const bool convert_no_eval_fens_as_score_zero)
     {
         std::cout << "pgn_eval_side_to_move=" << pgn_eval_side_to_move << std::endl;
@@ -471,7 +471,7 @@ namespace Learner
     }
 
     void convert_plain(
-        const vector<string>& filenames, 
+        const vector<string>& filenames,
         const string& output_file_name)
     {
         Position tpos;
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 8b6bf951..89fa49e0 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -46,7 +46,7 @@
 #include <shared_mutex>
 #endif
 
-using namespace std; 
+using namespace std;
 
 namespace Learner
 {
@@ -54,7 +54,7 @@ namespace Learner
     static bool detect_draw_by_consecutive_low_score = false;
     static bool detect_draw_by_insufficient_mating_material = false;
 
-    // Use raw NNUE eval value in the Eval::evaluate(). 
+    // Use raw NNUE eval value in the Eval::evaluate().
     // If hybrid eval is enabled, training data
     // generation and training don't work well.
     // https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
@@ -66,7 +66,7 @@ namespace Learner
         // Amount of sfens required to flush the buffer.
         static constexpr size_t SFEN_WRITE_SIZE = 5000;
 
-        // Current status is output after 
+        // Current status is output after
         // each (SFEN_WRITE_SIZE * STATUS_OUTPUT_PERIOD) sfens
         static constexpr uint64_t STATUS_OUTPUT_PERIOD = 40;
 
@@ -106,7 +106,7 @@ namespace Learner
             // This buffer is prepared for each thread.
             auto& buf = sfen_buffers[thread_id];
 
-            // Secure since there is no buf at the first time 
+            // Secure since there is no buf at the first time
             // and immediately after writing the thread buffer.
             if (!buf)
             {
@@ -185,7 +185,7 @@ namespace Learner
 
                         sfen_write_count += buf->size();
 #if 1
-                        // Add the processed number here, and if it exceeds save_every, 
+                        // Add the processed number here, and if it exceeds save_every,
                         // change the file name and reset this counter.
                         sfen_write_count_current_file += buf->size();
                         if (sfen_write_count_current_file >= save_every)
@@ -197,8 +197,8 @@ namespace Learner
                             // Sequential number attached to the file
                             int n = (int)(sfen_write_count / save_every);
 
-                            // Rename the file and open it again. 
-                            // Add ios::app in consideration of overwriting. 
+                            // Rename the file and open it again.
+                            // Add ios::app in consideration of overwriting.
                             // (Depending on the operation, it may not be necessary.)
                             string new_filename = filename + "_" + std::to_string(n);
                             output_file_stream.open(new_filename, ios::out | ios::binary | ios::app);
@@ -208,13 +208,13 @@ namespace Learner
                         // Output '.' every time when writing a game record.
                         std::cout << ".";
 
-                        // Output the number of phases processed 
+                        // Output the number of phases processed
                         // every STATUS_OUTPUT_PERIOD times
-                        // Finally, the remainder of the teacher phase 
-                        // of each thread is written out, 
+                        // Finally, the remainder of the teacher phase
+                        // of each thread is written out,
                         // so halfway numbers are displayed, but is it okay?
-                        // If you overuse the threads to the maximum number 
-                        // of logical cores, the console will be clogged, 
+                        // If you overuse the threads to the maximum number
+                        // of logical cores, the console will be clogged,
                         // so it may be beneficial to increase that value.
                         if ((++batch_counter % STATUS_OUTPUT_PERIOD) == 0)
                         {
@@ -255,7 +255,7 @@ namespace Learner
         // buffer before writing to file
         // sfen_buffers is the buffer for each thread
         // sfen_buffers_pool is a buffer for writing.
-        // After loading the phase in the former buffer by SFEN_WRITE_SIZE, 
+        // After loading the phase in the former buffer by SFEN_WRITE_SIZE,
         // transfer it to the latter.
         std::vector<std::unique_ptr<PSVector>> sfen_buffers;
         std::vector<std::unique_ptr<PSVector>> sfen_buffers_pool;
@@ -263,7 +263,7 @@ namespace Learner
         // Mutex required to access sfen_buffers_pool
         std::mutex mutex;
 
-        // Number of sfens written in total, and the 
+        // Number of sfens written in total, and the
         // number of sfens written in the current file.
         uint64_t sfen_write_count = 0;
         uint64_t sfen_write_count_current_file = 0;
@@ -281,9 +281,9 @@ namespace Learner
         // It must be 2**N because it will be used as the mask to calculate hash_index.
         static_assert((GENSFEN_HASH_SIZE& (GENSFEN_HASH_SIZE - 1)) == 0);
 
-        MultiThinkGenSfen(int search_depth_min_, int search_depth_max_, SfenWriter& sw_) : 
-            search_depth_min(search_depth_min_), 
-            search_depth_max(search_depth_max_), 
+        MultiThinkGenSfen(int search_depth_min_, int search_depth_max_, SfenWriter& sw_) :
+            search_depth_min(search_depth_min_),
+            search_depth_max(search_depth_max_),
             sfen_writer(sw_)
         {
             hash.resize(GENSFEN_HASH_SIZE);
@@ -346,8 +346,8 @@ namespace Learner
 
         // For when using multi pv instead of random move.
         // random_multi_pv is the number of candidates for MultiPV.
-        // When adopting the move of the candidate move, the difference 
-        // between the evaluation value of the move of the 1st place 
+        // When adopting the move of the candidate move, the difference
+        // between the evaluation value of the move of the 1st place
         // and the evaluation value of the move of the Nth place is.
         // Must be in the range random_multi_pv_diff.
         // random_multi_pv_depth is the search depth for MultiPV.
@@ -355,7 +355,7 @@ namespace Learner
         int random_multi_pv_diff;
         int random_multi_pv_depth;
 
-        // The minimum and maximum ply (number of steps from 
+        // The minimum and maximum ply (number of steps from
         // the initial phase) of the sfens to write out.
         int write_minply;
         int write_maxply;
@@ -382,7 +382,7 @@ namespace Learner
         // move score in CP
         constexpr int adj_draw_score = 0;
 
-        // For the time being, it will be treated as a 
+        // For the time being, it will be treated as a
         // draw at the maximum number of steps to write.
         const int ply = move_hist_scores.size();
 
@@ -403,18 +403,18 @@ namespace Learner
         {
             Tablebases::rank_root_moves(pos, rootMoves);
         }
-        else 
+        else
         {
             // If there is no legal move
-            return pos.checkers() 
-                ? -1 /* mate */ 
+            return pos.checkers()
+                ? -1 /* mate */
                 : 0 /* stalemate */;
         }
 
         // Adjudicate game to a draw if the last 4 scores of each engine is 0.
-        if (detect_draw_by_consecutive_low_score) 
+        if (detect_draw_by_consecutive_low_score)
         {
-            if (ply >= adj_draw_ply) 
+            if (ply >= adj_draw_ply)
             {
                 int num_cons_plies_within_draw_score = 0;
                 bool is_adj_draw = false;
@@ -432,14 +432,14 @@ namespace Learner
                         break;
                     }
 
-                    if (num_cons_plies_within_draw_score >= adj_draw_cnt) 
+                    if (num_cons_plies_within_draw_score >= adj_draw_cnt)
                     {
                         is_adj_draw = true;
                         break;
                     }
                 }
 
-                if (is_adj_draw) 
+                if (is_adj_draw)
                 {
                     return 0;
                 }
@@ -447,33 +447,33 @@ namespace Learner
         }
 
         // Draw by insufficient mating material
-        if (detect_draw_by_insufficient_mating_material) 
+        if (detect_draw_by_insufficient_mating_material)
         {
-            if (pos.count<ALL_PIECES>() <= 4) 
+            if (pos.count<ALL_PIECES>() <= 4)
             {
                 int num_pieces = pos.count<ALL_PIECES>();
 
                 // (1) KvK
-                if (num_pieces == 2) 
+                if (num_pieces == 2)
                 {
                     return 0;
                 }
 
                 // (2) KvK + 1 minor piece
-                if (num_pieces == 3) 
+                if (num_pieces == 3)
                 {
                     int minor_pc = pos.count<BISHOP>(WHITE) + pos.count<KNIGHT>(WHITE) +
                         pos.count<BISHOP>(BLACK) + pos.count<KNIGHT>(BLACK);
-                    if (minor_pc == 1) 
+                    if (minor_pc == 1)
                     {
                         return 0;
                     }
                 }
 
                 // (3) KBvKB, bishops of the same color
-                else if (num_pieces == 4) 
+                else if (num_pieces == 4)
                 {
-                    if (pos.count<BISHOP>(WHITE) == 1 && pos.count<BISHOP>(BLACK) == 1) 
+                    if (pos.count<BISHOP>(WHITE) == 1 && pos.count<BISHOP>(BLACK) == 1)
                     {
                         // Color of bishops is black.
                         if ((pos.pieces(WHITE, BISHOP) & DarkSquares)
@@ -498,7 +498,7 @@ namespace Learner
     // Write out the phases loaded in sfens to a file.
     // lastTurnIsWin: win/loss in the next phase after the final phase in sfens
     // 1 when winning. -1 when losing. Pass 0 for a draw.
-    // Return value: true if the specified number of 
+    // Return value: true if the specified number of
     // sfens has already been reached and the process ends.
     bool MultiThinkGenSfen::commit_psv(PSVector& sfens, size_t thread_id, int8_t lastTurnIsWin)
     {
@@ -570,7 +570,7 @@ namespace Learner
                     // Normally one move from legal move
                     random_move = list.at((size_t)prng.rand((uint64_t)list.size()));
                 }
-                else 
+                else
                 {
                     // if you can move the king, move the king
                     Move moves[8]; // Near 8
@@ -589,7 +589,7 @@ namespace Learner
                         // move to move the king
                         random_move = moves[prng.rand(n)];
 
-                        // In Apery method, at this time there is a 1/2 chance 
+                        // In Apery method, at this time there is a 1/2 chance
                         // that the opponent will also move randomly
                         if (prng.rand(2) == 0)
                         {
@@ -604,7 +604,7 @@ namespace Learner
                     }
                 }
             }
-            else 
+            else
             {
                 Learner::search(pos, random_multi_pv_depth, random_multi_pv);
 
@@ -614,7 +614,7 @@ namespace Learner
                 uint64_t s = min((uint64_t)rm.size(), (uint64_t)random_multi_pv);
                 for (uint64_t i = 1; i < s; ++i)
                 {
-                    // The difference from the evaluation value of rm[0] must 
+                    // The difference from the evaluation value of rm[0] must
                     // be within the range of random_multi_pv_diff.
                     // It can be assumed that rm[x].score is arranged in descending order.
                     if (rm[0].score > rm[i].score + random_multi_pv_diff)
@@ -641,7 +641,7 @@ namespace Learner
 
         // Make an array like a[0] = 0 ,a[1] = 1, ...
         // Fisher-Yates shuffle and take out the first N items.
-        // Actually, I only want N pieces, so I only need 
+        // Actually, I only want N pieces, so I only need
         // to shuffle the first N pieces with Fisher-Yates.
 
         vector<int> a;
@@ -688,9 +688,9 @@ namespace Learner
 #endif
             pos.do_move(m, states[ply++]);
 
-            // Because the difference calculation of evaluate() cannot be 
+            // Because the difference calculation of evaluate() cannot be
             // performed unless each node evaluate() is called!
-            // If the depth is 8 or more, it seems 
+            // If the depth is 8 or more, it seems
             // faster not to calculate this difference.
 #if defined(EVAL_NNUE)
             if (depth < 8)
@@ -709,7 +709,7 @@ namespace Learner
             // VALUE_NONE and let the caller assign a value to the position.
             return VALUE_NONE;
         }
-        else 
+        else
         {
             v = Eval::evaluate(pos);
 
@@ -733,7 +733,7 @@ namespace Learner
     // thread_id = 0..Threads.size()-1
     void MultiThinkGenSfen::thread_worker(size_t thread_id)
     {
-        // For the time being, it will be treated as a draw 
+        // For the time being, it will be treated as a draw
         // at the maximum number of steps to write.
         // Maximum StateInfo + Search PV to advance to leaf buffer
         std::vector<StateInfo, AlignedAllocator<StateInfo>> states(
@@ -768,7 +768,7 @@ namespace Learner
             vector<uint8_t> random_move_flag = generate_random_move_flags();
 
             // A counter that keeps track of the number of random moves
-            // When random_move_minply == -1, random moves are 
+            // When random_move_minply == -1, random moves are
             // performed continuously, so use it at this time.
             // Used internally by choose_random_move.
             int actual_random_move_count = 0;
@@ -804,19 +804,19 @@ namespace Learner
 
                     if (random_move_minply != -1)
                     {
-                        // Random move is performed with a certain 
+                        // Random move is performed with a certain
                         // probability even in the constant phase.
                         goto RANDOM_MOVE;
                     }
                     else
                     {
-                        // When -1 is specified as random_move_minply, 
-                        // it points according to the standard until 
+                        // When -1 is specified as random_move_minply,
+                        // it points according to the standard until
                         // it goes out of the standard.
-                        // Prepare an innumerable number of situations 
-                        // that have left the constant as 
+                        // Prepare an innumerable number of situations
+                        // that have left the constant as
                         // ConsiderationBookMoveCount true using a huge constant
-                        // Used for purposes such as performing 
+                        // Used for purposes such as performing
                         // a random move 5 times from there.
                         goto DO_MOVE;
                     }
@@ -931,7 +931,7 @@ namespace Learner
 
                 SKIP_SAVE:;
 
-                    // For some reason, We could not get PV (hit the substitution table etc. and got stuck?) 
+                    // For some reason, We could not get PV (hit the substitution table etc. and got stuck?)
                     // so go to the next game. It's a rare case, so you can ignore it.
                     if (search_pv.size() == 0)
                     {
@@ -949,7 +949,7 @@ namespace Learner
                 {
                     next_move = random_move.value();
 
-                    // We don't have the whole game yet, but it ended, 
+                    // We don't have the whole game yet, but it ended,
                     // so the writing process ends and the next game starts.
                     if (!is_ok(next_move))
                     {
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 2cf9d9f5..88f2a0c3 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -421,7 +421,7 @@ namespace Learner
                         continue;
                     sfen_for_mse.push_back(p);
                 }
-                else 
+                else
                 {
                     break;
                 }
@@ -815,17 +815,17 @@ namespace Learner
             // Assign work to each thread using TaskDispatcher.
             // A task definition for that.
             // It is not possible to capture pos used in ↑, so specify the variables you want to capture one by one.
-            auto task = 
+            auto task =
                 [
-                    &ps, 
-                    &test_sum_cross_entropy_eval, 
-                    &test_sum_cross_entropy_win, 
-                    &test_sum_cross_entropy, 
-                    &test_sum_entropy_eval, 
-                    &test_sum_entropy_win, 
-                    &test_sum_entropy, 
-                    &sum_norm, 
-                    &task_count, 
+                    &ps,
+                    &test_sum_cross_entropy_eval,
+                    &test_sum_cross_entropy_win,
+                    &test_sum_cross_entropy,
+                    &test_sum_entropy_eval,
+                    &test_sum_entropy_win,
+                    &test_sum_entropy,
+                    &sum_norm,
+                    &task_count,
                     &move_accord_count
                 ](size_t task_thread_id)
             {
@@ -1906,16 +1906,16 @@ namespace Learner
             Eval::init_NNUE();
             cout << "convert_bin.." << endl;
             convert_bin(
-                filenames, 
-                output_file_name, 
-                ply_minimum, 
-                ply_maximum, 
-                interpolate_eval, 
+                filenames,
+                output_file_name,
+                ply_minimum,
+                ply_maximum,
+                interpolate_eval,
                 src_score_min_value,
                 src_score_max_value,
                 dest_score_min_value,
                 dest_score_max_value,
-                check_invalid_fen, 
+                check_invalid_fen,
                 check_illegal_move);
             return;
 

From 571c2d6d8daf70de884c493b40cf0279e9b48c61 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Fri, 4 Sep 2020 07:46:06 +0200
Subject: [PATCH 019/398] Restore development version

have fun!

No functional change
---
 src/misc.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/misc.cpp b/src/misc.cpp
index 22070f0e..3fbdea35 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -65,7 +65,7 @@ namespace {
 
 /// Version number. If Version is left empty, then compile date in the format
 /// DD-MM-YY and show in engine_info.
-const string Version = "12";
+const string Version = "";
 
 /// Our fancy logging facility. The trick here is to replace cin.rdbuf() and
 /// cout.rdbuf() with two Tie objects that tie cin and cout to a file stream. We

From 0e1f734b05ee5c67e9a17ae0e2045a64209dee05 Mon Sep 17 00:00:00 2001
From: Unai Corzo <corzounai@gmail.com>
Date: Wed, 2 Sep 2020 16:45:49 +0200
Subject: [PATCH 020/398] Less pruning in qsearch

do not prune moves that give discovery checks, even if with negative SSE.

STC https://tests.stockfishchess.org/tests/view/5f4cb5e8ba100690c5cc5d25
LLR: 2.96 (-2.94,2.94) {-0.25,1.25}
Total: 91328 W: 9940 L: 9667 D: 71721
Ptnml(0-2): 491, 7345, 29693, 7670, 465

LTC https://tests.stockfishchess.org/tests/view/5f4dbc2eba100690c5cc5dac
LLR: 2.97 (-2.94,2.94) {0.25,1.25}
Total: 52448 W: 2799 L: 2586 D: 47063
Ptnml(0-2): 53, 2220, 21459, 2445, 47

closes https://github.com/official-stockfish/Stockfish/pull/3098

bench: 4031192
---
 src/search.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/search.cpp b/src/search.cpp
index b79fa6be..0d823c8e 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1564,7 +1564,9 @@ moves_loop: // When in check, search starts from here
       }
 
       // Do not search moves with negative SEE values
-      if (!ss->inCheck && !pos.see_ge(move))
+      if (   !ss->inCheck
+          && !(givesCheck && pos.is_discovery_check_on_king(~pos.side_to_move(), move))
+          && !pos.see_ge(move))
           continue;
 
       // Speculative prefetch as early as possible

From d6530f7d49ef45e38dacafd8a3a838130113265c Mon Sep 17 00:00:00 2001
From: Unai Corzo <corzounai@gmail.com>
Date: Thu, 3 Sep 2020 12:18:42 +0200
Subject: [PATCH 021/398] Simplify singularQuietLMR

remove formerPV dependence

STC https://tests.stockfishchess.org/tests/view/5f4cb922ba100690c5cc5d35
LLR: 2.96 (-2.94,2.94) {-1.25,0.25}
Total: 113672 W: 12347 L: 12368 D: 88957
Ptnml(0-2): 566, 9537, 36699, 9420, 614

LTC https://tests.stockfishchess.org/tests/view/5f4e8474ba100690c5cc5e12
LLR: 2.93 (-2.94,2.94) {-0.75,0.25}
Total: 43032 W: 2298 L: 2227 D: 38507
Ptnml(0-2): 45, 1940, 17475, 2011, 45

closes https://github.com/official-stockfish/Stockfish/pull/3102

bench: 3290084
---
 src/search.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/search.cpp b/src/search.cpp
index 0d823c8e..b5e190c8 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1196,7 +1196,7 @@ moves_loop: // When in check, search starts from here
 
           // Decrease reduction if ttMove has been singularly extended (~3 Elo)
           if (singularQuietLMR)
-              r -= 1 + formerPv;
+              r--;
 
           if (!captureOrPromotion)
           {

From 2a696115094882b7dc5c024a97ed7dc2bdc98642 Mon Sep 17 00:00:00 2001
From: VoyagerOne <excelgeek@gmail.com>
Date: Wed, 2 Sep 2020 16:58:44 -0400
Subject: [PATCH 022/398] LMR Simplification

remove reduction at non-check cut nodes for second move at low depths

STC:
LLR: 2.95 (-2.94,2.94) {-1.25,0.25}
Total: 61712 W: 6594 L: 6543 D: 48575
Ptnml(0-2): 293, 5085, 20082, 5070, 326
https://tests.stockfishchess.org/tests/view/5f5007d6ba100690c5cc5ea9

LTC:
LLR: 2.94 (-2.94,2.94) {-0.75,0.25}
Total: 57544 W: 2983 L: 2925 D: 51636
Ptnml(0-2): 47, 2568, 23495, 2604, 58
https://tests.stockfishchess.org/tests/view/5f50c597ba100690c5cc5ef7

closes https://github.com/official-stockfish/Stockfish/pull/3103

Bench: 3952302
---
 src/search.cpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index b5e190c8..a7692841 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1168,13 +1168,6 @@ moves_loop: // When in check, search starts from here
       {
           Depth r = reduction(improving, depth, moveCount);
 
-          // Decrease reduction at non-check cut nodes for second move at low depths
-          if (   cutNode
-              && depth <= 10
-              && moveCount <= 2
-              && !ss->inCheck)
-              r--;
-
           // Decrease reduction if the ttHit running average is large
           if (thisThread->ttHitAverage > 509 * TtHitAverageResolution * TtHitAverageWindow / 1024)
               r--;

From 9cc482c7889cbc5f6d92e1b69ccd28d422a44a32 Mon Sep 17 00:00:00 2001
From: Sergio Vieri <sergio.vieri.hp@gmail.com>
Date: Thu, 3 Sep 2020 20:22:51 +0800
Subject: [PATCH 023/398] Update default net to nn-308d71810dff.nnue

equivalent to 20200903-1739

Net trained from scratch, so it has quite different features extracted compared to the previous net (82215d0fd0df).

STC:
LLR: 2.98 (-2.94,2.94) {-0.25,1.25}
Total: 108328 W: 14048 L: 13719 D: 80561
Ptnml(0-2): 842, 10039, 32062, 10390, 831
https://tests.stockfishchess.org/tests/view/5f50e053ba100690c5cc5f00

LTC:
LLR: 2.96 (-2.94,2.94) {0.25,1.25}
Total: 13872 W: 1059 L: 890 D: 11923
Ptnml(0-2): 30, 724, 5270, 871, 41
https://tests.stockfishchess.org/tests/view/5f51821fba100690c5cc5f36

closes https://github.com/official-stockfish/Stockfish/pull/3104

Bench: 3832716
---
 src/evaluate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/evaluate.h b/src/evaluate.h
index d701f5a7..3da6a9fe 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -38,7 +38,7 @@ namespace Eval {
   // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work. Do not change the
   // name of the macro, as it is used in the Makefile.
-  #define EvalFileDefaultName   "nn-82215d0fd0df.nnue"
+  #define EvalFileDefaultName   "nn-308d71810dff.nnue"
 
   namespace NNUE {
 

From 9a063fc3cbc8f522215392db232eeb0e04e71b2c Mon Sep 17 00:00:00 2001
From: Vizvezdenec <Vizvezdenec@gmail.com>
Date: Fri, 4 Sep 2020 15:53:59 +0300
Subject: [PATCH 024/398] Adjust penalty on refuted early quiet moves

This patch changes how previous early moves are penalized in case
search finds a best move. Here, the first quiet move that was not
a transposition table move is penalized.

passed STC
https://tests.stockfishchess.org/tests/view/5f51d839ba100690c5cc5f69
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 10088 W: 1150 L: 997 D: 7941
Ptnml(0-2): 41, 772, 3278, 899, 54

passed LTC
https://tests.stockfishchess.org/tests/view/5f51e435ba100690c5cc5f76
LLR: 2.93 (-2.94,2.94) {0.25,1.25}
Total: 30808 W: 1564 L: 1405 D: 27839
Ptnml(0-2): 19, 1245, 12717, 1404, 19

closes https://github.com/official-stockfish/Stockfish/pull/3106

bench 3983758
---
 src/search.cpp | 42 +++++++++++++++++++++---------------------
 src/search.h   |  1 +
 2 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index a7692841..4aeadc28 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -597,7 +597,7 @@ namespace {
     Move ttMove, move, excludedMove, bestMove;
     Depth extension, newDepth;
     Value bestValue, value, ttValue, eval, maxValue, probCutBeta;
-    bool ttHit, formerPv, givesCheck, improving, didLMR, priorCapture;
+    bool formerPv, givesCheck, improving, didLMR, priorCapture;
     bool captureOrPromotion, doFullDepthSearch, moveCountPruning,
          ttCapture, singularQuietLMR;
     Piece movedPiece;
@@ -664,12 +664,12 @@ namespace {
     // position key in case of an excluded move.
     excludedMove = ss->excludedMove;
     posKey = excludedMove == MOVE_NONE ? pos.key() : pos.key() ^ make_key(excludedMove);
-    tte = TT.probe(posKey, ttHit);
-    ttValue = ttHit ? value_from_tt(tte->value(), ss->ply, pos.rule50_count()) : VALUE_NONE;
+    tte = TT.probe(posKey, ss->ttHit);
+    ttValue = ss->ttHit ? value_from_tt(tte->value(), ss->ply, pos.rule50_count()) : VALUE_NONE;
     ttMove =  rootNode ? thisThread->rootMoves[thisThread->pvIdx].pv[0]
-            : ttHit    ? tte->move() : MOVE_NONE;
+            : ss->ttHit    ? tte->move() : MOVE_NONE;
     if (!excludedMove)
-        ss->ttPv = PvNode || (ttHit && tte->is_pv());
+        ss->ttPv = PvNode || (ss->ttHit && tte->is_pv());
     formerPv = ss->ttPv && !PvNode;
 
     if (   ss->ttPv
@@ -681,11 +681,11 @@ namespace {
 
     // thisThread->ttHitAverage can be used to approximate the running average of ttHit
     thisThread->ttHitAverage =   (TtHitAverageWindow - 1) * thisThread->ttHitAverage / TtHitAverageWindow
-                                + TtHitAverageResolution * ttHit;
+                                + TtHitAverageResolution * ss->ttHit;
 
     // At non-PV nodes we check for an early TT cutoff
     if (  !PvNode
-        && ttHit
+        && ss->ttHit
         && tte->depth() >= depth
         && ttValue != VALUE_NONE // Possible in case of TT access race
         && (ttValue >= beta ? (tte->bound() & BOUND_LOWER)
@@ -778,7 +778,7 @@ namespace {
         improving = false;
         goto moves_loop;
     }
-    else if (ttHit)
+    else if (ss->ttHit)
     {
         // Never assume anything about values stored in TT
         ss->staticEval = eval = tte->eval();
@@ -882,14 +882,14 @@ namespace {
         // there and in further interactions with transposition table cutoff depth is set to depth - 3
         // because probCut search has depth set to depth - 4 but we also do a move before it
         // so effective depth is equal to depth - 3
-        && !(   ttHit
+        && !(   ss->ttHit
              && tte->depth() >= depth - 3
              && ttValue != VALUE_NONE
              && ttValue < probCutBeta))
     {
         // if ttMove is a capture and value from transposition table is good enough produce probCut
         // cutoff without digging into actual probCut search
-        if (   ttHit
+        if (   ss->ttHit
             && tte->depth() >= depth - 3
             && ttValue != VALUE_NONE
             && ttValue >= probCutBeta
@@ -933,7 +933,7 @@ namespace {
                 if (value >= probCutBeta)
                 {
                     // if transposition table doesn't have equal or more deep info write probCut data into it
-                    if ( !(ttHit
+                    if ( !(ss->ttHit
                        && tte->depth() >= depth - 3
                        && ttValue != VALUE_NONE))
                         tte->save(posKey, value_to_tt(value, ss->ply), ttPv,
@@ -1423,7 +1423,7 @@ moves_loop: // When in check, search starts from here
     Move ttMove, move, bestMove;
     Depth ttDepth;
     Value bestValue, value, ttValue, futilityValue, futilityBase, oldAlpha;
-    bool ttHit, pvHit, givesCheck, captureOrPromotion;
+    bool pvHit, givesCheck, captureOrPromotion;
     int moveCount;
 
     if (PvNode)
@@ -1453,13 +1453,13 @@ moves_loop: // When in check, search starts from here
                                                   : DEPTH_QS_NO_CHECKS;
     // Transposition table lookup
     posKey = pos.key();
-    tte = TT.probe(posKey, ttHit);
-    ttValue = ttHit ? value_from_tt(tte->value(), ss->ply, pos.rule50_count()) : VALUE_NONE;
-    ttMove = ttHit ? tte->move() : MOVE_NONE;
-    pvHit = ttHit && tte->is_pv();
+    tte = TT.probe(posKey, ss->ttHit);
+    ttValue = ss->ttHit ? value_from_tt(tte->value(), ss->ply, pos.rule50_count()) : VALUE_NONE;
+    ttMove = ss->ttHit ? tte->move() : MOVE_NONE;
+    pvHit = ss->ttHit && tte->is_pv();
 
     if (  !PvNode
-        && ttHit
+        && ss->ttHit
         && tte->depth() >= ttDepth
         && ttValue != VALUE_NONE // Only in case of TT access race
         && (ttValue >= beta ? (tte->bound() & BOUND_LOWER)
@@ -1474,7 +1474,7 @@ moves_loop: // When in check, search starts from here
     }
     else
     {
-        if (ttHit)
+        if (ss->ttHit)
         {
             // Never assume anything about values stored in TT
             if ((ss->staticEval = bestValue = tte->eval()) == VALUE_NONE)
@@ -1493,7 +1493,7 @@ moves_loop: // When in check, search starts from here
         // Stand pat. Return immediately if static value is at least beta
         if (bestValue >= beta)
         {
-            if (!ttHit)
+            if (!ss->ttHit)
                 tte->save(posKey, value_to_tt(bestValue, ss->ply), false, BOUND_LOWER,
                           DEPTH_NONE, MOVE_NONE, ss->staticEval);
 
@@ -1711,8 +1711,8 @@ moves_loop: // When in check, search starts from here
     else
         captureHistory[moved_piece][to_sq(bestMove)][captured] << bonus1;
 
-    // Extra penalty for a quiet TT or main killer move in previous ply when it gets refuted
-    if (   ((ss-1)->moveCount == 1 || ((ss-1)->currentMove == (ss-1)->killers[0]))
+    // Extra penalty for a quiet early move that was not a TT move or main killer move in previous ply when it gets refuted
+    if (   ((ss-1)->moveCount == 1 + (ss-1)->ttHit || ((ss-1)->currentMove == (ss-1)->killers[0]))
         && !pos.captured_piece())
             update_continuation_histories(ss-1, pos.piece_on(prevSq), prevSq, -bonus1);
 
diff --git a/src/search.h b/src/search.h
index 79085189..f60da4a5 100644
--- a/src/search.h
+++ b/src/search.h
@@ -49,6 +49,7 @@ struct Stack {
   int moveCount;
   bool inCheck;
   bool ttPv;
+  bool ttHit;
 };
 
 
From d539da19d2b13d70a81ab863f54046add0bc3b38 Mon Sep 17 00:00:00 2001
From: SFisGOD <jonathandumale@gmail.com>
Date: Fri, 4 Sep 2020 17:14:50 +0800
Subject: [PATCH 025/398] Use classical eval more often

If there is a moderate imbalance, use classical eval with small probability (1/16),
as derived from the node counter.

STC:
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 32320 W: 3562 L: 3377 D: 25381
Ptnml(0-2): 144, 2609, 10478, 2776, 153
https://tests.stockfishchess.org/tests/view/5f520615ba100690c5cc5f80

LTC:
LLR: 2.95 (-2.94,2.94) {0.25,1.25}
Total: 21032 W: 1116 L: 974 D: 18942
Ptnml(0-2): 20, 837, 8664, 971, 24
https://tests.stockfishchess.org/tests/view/5f522eaaba100690c5cc5f8c

closes https://github.com/official-stockfish/Stockfish/pull/3107

Bench: 4109324
---
 src/evaluate.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 09f36513..db8379da 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -1015,12 +1015,16 @@ make_v:
 
 Value Eval::evaluate(const Position& pos) {
 
+  bool useClassical = abs(eg_value(pos.psq_score())) * 16 > NNUEThreshold1 * (16 + pos.rule50_count());
   bool classical = !Eval::useNNUE
-                ||  abs(eg_value(pos.psq_score())) * 16 > NNUEThreshold1 * (16 + pos.rule50_count());
+                ||  useClassical
+                || (abs(eg_value(pos.psq_score())) > PawnValueMg / 8 && !(pos.this_thread()->nodes & 0xF));
   Value v = classical ? Evaluation<NO_TRACE>(pos).value()
                       : NNUE::evaluate(pos) * 5 / 4 + Tempo;
 
-  if (classical && Eval::useNNUE && abs(v) * 16 < NNUEThreshold2 * (16 + pos.rule50_count()))
+  if (   useClassical 
+      && Eval::useNNUE 
+      && abs(v) * 16 < NNUEThreshold2 * (16 + pos.rule50_count()))
       v = NNUE::evaluate(pos) * 5 / 4 + Tempo;
 
   // Damp down the evaluation linearly when shuffling

From 0612adec41f26ff618da76f57f7049d0cb2a38f8 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 4 Sep 2020 20:53:40 +0200
Subject: [PATCH 026/398] Fix incorrect early exit in evaluate_leaf.

---
 src/learn/gensfen.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 89fa49e0..23d7e2c6 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -702,12 +702,13 @@ namespace Learner
 
         // Reach leaf
         Value v;
-        if (pos.checkers()) {
+        if (pos.checkers())
+        {
             // Sometime a king is checked.  An example is a case that a checkmate is
             // found in the search.  If Eval::evaluate() is called whne a king is
             // checked, classic eval crashes by an assertion. To avoid crashes, return
             // VALUE_NONE and let the caller assign a value to the position.
-            return VALUE_NONE;
+            v = VALUE_NONE;
         }
         else
         {

From e9e6e47a93c5512204a55aa7416bf133b8ef6671 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 6 Sep 2020 12:47:37 +0200
Subject: [PATCH 027/398] Fix write_out_draw_game_in_training_data_generation
 flag not being respected.

---
 src/learn/gensfen.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 23d7e2c6..39edc699 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -502,6 +502,12 @@ namespace Learner
     // sfens has already been reached and the process ends.
     bool MultiThinkGenSfen::commit_psv(PSVector& sfens, size_t thread_id, int8_t lastTurnIsWin)
     {
+        if (!write_out_draw_game_in_training_data_generation && lastTurnIsWin == 0)
+        {
+            // We didn't write anything so why quit.
+            return false;
+        }
+
         int8_t is_win = lastTurnIsWin;
 
         // From the final stage (one step before) to the first stage, give information on the outcome of the game for each stage.

From 3bf418e63f93036d7ec5049e73ac945e75a901a0 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sun, 6 Sep 2020 20:38:29 +0200
Subject: [PATCH 028/398] Fix some uninitialized variables with gensfen

fixes valgrind errors as seen with:

```
setoption name Use NNUE value true
isready
gensfen depth 6 loop 10 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.bin use_raw_nnue_eval 0
quit
```

the latter script now runs without valgrind errors on linux
---
 src/learn/gensfen.cpp | 2 ++
 src/search.cpp        | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 39edc699..eeeb7b2e 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -1159,6 +1159,8 @@ namespace Learner
         // Show if the training data generator uses NNUE.
         Eval::verify_NNUE();
 
+        Threads.main()->ponder = false;
+
         // Create and execute threads as many as Options["Threads"].
         {
             SfenWriter sfen_writer(output_file_name, thread_num);
diff --git a/src/search.cpp b/src/search.cpp
index 2d848bcd..8f258ae4 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -2035,6 +2035,8 @@ namespace Learner
       th->completedDepth = 0;
       th->selDepth = 0;
       th->rootDepth = 0;
+      th->nmpMinPly = th->bestMoveChanges = 0;
+      th->ttHitAverage = TtHitAverageWindow * TtHitAverageResolution / 2;
 
 	  // Zero initialization of the number of search nodes
       th->nodes = 0;

From 3a06de298b06a1d2aed43d7c968dbd49ea44b662 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sun, 6 Sep 2020 21:46:08 +0200
Subject: [PATCH 029/398] Define BLAS variables in Makefile

makes it a little easier to change the BLAS library used,
doesn't hardcode the mingw headers. Works on Linux with openblas installed.
Should be no change on Windows.
---
 src/Makefile | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 0c6b21e5..eef17406 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -28,6 +28,21 @@ else
 EXE = stockfish
 endif
 
+### Establish the operating system name
+KERNEL = $(shell uname -s)
+ifeq ($(KERNEL),Linux)
+	OS = $(shell uname -o)
+endif
+
+### BLAS libraries
+ifeq ($(KERNEL),Linux)
+	BLASCXXFLAGS =
+	BLASLDFLAGS = -lopenblas
+else
+	BLASCXXFLAGS = -I/mingw64/include/OpenBLAS
+	BLASLDFLAGS = -lopenblas -Wl,-s -static
+endif
+
 ### Installation dir definitions
 PREFIX = /usr/local
 BINDIR = $(PREFIX)/bin
@@ -61,12 +76,6 @@ OBJS = $(notdir $(SRCS:.cpp=.o))
 
 VPATH = syzygy:nnue:nnue/features:eval:extra:learn
 
-### Establish the operating system name
-KERNEL = $(shell uname -s)
-ifeq ($(KERNEL),Linux)
-	OS = $(shell uname -o)
-endif
-
 ### ==========================================================================
 ### Section 2. High-level Configuration
 ### ==========================================================================
@@ -308,7 +317,7 @@ endif
 ### ==========================================================================
 
 ### 3.1 Selecting compiler (default = gcc)
-CXXFLAGS += -Wall -Wcast-qual -fno-exceptions -std=c++17 $(EXTRACXXFLAGS) $(LEARNCXXFLAGS)
+CXXFLAGS += -g -Wall -Wcast-qual -fno-exceptions -std=c++17 $(EXTRACXXFLAGS) $(LEARNCXXFLAGS)
 DEPENDFLAGS += -std=c++17
 LDFLAGS += $(EXTRALDFLAGS) $(LEARNLDFLAGS)
 
@@ -890,16 +899,16 @@ icc-profile-use:
 
 learn: config-sanity
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
-	EXTRACXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp ' \
-	EXTRALDFLAGS=' -lopenblas -fopenmp -Wl,-s -static ' \
+	EXTRACXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	EXTRALDFLAGS=' $(BLASLDFLAGS) -fopenmp  ' \
 	all
 
 profile-learn: config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
-	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp ' \
-	LEARNLDFLAGS=' -lopenblas -fopenmp -Wl,-s -static '
+	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	LEARNLDFLAGS='  $(BLASLDLAGS) -fopenmp '
 	@echo ""
 	@echo "Step 2/4. Running benchmark for pgo-build ..."
 	$(PGOGENSFEN)
@@ -907,8 +916,8 @@ profile-learn: config-sanity objclean profileclean
 	@echo "Step 3/4. Building optimized executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use) \
-	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp ' \
-	LEARNLDFLAGS=' -lopenblas -fopenmp -Wl,-s -static '
+	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	LEARNLDFLAGS=' $(BLASLDFLAGS) -fopenmp '
 	@echo ""
 	@echo "Step 4/4. Deleting profile data ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) profileclean

From edbbc1a4df941b7e41bb0b4b34adfe7db90f3ec7 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sun, 6 Sep 2020 22:13:42 +0200
Subject: [PATCH 030/398] Remove some warnings

---
 src/learn/gensfen.cpp                       |  2 +-
 src/misc.h                                  |  2 +-
 src/nnue/trainer/trainer_affine_transform.h |  8 ++++----
 src/nnue/trainer/trainer_clipped_relu.h     |  8 ++++----
 src/nnue/trainer/trainer_input_slice.h      | 16 ++++++++--------
 src/nnue/trainer/trainer_sum.h              | 18 +++++++++---------
 6 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index eeeb7b2e..6c8c455e 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -92,7 +92,7 @@ namespace Learner
             {
                 // All buffers should be empty since file_worker_thread
                 // should have written everything before exiting.
-                for (const auto& p : sfen_buffers) { assert(p == nullptr); }
+                for (const auto& p : sfen_buffers) { assert(p == nullptr); (void)p ; }
                 assert(sfen_buffers_pool.empty());
             }
 #endif
diff --git a/src/misc.h b/src/misc.h
index 19bb008c..d73d0633 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -259,7 +259,7 @@ public:
   template <typename U> AlignedAllocator(const AlignedAllocator<U>&) {}
 
   T* allocate(std::size_t n) { return (T*)std_aligned_alloc(alignof(T), n * sizeof(T)); }
-  void deallocate(T* p, std::size_t n) { std_aligned_free(p); }
+  void deallocate(T* p, std::size_t ) { std_aligned_free(p); }
 };
 
 // --------------------
diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index db56c1c0..da11ca29 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -25,9 +25,9 @@ class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
  public:
   // factory function
   static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* feature_transformer) {
+      LayerType* target_layer, FeatureTransformer* ft) {
     return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, feature_transformer));
+        new Trainer(target_layer, ft));
   }
 
   // Set options such as hyperparameters
@@ -186,11 +186,11 @@ class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
 
  private:
   // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
+  Trainer(LayerType* target_layer, FeatureTransformer* ft) :
       batch_size_(0),
       batch_input_(nullptr),
       previous_layer_trainer_(Trainer<PreviousLayer>::Create(
-          &target_layer->previous_layer_, feature_transformer)),
+          &target_layer->previous_layer_, ft)),
       target_layer_(target_layer),
       biases_(),
       weights_(),
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index fd7b1a07..bd59a02d 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -23,9 +23,9 @@ class Trainer<Layers::ClippedReLU<PreviousLayer>> {
  public:
   // factory function
   static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* feature_transformer) {
+      LayerType* target_layer, FeatureTransformer* ft) {
     return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, feature_transformer));
+        new Trainer(target_layer, ft));
   }
 
   // Set options such as hyperparameters
@@ -78,10 +78,10 @@ class Trainer<Layers::ClippedReLU<PreviousLayer>> {
 
  private:
   // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
+  Trainer(LayerType* target_layer, FeatureTransformer* ft) :
       batch_size_(0),
       previous_layer_trainer_(Trainer<PreviousLayer>::Create(
-          &target_layer->previous_layer_, feature_transformer)),
+          &target_layer->previous_layer_, ft)),
       target_layer_(target_layer) {
     std::fill(std::begin(min_activations_), std::end(min_activations_),
               std::numeric_limits<LearnFloatType>::max());
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index 33e39244..7d9e76c3 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -18,10 +18,10 @@ class SharedInputTrainer {
  public:
   // factory function
   static std::shared_ptr<SharedInputTrainer> Create(
-      FeatureTransformer* feature_transformer) {
+      FeatureTransformer* ft) {
     static std::shared_ptr<SharedInputTrainer> instance;
     if (!instance) {
-      instance.reset(new SharedInputTrainer(feature_transformer));
+      instance.reset(new SharedInputTrainer(ft));
     }
     ++instance->num_referrers_;
     return instance;
@@ -105,13 +105,13 @@ class SharedInputTrainer {
 
  private:
   // constructor
-  SharedInputTrainer(FeatureTransformer* feature_transformer) :
+  SharedInputTrainer(FeatureTransformer* ft) :
       batch_size_(0),
       num_referrers_(0),
       num_calls_(0),
       current_operation_(Operation::kNone),
       feature_transformer_trainer_(Trainer<FeatureTransformer>::Create(
-          feature_transformer)),
+          ft)),
       output_(nullptr) {
   }
 
@@ -161,8 +161,8 @@ class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
  public:
   // factory function
   static std::shared_ptr<Trainer> Create(
-      LayerType* /*target_layer*/, FeatureTransformer* feature_transformer) {
-    return std::shared_ptr<Trainer>(new Trainer(feature_transformer));
+      LayerType* /*target_layer*/, FeatureTransformer* ft) {
+    return std::shared_ptr<Trainer>(new Trainer(ft));
   }
 
   // Set options such as hyperparameters
@@ -218,9 +218,9 @@ class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
 
  private:
   // constructor
-  Trainer(FeatureTransformer* feature_transformer):
+  Trainer(FeatureTransformer* ft):
       batch_size_(0),
-      shared_input_trainer_(SharedInputTrainer::Create(feature_transformer)) {
+      shared_input_trainer_(SharedInputTrainer::Create(ft)) {
   }
 
   // number of input/output dimensions
diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h
index fb5b1532..f7bf3b3d 100644
--- a/src/nnue/trainer/trainer_sum.h
+++ b/src/nnue/trainer/trainer_sum.h
@@ -25,9 +25,9 @@ class Trainer<Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>> :
  public:
   // factory function
   static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* feature_transformer) {
+      LayerType* target_layer, FeatureTransformer* ft) {
     return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, feature_transformer));
+        new Trainer(target_layer, ft));
   }
 
   // Set options such as hyperparameters
@@ -74,11 +74,11 @@ class Trainer<Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>> :
 
  private:
   // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer):
-      Tail(target_layer, feature_transformer),
+  Trainer(LayerType* target_layer, FeatureTransformer* ft):
+      Tail(target_layer, ft),
       batch_size_(0),
       previous_layer_trainer_(Trainer<FirstPreviousLayer>::Create(
-          &target_layer->previous_layer_, feature_transformer)),
+          &target_layer->previous_layer_, ft)),
       target_layer_(target_layer) {
   }
 
@@ -110,9 +110,9 @@ class Trainer<Layers::Sum<PreviousLayer>> {
  public:
   // factory function
   static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* feature_transformer) {
+      LayerType* target_layer, FeatureTransformer* ft) {
     return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, feature_transformer));
+        new Trainer(target_layer, ft));
   }
 
   // Set options such as hyperparameters
@@ -154,10 +154,10 @@ class Trainer<Layers::Sum<PreviousLayer>> {
 
  private:
   // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
+  Trainer(LayerType* target_layer, FeatureTransformer* ft) :
       batch_size_(0),
       previous_layer_trainer_(Trainer<PreviousLayer>::Create(
-          &target_layer->previous_layer_, feature_transformer)),
+          &target_layer->previous_layer_, ft)),
       target_layer_(target_layer) {
   }
 

From e9e52faae7f85a0b4ff96ae1c457556fcf5ce5ae Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Mon, 7 Sep 2020 08:08:43 +0200
Subject: [PATCH 031/398] Typo fix

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index eef17406..a8d7a13c 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -908,7 +908,7 @@ profile-learn: config-sanity objclean profileclean
 	@echo "Step 1/4. Building instrumented executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
 	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
-	LEARNLDFLAGS='  $(BLASLDLAGS) -fopenmp '
+	LEARNLDFLAGS='  $(BLASLDFLAGS) -fopenmp '
 	@echo ""
 	@echo "Step 2/4. Running benchmark for pgo-build ..."
 	$(PGOGENSFEN)

From 31e8be3008a87716447582d3f0e7e4cabc3d4e22 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Mon, 7 Sep 2020 08:38:14 +0200
Subject: [PATCH 032/398] First little CI step for the learner

---
 .travis.yml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 092c7f53..a689702a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,7 +7,7 @@ matrix:
       compiler: gcc
       addons:
         apt:
-          packages: ['g++-8', 'g++-8-multilib', 'g++-multilib', 'valgrind', 'expect', 'curl']
+          packages: ['g++-8', 'g++-8-multilib', 'g++-multilib', 'valgrind', 'expect', 'curl', 'openblas']
       env:
         - COMPILER=g++-8
         - COMP=gcc
@@ -16,7 +16,7 @@ matrix:
       compiler: clang
       addons:
         apt:
-          packages: ['clang-10', 'llvm-10-dev', 'g++-multilib', 'valgrind', 'expect', 'curl']
+          packages: ['clang-10', 'llvm-10-dev', 'g++-multilib', 'valgrind', 'expect', 'curl', 'openblas']
       env:
         - COMPILER=clang++-10
         - COMP=clang
@@ -74,6 +74,9 @@ script:
   # workaround: exclude a custom version of llvm+clang, which doesn't find llvm-profdata on ubuntu
   - if [[ "$TRAVIS_OS_NAME" != "linux" || "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi
 
+  # start some basic learner CI
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern learn; fi
+
   # compile only for some more advanced architectures (might not run in travis)
   - make clean && make -j2 ARCH=x86-64-avx2 build
   - make clean && make -j2 ARCH=x86-64-bmi2 build

From bccc71afb412253507adcb64bf2acfc6618321e8 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Mon, 7 Sep 2020 08:50:59 +0200
Subject: [PATCH 033/398] fix openblas package name?

---
 .travis.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index a689702a..501c2d4b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,7 +7,7 @@ matrix:
       compiler: gcc
       addons:
         apt:
-          packages: ['g++-8', 'g++-8-multilib', 'g++-multilib', 'valgrind', 'expect', 'curl', 'openblas']
+          packages: ['g++-8', 'g++-8-multilib', 'g++-multilib', 'valgrind', 'expect', 'curl', 'libopenblas-dev']
       env:
         - COMPILER=g++-8
         - COMP=gcc
@@ -16,7 +16,7 @@ matrix:
       compiler: clang
       addons:
         apt:
-          packages: ['clang-10', 'llvm-10-dev', 'g++-multilib', 'valgrind', 'expect', 'curl', 'openblas']
+          packages: ['clang-10', 'llvm-10-dev', 'g++-multilib', 'valgrind', 'expect', 'curl', 'libopenblas-dev']
       env:
         - COMPILER=clang++-10
         - COMP=clang

From e004e47e5a16689fcdaa34a7d2b38016feeb83d1 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 7 Sep 2020 16:21:40 +0900
Subject: [PATCH 034/398] Commented out an unused function parameter to remove
 a compile warning.

---
 src/misc.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/misc.cpp b/src/misc.cpp
index 851280fe..5c2a4637 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -743,7 +743,7 @@ namespace Dependency {
 // The function to dig a folder on linux is good for the time being... Only used to save the evaluation function file...
 
 namespace Dependency {
-    int mkdir(std::string dir_name)
+    int mkdir(std::string /* dir_name */)
     {
         return 0;
     }

From 4cc98d80f8a5e6eb0b716a47bc4eb8b877b5a979 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 7 Sep 2020 18:56:41 +0900
Subject: [PATCH 035/398] Replaced the utility function to create a directory
 to std::filesystem.

---
 src/learn/learner.cpp              |  3 +-
 src/misc.cpp                       | 63 ------------------------------
 src/misc.h                         |  5 ---
 src/nnue/evaluate_nnue_learner.cpp |  3 +-
 4 files changed, 4 insertions(+), 70 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 88f2a0c3..7021fd7f 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -39,6 +39,7 @@
 #include <memory>
 #include <limits>
 #include <optional>
+#include <filesystem>
 
 #if defined (_OPENMP)
 #include <omp.h>
@@ -1467,7 +1468,7 @@ namespace Learner
             cout << ".";
         };
 
-        Dependency::mkdir("tmp");
+        std::filesystem::create_directory("tmp");
 
         // Shuffle and export as a 10M phase shredded file.
         for (auto filename : filenames)
diff --git a/src/misc.cpp b/src/misc.cpp
index 5c2a4637..a23b1205 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -687,66 +687,3 @@ int write_memory_to_file(std::string filename, void* ptr, uint64_t size)
     fs.close();
     return 0;
 }
-
-// ----------------------------
-//     mkdir wrapper
-// ----------------------------
-
-// Specify relative to the current folder. Returns 0 on success, non-zero on failure.
-// Create a folder. Japanese is not used.
-// In case of gcc under msys2 environment, folder creation fails with _wmkdir(). Cause unknown.
-// Use _mkdir() because there is no help for it.
-
-#if defined(_WIN32)
-// for Windows
-
-#if defined(_MSC_VER)
-#include <codecvt> // I need this because I want wstring to mkdir
-#include <locale> // This is required for wstring_convert.
-
-namespace Dependency {
-    int mkdir(std::string dir_name)
-    {
-        std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> cv;
-        return _wmkdir(cv.from_bytes(dir_name).c_str());
-        //	::CreateDirectory(cv.from_bytes(dir_name).c_str(),NULL);
-    }
-}
-
-#elif defined(__GNUC__) 
-
-#include <direct.h>
-namespace Dependency {
-    int mkdir(std::string dir_name)
-    {
-        return _mkdir(dir_name.c_str());
-    }
-}
-
-#endif
-#elif defined(__linux__)
-
-// In the linux environment, this symbol _LINUX is defined in the makefile.
-
-// mkdir implementation for Linux.
-#include "sys/stat.h"
-
-namespace Dependency {
-    int mkdir(std::string dir_name)
-    {
-        return ::mkdir(dir_name.c_str(), 0777);
-    }
-}
-#else
-
-// In order to judge whether it is a Linux environment, we have to divide the makefile..
-// The function to dig a folder on linux is good for the time being... Only used to save the evaluation function file...
-
-namespace Dependency {
-    int mkdir(std::string /* dir_name */)
-    {
-        return 0;
-    }
-}
-
-#endif
diff --git a/src/misc.h b/src/misc.h
index d73d0633..c918a351 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -273,11 +273,6 @@ namespace Dependency
   // So when calling getline() on fstream,
   // just write getline() instead of std::getline() and use this function.
   extern bool getline(std::ifstream& fs, std::string& s);
-
-  // Create a folder.
-  // Specify relative to the current folder. Japanese is not used for dir_name.
-  // Returns 0 on success, non-zero on failure.
-  extern int mkdir(std::string dir_name);
 }
 
 #endif // #ifndef MISC_H_INCLUDED
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 650f443e..13d9d578 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -4,6 +4,7 @@
 
 #include <random>
 #include <fstream>
+#include <filesystem>
 
 #include "../learn/learn.h"
 #include "../learn/learning_tools.h"
@@ -207,7 +208,7 @@ void save_eval(std::string dir_name) {
   // mkdir() will fail if this folder already exists, but
   // Apart from that. If not, I just want you to make it.
   // Also, assume that the folders up to EvalSaveDir have been dug.
-  Dependency::mkdir(eval_dir);
+  std::filesystem::create_directories(eval_dir);
 
   if (Options["SkipLoadingEval"] && NNUE::trainer) {
     NNUE::SendMessages({{"clear_unobserved_feature_weights"}});

From e638d66bbe2b155a7498ca717e44942726125503 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 7 Sep 2020 19:54:25 +0200
Subject: [PATCH 036/398] Only add -s flag to the linker if debug=no

---
 src/Makefile | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index a8d7a13c..db8213c0 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -34,15 +34,6 @@ ifeq ($(KERNEL),Linux)
 	OS = $(shell uname -o)
 endif
 
-### BLAS libraries
-ifeq ($(KERNEL),Linux)
-	BLASCXXFLAGS =
-	BLASLDFLAGS = -lopenblas
-else
-	BLASCXXFLAGS = -I/mingw64/include/OpenBLAS
-	BLASLDFLAGS = -lopenblas -Wl,-s -static
-endif
-
 ### Installation dir definitions
 PREFIX = /usr/local
 BINDIR = $(PREFIX)/bin
@@ -141,6 +132,20 @@ neon = no
 ARCH = x86-64-modern
 STRIP = strip
 
+### BLAS libraries
+ifeq ($(KERNEL),Linux)
+	BLASCXXFLAGS =
+	BLASLDFLAGS = -lopenblas
+else
+	BLASCXXFLAGS = -I/mingw64/include/OpenBLAS
+
+	ifeq ($(debug),yes)
+		BLASLDFLAGS = -lopenblas -Wl,-static
+	else
+		BLASLDFLAGS = -lopenblas -Wl,-s -static
+	endif
+endif
+
 ### 2.2 Architecture specific
 
 ifeq ($(findstring x86,$(ARCH)),x86)

From 6e8f82ad76c4fb48f34c27bf7fd5185759b2e087 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Mon, 7 Sep 2020 20:14:21 +0200
Subject: [PATCH 037/398] Fix small CI failures

1) Only access UCI option if defined
2) disable -Werror for now.
3) disable a few target that don't have _mm_malloc.
4) Add profile-learn target, with small speedup.
5) just test on Linux + gcc (skip macOS, unclear openblas, skip linux+clang, unclear omp/std::filesystem).
---
 .travis.yml                | 59 +++++++++++++++++++++-----------------
 src/Makefile               |  4 +--
 src/nnue/evaluate_nnue.cpp |  2 ++
 src/ucioption.cpp          |  2 +-
 4 files changed, 37 insertions(+), 30 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 501c2d4b..5859f97b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,28 +12,28 @@ matrix:
         - COMPILER=g++-8
         - COMP=gcc
 
-    - os: linux
-      compiler: clang
-      addons:
-        apt:
-          packages: ['clang-10', 'llvm-10-dev', 'g++-multilib', 'valgrind', 'expect', 'curl', 'libopenblas-dev']
-      env:
-        - COMPILER=clang++-10
-        - COMP=clang
-
-    - os: osx
-      osx_image: xcode12
-      compiler: gcc
-      env:
-        - COMPILER=g++
-        - COMP=gcc
-
-    - os: osx
-      osx_image: xcode12
-      compiler: clang
-      env:
-        - COMPILER=clang++
-        - COMP=clang
+#    - os: linux
+#      compiler: clang
+#      addons:
+#        apt:
+#          packages: ['clang-10', 'llvm-10-dev', 'g++-multilib', 'valgrind', 'expect', 'curl', 'openblas']
+#      env:
+#        - COMPILER=clang++-10
+#        - COMP=clang
+#
+#    - os: osx
+#      osx_image: xcode12
+#      compiler: gcc
+#      env:
+#        - COMPILER=g++
+#        - COMP=gcc
+#
+#    - os: osx
+#      osx_image: xcode12
+#      compiler: clang
+#      env:
+#        - COMPILER=clang++
+#        - COMP=clang
 
 branches:
   only:
@@ -65,17 +65,22 @@ script:
   - make clean && make -j2 ARCH=x86-64-ssse3 build && ../tests/signature.sh $benchref
   - make clean && make -j2 ARCH=x86-64-sse3-popcnt build && ../tests/signature.sh $benchref
   - make clean && make -j2 ARCH=x86-64 build && ../tests/signature.sh $benchref
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-64 build && ../tests/signature.sh $benchref; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 optimize=no debug=yes build && ../tests/signature.sh $benchref; fi
+  # TODO avoid _mm_malloc
+  # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-64 build && ../tests/signature.sh $benchref; fi
+  # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 optimize=no debug=yes build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-sse41-popcnt build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-sse2 build && ../tests/signature.sh $benchref; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 build && ../tests/signature.sh $benchref; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-32 build && ../tests/signature.sh $benchref; fi
+  # TODO avoid _mm_malloc
+  # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 build && ../tests/signature.sh $benchref; fi
+  # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-32 build && ../tests/signature.sh $benchref; fi
   # workaround: exclude a custom version of llvm+clang, which doesn't find llvm-profdata on ubuntu
   - if [[ "$TRAVIS_OS_NAME" != "linux" || "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi
 
   # start some basic learner CI
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern learn; fi
+  #TODO enable -Werror
+  - export CXXFLAGS=""
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && LDFLAGS="-lstdc++fs" make -j2 ARCH=x86-64-modern learn; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && LDFLAGS="-lstdc++fs" make -j2 ARCH=x86-64-modern profile-learn; fi
 
   # compile only for some more advanced architectures (might not run in travis)
   - make clean && make -j2 ARCH=x86-64-avx2 build
diff --git a/src/Makefile b/src/Makefile
index db8213c0..9db13e44 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -40,7 +40,7 @@ BINDIR = $(PREFIX)/bin
 
 ### Built-in benchmark for pgo-builds
 PGOBENCH = ./$(EXE) bench
-PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 100000
+PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000
 
 ### Source and object files
 SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
@@ -908,7 +908,7 @@ learn: config-sanity
 	EXTRALDFLAGS=' $(BLASLDFLAGS) -fopenmp  ' \
 	all
 
-profile-learn: config-sanity objclean profileclean
+profile-learn: net config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index a2845c96..5c8cee71 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -184,11 +184,13 @@ namespace Eval::NNUE {
 
     Initialize();
 
+#if defined(EVAL_NNUE)
     if (Options["SkipLoadingEval"])
     {
       std::cout << "info string SkipLoadingEval set to true, Net not loaded!" << std::endl;
       return true;
     }
+#endif
 
     fileName = evalFile;
 
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 519160cf..0007b559 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -96,7 +96,7 @@ void init(OptionsMap& o) {
 #if defined(EVAL_LEARN)
   // When learning the evaluation function, you can change the folder to save the evaluation function.
   // Evalsave by default. This folder shall be prepared in advance.
-  // Automatically dig a folder under this folder like "0/", "1/", ... and save the evaluation function file there.
+  // Automatically create a folder under this folder like "0/", "1/", ... and save the evaluation function file there.
   o["EvalSaveDir"] << Option("evalsave");
 #endif
 }

From e5f05fa2b9f60503e121102ba94390e6974ced1e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 7 Sep 2020 14:32:05 +0200
Subject: [PATCH 038/398] Add a script to extract a contiguous range of entries
 from a .bin file.

---
 script/extract_bin.py | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 script/extract_bin.py

diff --git a/script/extract_bin.py b/script/extract_bin.py
new file mode 100644
index 00000000..9574aa17
--- /dev/null
+++ b/script/extract_bin.py
@@ -0,0 +1,42 @@
+import sys
+
+ENTRY_SIZE = 40
+NUM_ENTRIES_IN_CHUNK = 1024*1024
+
+def copy(infile, outfile, count, times):
+    if times > 1:
+        outfile.write(infile.read(count*ENTRY_SIZE)*times)
+    else:
+        offset = 0
+        while offset < count:
+            to_read = NUM_ENTRIES_IN_CHUNK if offset + NUM_ENTRIES_IN_CHUNK <= count else count - offset
+
+            outfile.write(infile.read(to_read*ENTRY_SIZE))
+
+            offset += NUM_ENTRIES_IN_CHUNK
+
+def work():
+    filename = sys.argv[1]
+    offset = int(sys.argv[2])
+    count = int(sys.argv[3])
+    times = int(sys.argv[4]) if len(sys.argv) >= 5 else 1
+
+    with open(filename, 'rb') as infile:
+        infile.seek(offset * ENTRY_SIZE)
+        filename_parts = filename.split('.')
+        out_path = '.'.join(filename_parts[:-1]) + '_' + str(offset) + '_' + str(count) + '_' + str(times) + '.' + filename_parts[-1]
+        with open(out_path, 'wb') as outfile:
+            copy(infile, outfile, count, times)
+
+def show_help():
+    print('Usage: python extract_bin.py filename offset count [times]')
+    print('filename - the path to the .bin file to process')
+    print('offset - the number of sfens to skip')
+    print('count - the number of sfens to extract')
+    print('times - the number of times to repeat the extracted sfens. Default = 1')
+    print('The result is saved in a new file named `filename.stem`_`offset`_`count`_`times`.bin')
+
+if len(sys.argv) < 4:
+    show_help()
+else:
+    work()

From 58863c32436c22ea05121e039850253510d923d1 Mon Sep 17 00:00:00 2001
From: noobpwnftw <noobpwnftw@users.noreply.github.com>
Date: Tue, 8 Sep 2020 11:39:21 +0800
Subject: [PATCH 039/398] Update gensfen.cpp

---
 src/learn/gensfen.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 6c8c455e..4214233b 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -58,7 +58,7 @@ namespace Learner
     // If hybrid eval is enabled, training data
     // generation and training don't work well.
     // https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
-    static bool use_raw_nnue_eval = true;
+    extern bool use_raw_nnue_eval;
 
     // Helper class for exporting Sfen
     struct SfenWriter

From 832c414b0d78263595b4e7cd6d19c87e61519010 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 7 Sep 2020 23:03:53 +0200
Subject: [PATCH 040/398] First batch of reorganization.

---
 src/learn/learner.cpp | 402 +++++++++++++++++++++++++-----------------
 src/misc.cpp          |  21 ++-
 src/misc.h            |  32 +++-
 3 files changed, 278 insertions(+), 177 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 7021fd7f..98c8e32e 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -66,7 +66,7 @@ using namespace std;
 //extern Book::BookMoveSelector book;
 
 template <typename T>
-T operator += (std::atomic<T>& x, const T rhs)
+T operator +=(std::atomic<T>& x, const T rhs)
 {
     T old = x.load(std::memory_order_consume);
     // It is allowed that the value is rewritten from other thread at this timing.
@@ -84,8 +84,9 @@ namespace Learner
     static bool use_draw_games_in_training = false;
     static bool use_draw_games_in_validation = false;
     static bool skip_duplicated_positions_in_training = true;
-    // 1.0 / PawnValueEg / 4.0 * log(10.0)
-    static double winning_probability_coefficient = 0.00276753015984861260098316280611;
+
+    static double winning_probability_coefficient = 1.0 / PawnValueEg / 4.0 * std::log(10.0);
+
     // Score scale factors.  ex) If we set src_score_min_value = 0.0,
     // src_score_max_value = 1.0, dest_score_min_value = 0.0,
     // dest_score_max_value = 10000.0, [0.0, 1.0] will be scaled to [0, 10000].
@@ -93,6 +94,7 @@ namespace Learner
     static double src_score_max_value = 1.0;
     static double dest_score_min_value = 0.0;
     static double dest_score_max_value = 1.0;
+
     // Assume teacher signals are the scores of deep searches, and convert them into winning
     // probabilities in the trainer. Sometimes we want to use the winning probabilities in the training
     // data directly. In those cases, we set false to this variable.
@@ -102,7 +104,7 @@ namespace Learner
     // generation and training don't work well.
     // https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
     // This CANNOT be static since it's used elsewhere.
-    bool use_raw_nnue_eval = true;
+    bool use_raw_nnue_eval = false;
 
     // Using WDL with win rate model instead of sigmoid
     static bool use_wdl = false;
@@ -111,38 +113,37 @@ namespace Learner
     // command to learn from the generated game (learn)
     // -----------------------------------
 
-    // ordinary sigmoid function
-    double sigmoid(double x)
-    {
-        return 1.0 / (1.0 + std::exp(-x));
-    }
-
     // A function that converts the evaluation value to the winning rate [0,1]
     double winning_percentage(double value)
     {
         // 1/(1+10^(-Eval/4))
         // = 1/(1+e^(-Eval/4*ln(10))
         // = sigmoid(Eval/4*ln(10))
-        return sigmoid(value * winning_probability_coefficient);
+        return Math::sigmoid(value * winning_probability_coefficient);
     }
 
     // A function that converts the evaluation value to the winning rate [0,1]
     double winning_percentage_wdl(double value, int ply)
     {
+        constexpr double wdl_total = 1000.0;
+        constexpr double draw_score = 0.5;
+
         double wdl_w = UCI::win_rate_model_double(value, ply);
         double wdl_l = UCI::win_rate_model_double(-value, ply);
-        double wdl_d = 1000.0 - wdl_w - wdl_l;
+        double wdl_d = wdl_total - wdl_w - wdl_l;
 
-        return (wdl_w + wdl_d / 2.0) / 1000.0;
+        return (wdl_w + wdl_d * draw_score) / wdl_total;
     }
 
     // A function that converts the evaluation value to the winning rate [0,1]
     double winning_percentage(double value, int ply)
     {
-        if (use_wdl) {
+        if (use_wdl) 
+        {
             return winning_percentage_wdl(value, ply);
         }
-        else {
+        else 
+        {
             return winning_percentage(value);
         }
     }
@@ -151,7 +152,7 @@ namespace Learner
     {
         double p = deep_win_rate;
         double q = winning_percentage(shallow_eval, ply);
-        return -p * std::log(q) - (1 - p) * std::log(1 - q);
+        return -p * std::log(q) - (1.0 - p) * std::log(1.0 - q);
     }
 
     double calc_d_cross_entropy_of_winning_percentage(double deep_win_rate, double shallow_eval, int ply)
@@ -164,17 +165,6 @@ namespace Learner
         return ((y2 - y1) / epsilon) / winning_probability_coefficient;
     }
 
-    double dsigmoid(double x)
-    {
-        // Sigmoid function
-        // f(x) = 1/(1+exp(-x))
-        // the first derivative is
-        // f'(x) = df/dx = f(x)・{ 1-f(x)}
-        // becomes
-
-        return sigmoid(x) * (1.0 - sigmoid(x));
-    }
-
     // When the objective function is the sum of squares of the difference in winning percentage
 #if defined (LOSS_FUNCTION_IS_WINNING_PERCENTAGE)
 // function to calculate the gradient
@@ -202,7 +192,7 @@ namespace Learner
 
         double p = winning_percentage(deep);
         double q = winning_percentage(shallow);
-        return (q - p) * dsigmoid(double(shallow) / 600.0);
+        return (q - p) * Math::dsigmoid(double(shallow) / 600.0);
     }
 #endif
 
@@ -253,39 +243,75 @@ namespace Learner
     double ELMO_LAMBDA2 = 0.33;
     double ELMO_LAMBDA_LIMIT = 32000;
 
+    // Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
+    double get_scaled_signal(double signal)
+    {
+        double scaled_signal = signal;
+
+        // Normalize to [0.0, 1.0].
+        scaled_signal =
+            (scaled_signal - src_score_min_value)
+            / (src_score_max_value - src_score_min_value);
+
+        // Scale to [dest_score_min_value, dest_score_max_value].
+        scaled_signal =
+            scaled_signal * (dest_score_max_value - dest_score_min_value)
+            + dest_score_min_value;
+
+        return scaled_signal;
+    }
+
+    // Teacher winning probability.
+    double calculate_p(double teacher_signal, int ply)
+    {
+        const double scaled_teacher_signal = get_scaled_signal(teacher_signal);
+
+        // Teacher winning probability.
+        double p = scaled_teacher_signal;
+        if (convert_teacher_signal_to_winning_probability) 
+        {
+            p = winning_percentage(scaled_teacher_signal);
+        }
+    }
+
+    double calculate_lambda(double teacher_signal)
+    {
+        // If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT, apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
+        const double lambda =
+            (std::abs(teacher_signal) >= ELMO_LAMBDA_LIMIT)
+            ? ELMO_LAMBDA2
+            : ELMO_LAMBDA;
+
+        return lambda;
+    }
+
+    double calculate_t(int game_result)
+    {
+        // Use 1 as the correction term if the expected win rate is 1, 0 if you lose, and 0.5 if you draw.
+        // game_result = 1,0,-1 so add 1 and divide by 2.
+        const double t = double(game_result + 1) * 0.5;
+
+        return t;
+    }
+
     double calc_grad(Value teacher_signal, Value shallow, const PackedSfenValue& psv)
     {
         // elmo (WCSC27) method
         // Correct with the actual game wins and losses.
-
-        // Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
-        double scaled_teacher_signal = teacher_signal;
-        // Normalize to [0.0, 1.0].
-        scaled_teacher_signal = (scaled_teacher_signal - src_score_min_value) / (src_score_max_value - src_score_min_value);
-        // Scale to [dest_score_min_value, dest_score_max_value].
-        scaled_teacher_signal = scaled_teacher_signal * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
-
         const double q = winning_percentage(shallow, psv.gamePly);
-        // Teacher winning probability.
-        double p = scaled_teacher_signal;
-        if (convert_teacher_signal_to_winning_probability) {
-            p = winning_percentage(scaled_teacher_signal, psv.gamePly);
-        }
-
-        // Use 1 as the correction term if the expected win rate is 1, 0 if you lose, and 0.5 if you draw.
-        // game_result = 1,0,-1 so add 1 and divide by 2.
-        const double t = double(psv.game_result + 1) / 2;
-
-        // If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT, apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
-        const double lambda = (abs(teacher_signal) >= ELMO_LAMBDA_LIMIT) ? ELMO_LAMBDA2 : ELMO_LAMBDA;
+        const double p = calculate_p(teacher_signal, psv.gamePly);
+        const double t = calculate_t(psv.game_result);
+        const double lambda = calculate_lambda(teacher_signal);
 
         double grad;
-        if (use_wdl) {
-            double dce_p = calc_d_cross_entropy_of_winning_percentage(p, shallow, psv.gamePly);
-            double dce_t = calc_d_cross_entropy_of_winning_percentage(t, shallow, psv.gamePly);
+        if (use_wdl) 
+        {
+            const double dce_p = calc_d_cross_entropy_of_winning_percentage(p, shallow, psv.gamePly);
+            const double dce_t = calc_d_cross_entropy_of_winning_percentage(t, shallow, psv.gamePly);
             grad = lambda * dce_p + (1.0 - lambda) * dce_t;
         }
-        else {
+        else 
+        {
             // Use the actual win rate as a correction term.
             // This is the idea of ​​elmo (WCSC27), modern O-parts.
             grad = lambda * (q - p) + (1.0 - lambda) * (q - t);
@@ -296,30 +322,25 @@ namespace Learner
 
     // Calculate cross entropy during learning
     // The individual cross entropy of the win/loss term and win rate term of the elmo expression is returned to the arguments cross_entropy_eval and cross_entropy_win.
-    void calc_cross_entropy(Value teacher_signal, Value shallow, const PackedSfenValue& psv,
-        double& cross_entropy_eval, double& cross_entropy_win, double& cross_entropy,
-        double& entropy_eval, double& entropy_win, double& entropy)
+    void calc_cross_entropy(
+        Value teacher_signal, 
+        Value shallow, 
+        const PackedSfenValue& psv,
+        double& cross_entropy_eval, 
+        double& cross_entropy_win, 
+        double& cross_entropy,
+        double& entropy_eval, 
+        double& entropy_win, 
+        double& entropy)
     {
-        // Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
-        double scaled_teacher_signal = teacher_signal;
-        // Normalize to [0.0, 1.0].
-        scaled_teacher_signal = (scaled_teacher_signal - src_score_min_value) / (src_score_max_value - src_score_min_value);
-        // Scale to [dest_score_min_value, dest_score_max_value].
-        scaled_teacher_signal = scaled_teacher_signal * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
-
         // Teacher winning probability.
-        double p = scaled_teacher_signal;
-        if (convert_teacher_signal_to_winning_probability) {
-            p = winning_percentage(scaled_teacher_signal);
-        }
-        const double q /* eval_winrate    */ = winning_percentage(shallow);
-        const double t = double(psv.game_result + 1) / 2;
+        const double q = winning_percentage(shallow, psv.gamePly);
+        const double p = calculate_p(teacher_signal, psv.gamePly);
+        const double t = calculate_t(psv.game_result);
+        const double lambda = calculate_lambda(teacher_signal);
 
         constexpr double epsilon = 0.000001;
 
-        // If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT, apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
-        const double lambda = (abs(teacher_signal) >= ELMO_LAMBDA_LIMIT) ? ELMO_LAMBDA2 : ELMO_LAMBDA;
-
         const double m = (1.0 - lambda) * t + lambda * p;
 
         cross_entropy_eval =
@@ -343,7 +364,8 @@ namespace Learner
     // Other variations may be prepared as the objective function..
 
 
-    double calc_grad(Value shallow, const PackedSfenValue& psv) {
+    double calc_grad(Value shallow, const PackedSfenValue& psv) 
+    {
         return calc_grad((Value)psv.score, shallow, psv);
     }
 
@@ -363,8 +385,14 @@ namespace Learner
         // SFEN_READ_SIZE is a multiple of THREAD_BUFFER_SIZE.
         static constexpr const size_t SFEN_READ_SIZE = LEARN_SFEN_READ_SIZE;
 
+        // hash to limit the reading of the same situation
+        // Is there too many 64 million phases? Or Not really..
+        // It must be 2**N because it will be used as the mask to calculate hash_index.
+        static constexpr uint64_t READ_SFEN_HASH_SIZE = 64 * 1024 * 1024;
+
         // Do not use std::random_device().  Because it always the same integers on MinGW.
-        SfenReader(int thread_num) : prng(std::chrono::system_clock::now().time_since_epoch().count())
+        SfenReader(int thread_num) : 
+            prng(std::chrono::system_clock::now().time_since_epoch().count())
         {
             packed_sfens.resize(thread_num);
             total_read = 0;
@@ -398,6 +426,7 @@ namespace Learner
                     cout << "Error! read packed sfen , failed." << endl;
                     break;
                 }
+
                 sfen_for_mse.push_back(ps);
 
                 // Get the hash key.
@@ -418,8 +447,10 @@ namespace Learner
                 {
                     if (eval_limit < abs(p.score))
                         continue;
+
                     if (!use_draw_games_in_validation && p.game_result == 0)
                         continue;
+
                     sfen_for_mse.push_back(p);
                 }
                 else
@@ -436,7 +467,7 @@ namespace Learner
             auto& thread_ps = packed_sfens[thread_id];
 
             // Fill the read buffer if there is no remaining buffer, but if it doesn't even exist, finish.
-            if ((thread_ps == nullptr || thread_ps->size() == 0) // If the buffer is empty, fill it.
+            if ((thread_ps == nullptr || thread_ps->empty()) // If the buffer is empty, fill it.
                 && !read_to_thread_buffer_impl(thread_id))
                 return false;
 
@@ -444,11 +475,11 @@ namespace Learner
             // Since the filling of the thread buffer with the phase has been completed successfully
             // thread_ps->rbegin() is alive.
 
-            ps = *(thread_ps->rbegin());
+            ps = thread_ps->back();
             thread_ps->pop_back();
 
             // If you've run out of buffers, call delete yourself to free this buffer.
-            if (thread_ps->size() == 0)
+            if (thread_ps->empty())
             {
                 thread_ps.reset();
             }
@@ -507,7 +538,7 @@ namespace Learner
                     return false;
 
                 // Get the next file name.
-                string filename = *filenames.rbegin();
+                string filename = filenames.back();
                 filenames.pop_back();
 
                 fs.open(filename, ios::in | ios::binary);
@@ -523,6 +554,7 @@ namespace Learner
                 // This size() is read only, so you don't need to lock it.
                 while (!stop_flag && packed_sfens_pool.size() >= SFEN_READ_SIZE / THREAD_BUFFER_SIZE)
                     sleep(100);
+
                 if (stop_flag)
                     return;
 
@@ -555,9 +587,7 @@ namespace Learner
 
                 if (!no_shuffle)
                 {
-                    auto size = sfens.size();
-                    for (size_t i = 0; i < size; ++i)
-                        swap(sfens[i], sfens[(size_t)(prng.rand((uint64_t)size - i) + i)]);
+                    Algo::shuffle(sfens, prng);
                 }
 
                 // Divide this by THREAD_BUFFER_SIZE. There should be size pieces.
@@ -591,6 +621,13 @@ namespace Learner
             }
         }
 
+        // Determine if it is a phase for calculating rmse.
+        // (The computational aspects of rmse should not be used for learning.)
+        bool is_for_rmse(Key key) const
+        {
+            return sfen_for_mse_hash.count(key) != 0;
+        }
+
         // sfen files
         vector<string> filenames;
 
@@ -613,17 +650,6 @@ namespace Learner
 
         bool stop_flag;
 
-        // Determine if it is a phase for calculating rmse.
-        // (The computational aspects of rmse should not be used for learning.)
-        bool is_for_rmse(Key key) const
-        {
-            return sfen_for_mse_hash.count(key) != 0;
-        }
-
-        // hash to limit the reading of the same situation
-        // Is there too many 64 million phases? Or Not really..
-        // It must be 2**N because it will be used as the mask to calculate hash_index.
-        static const uint64_t READ_SFEN_HASH_SIZE = 64 * 1024 * 1024;
         vector<Key> hash; // 64MB*8 = 512MB
 
         // test phase for mse calculation
@@ -663,7 +689,10 @@ namespace Learner
     // Class to generate sfen with multiple threads
     struct LearnerThink : public MultiThink
     {
-        LearnerThink(SfenReader& sr_) :sr(sr_), stop_flag(false), save_only_once(false)
+        LearnerThink(SfenReader& sr_) : 
+            sr(sr_), 
+            stop_flag(false), 
+            save_only_once(false)
         {
 #if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
             learn_sum_cross_entropy_eval = 0.0;
@@ -686,7 +715,12 @@ namespace Learner
         virtual void thread_worker(size_t thread_id);
 
         // Start a thread that loads the phase file in the background.
-        void start_file_read_worker() { sr.start_file_read_worker(); }
+        void start_file_read_worker() 
+        { 
+            sr.start_file_read_worker(); 
+        }
+
+        Value get_shallow_value(Position& task_pos);
 
         // save merit function parameters to a file
         bool save(bool is_final = false);
@@ -753,6 +787,33 @@ namespace Learner
         TaskDispatcher task_dispatcher;
     };
 
+    Value LearnerThink::get_shallow_value(Position& task_pos)
+    {
+        // Evaluation value for shallow search
+        // The value of evaluate() may be used, but when calculating loss, learn_cross_entropy and
+        // Use qsearch() because it is difficult to compare the values.
+        // EvalHash has been disabled in advance. (If not, the same value will be returned every time)
+        const auto [_, pv] = qsearch(task_pos);
+
+        std::vector<StateInfo, AlignedAllocator<StateInfo>> states(pv.size());
+        for (size_t i = 0; i < pv.size(); ++i)
+        {
+            task_pos.do_move(pv[i], states[i]);
+            Eval::NNUE::update_eval(task_pos);
+        }
+
+        const auto rootColor = task_pos.side_to_move();
+        const Value shallow_value =
+            (rootColor == task_pos.side_to_move())
+            ? Eval::evaluate(task_pos)
+            : -Eval::evaluate(task_pos);
+
+        for (auto it = pv.rbegin(); it != pv.rend(); ++it)
+            task_pos.undo_move(*it);
+
+        return shallow_value;
+    }
+
     void LearnerThink::calc_loss(size_t thread_id, uint64_t done)
     {
         // There is no point in hitting the replacement table, so at this timing the generation of the replacement table is updated.
@@ -800,8 +861,6 @@ namespace Learner
         pos.set(StartFEN, false, &si, th);
         std::cout << "hirate eval = " << Eval::evaluate(pos);
 
-        //Eval::print_eval_stat(pos);
-
         // It's better to parallelize here, but it's a bit troublesome because the search before slave has not finished.
         // I created a mechanism to call task, so I will use it.
 
@@ -818,6 +877,7 @@ namespace Learner
             // It is not possible to capture pos used in ↑, so specify the variables you want to capture one by one.
             auto task =
                 [
+                    this,
                     &ps,
                     &test_sum_cross_entropy_eval,
                     &test_sum_cross_entropy_win,
@@ -830,7 +890,6 @@ namespace Learner
                     &move_accord_count
                 ](size_t task_thread_id)
             {
-                // Does C++ properly capture a new ps instance for each loop?.
                 auto task_th = Threads[task_thread_id];
                 auto& task_pos = task_th->rootPos;
                 StateInfo task_si;
@@ -840,26 +899,7 @@ namespace Learner
                     cout << "Error! : illegal packed sfen " << task_pos.fen() << endl;
                 }
 
-                // Evaluation value for shallow search
-                // The value of evaluate() may be used, but when calculating loss, learn_cross_entropy and
-                // Use qsearch() because it is difficult to compare the values.
-                // EvalHash has been disabled in advance. (If not, the same value will be returned every time)
-                auto task_search_result = qsearch(task_pos);
-
-                auto shallow_value = task_search_result.first;
-                {
-                    const auto rootColor = task_pos.side_to_move();
-                    const auto pv = task_search_result.second;
-                    std::vector<StateInfo, AlignedAllocator<StateInfo>> states(pv.size());
-                    for (size_t i = 0; i < pv.size(); ++i)
-                    {
-                        task_pos.do_move(pv[i], states[i]);
-                        Eval::NNUE::update_eval(task_pos);
-                    }
-                    shallow_value = (rootColor == task_pos.side_to_move()) ? Eval::evaluate(task_pos) : -Eval::evaluate(task_pos);
-                    for (auto it = pv.rbegin(); it != pv.rend(); ++it)
-                        task_pos.undo_move(*it);
-                }
+                const Value shallow_value = get_shallow_value(task_pos);
 
                 // Evaluation value of deep search
                 auto deep_value = (Value)ps.score;
@@ -887,7 +927,17 @@ namespace Learner
 #if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
                 double test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy;
                 double test_entropy_eval, test_entropy_win, test_entropy;
-                calc_cross_entropy(deep_value, shallow_value, ps, test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy, test_entropy_eval, test_entropy_win, test_entropy);
+                calc_cross_entropy(
+                    deep_value, 
+                    shallow_value, 
+                    ps, 
+                    test_cross_entropy_eval, 
+                    test_cross_entropy_win, 
+                    test_cross_entropy, 
+                    test_entropy_eval, 
+                    test_entropy_win, 
+                    test_entropy);
+
                 // The total cross entropy need not be abs() by definition.
                 test_sum_cross_entropy_eval += test_cross_entropy_eval;
                 test_sum_cross_entropy_win += test_cross_entropy_win;
@@ -900,8 +950,8 @@ namespace Learner
 
                 // Determine if the teacher's move and the score of the shallow search match
                 {
-                    auto r = search(task_pos, 1);
-                    if ((uint16_t)r.second[0] == ps.move)
+                    const auto [value, pv] = search(task_pos, 1);
+                    if ((uint16_t)pv[0] == ps.move)
                         move_accord_count.fetch_add(1, std::memory_order_relaxed);
                 }
 
@@ -950,6 +1000,7 @@ namespace Learner
                 << " , test_entropy = " << test_sum_entropy / sr.sfen_for_mse.size()
                 << " , norm = " << sum_norm
                 << " , move accuracy = " << (move_accord_count * 100.0 / sr.sfen_for_mse.size()) << "%";
+
             if (done != static_cast<uint64_t>(-1))
             {
                 cout
@@ -962,7 +1013,8 @@ namespace Learner
             }
             cout << endl;
         }
-        else {
+        else 
+        {
             cout << "Error! : sr.sfen_for_mse.size() = " << sr.sfen_for_mse.size() << " ,  done = " << done << endl;
         }
 
@@ -978,7 +1030,6 @@ namespace Learner
 #endif
     }
 
-
     void LearnerThink::thread_worker(size_t thread_id)
     {
 #if defined(_OPENMP)
@@ -1092,7 +1143,9 @@ namespace Learner
             }
 
             PackedSfenValue ps;
-        RetryRead:;
+
+        RETRY_READ:;
+
             if (!sr.read_to_thread_buffer(thread_id, ps))
             {
                 // ran out of thread pool for my thread.
@@ -1106,16 +1159,14 @@ namespace Learner
             // The evaluation value exceeds the learning target value.
             // Ignore this aspect information.
             if (eval_limit < abs(ps.score))
-                goto RetryRead;
-
+                goto RETRY_READ;
 
             if (!use_draw_games_in_training && ps.game_result == 0)
-                goto RetryRead;
-
+                goto RETRY_READ;
 
             // Skip over the opening phase
             if (ps.gamePly < prng.rand(reduction_gameply))
-                goto RetryRead;
+                goto RETRY_READ;
 
 #if 0
             auto sfen = pos.sfen_unpack(ps.data);
@@ -1129,20 +1180,24 @@ namespace Learner
                 // I got a strange sfen. Should be debugged!
                 // Since it is an illegal sfen, it may not be displayed with pos.sfen(), but it is better than not.
                 cout << "Error! : illigal packed sfen = " << pos.fen() << endl;
-                goto RetryRead;
+                goto RETRY_READ;
             }
+
 #if !defined(EVAL_NNUE)
+            if (skip_duplicated_positions_in_training)
             {
-                auto key = pos.key();
+                const auto key = pos.key();
+
                 // Exclude the phase used for rmse calculation.
-                if (sr.is_for_rmse(key) && skip_duplicated_positions_in_training)
-                    goto RetryRead;
+                if (sr.is_for_rmse(key))
+                    goto RETRY_READ;
 
                 // Exclude the most recently used aspect.
-                auto hash_index = size_t(key & (sr.READ_SFEN_HASH_SIZE - 1));
-                auto key2 = sr.hash[hash_index];
-                if (key == key2 && skip_duplicated_positions_in_training)
-                    goto RetryRead;
+                const auto hash_index = size_t(key & (sr.READ_SFEN_HASH_SIZE - 1));
+                const auto key2 = sr.hash[hash_index];
+                if (key == key2)
+                    goto RETRY_READ;
+
                 sr.hash[hash_index] = key; // Replace with the current key.
             }
 #endif
@@ -1152,22 +1207,21 @@ namespace Learner
             // (shouldn't write out such teacher aspect itself, but may have written it out with an old generation routine)
         // Skip the position if there are no legal moves (=checkmated or stalemate).
             if (MoveList<LEGAL>(pos).size() == 0)
-                goto RetryRead;
+                goto RETRY_READ;
 
             // I can read it, so try displaying it.
             //      cout << pos << value << endl;
 
             // Evaluation value of shallow search (qsearch)
-            auto r = qsearch(pos);
-            auto pv = r.second;
+            const auto [shallow_value, pv] = qsearch(pos);
 
             // Evaluation value of deep search
-            auto deep_value = (Value)ps.score;
+            const auto deep_value = (Value)ps.score;
 
             // I feel that the mini batch has a better gradient.
             // Go to the leaf node as it is, add only to the gradient array, and later try AdaGrad at the time of rmse aggregation.
 
-            auto rootColor = pos.side_to_move();
+            const auto rootColor = pos.side_to_move();
 
             // If the initial PV is different, it is better not to use it for learning.
             // If it is the result of searching a completely different place, it may become noise.
@@ -1203,13 +1257,26 @@ namespace Learner
                 // I don't think this is a very desirable property, as the aspect that gives that gradient will be different.
                 // I have turned off the substitution table, but since the pv array has not been updated due to one stumbling block etc...
 
-                Value shallow_value = (rootColor == pos.side_to_move()) ? Eval::evaluate(pos) : -Eval::evaluate(pos);
+                const Value shallow_value = 
+                    (rootColor == pos.side_to_move()) 
+                    ? Eval::evaluate(pos) 
+                    : -Eval::evaluate(pos);
 
 #if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
                 // Calculate loss for training data
                 double learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy;
                 double learn_entropy_eval, learn_entropy_win, learn_entropy;
-                calc_cross_entropy(deep_value, shallow_value, ps, learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy, learn_entropy_eval, learn_entropy_win, learn_entropy);
+                calc_cross_entropy(
+                    deep_value, 
+                    shallow_value, 
+                    ps, 
+                    learn_cross_entropy_eval, 
+                    learn_cross_entropy_win, 
+                    learn_cross_entropy, 
+                    learn_entropy_eval, 
+                    learn_entropy_win, 
+                    learn_entropy);
+
                 learn_sum_cross_entropy_eval += learn_cross_entropy_eval;
                 learn_sum_cross_entropy_win += learn_cross_entropy_win;
                 learn_sum_cross_entropy += learn_cross_entropy;
@@ -1266,7 +1333,8 @@ namespace Learner
                 Eval::NNUE::update_eval(pos);
             }
 
-            if (illegal_move) {
+            if (illegal_move) 
+            {
                 sync_cout << "An illical move was detected... Excluded the position from the learning data..." << sync_endl;
                 continue;
             }
@@ -1284,7 +1352,6 @@ namespace Learner
             dj_dw = calc_grad(deep_value, shallow_value, ps);
             Eval::add_grad(pos, rootColor, dj_dw, without_kpp);
 #endif
-
         }
 
     }
@@ -1301,14 +1368,17 @@ namespace Learner
             // Do not dig a subfolder because I want to save it only once.
             Eval::save_eval("");
         }
-        else if (is_final) {
+        else if (is_final) 
+        {
             Eval::save_eval("final");
             return true;
         }
-        else {
+        else 
+        {
             static int dir_number = 0;
             const std::string dir_name = std::to_string(dir_number++);
             Eval::save_eval(dir_name);
+
 #if defined(EVAL_NNUE)
             if (newbob_decay != 1.0 && latest_loss_count > 0) {
                 static int trials = newbob_num_trials;
@@ -1316,22 +1386,28 @@ namespace Learner
                 latest_loss_sum = 0.0;
                 latest_loss_count = 0;
                 cout << "loss: " << latest_loss;
-                if (latest_loss < best_loss) {
+                if (latest_loss < best_loss) 
+                {
                     cout << " < best (" << best_loss << "), accepted" << endl;
                     best_loss = latest_loss;
                     best_nn_directory = Path::Combine((std::string)Options["EvalSaveDir"], dir_name);
                     trials = newbob_num_trials;
                 }
-                else {
+                else 
+                {
                     cout << " >= best (" << best_loss << "), rejected" << endl;
-                    if (best_nn_directory.empty()) {
+                    if (best_nn_directory.empty()) 
+                    {
                         cout << "WARNING: no improvement from initial model" << endl;
                     }
-                    else {
+                    else 
+                    {
                         cout << "restoring parameters from " << best_nn_directory << endl;
                         Eval::NNUE::RestoreParameters(best_nn_directory);
                     }
-                    if (--trials > 0 && !is_final) {
+
+                    if (--trials > 0 && !is_final) 
+                    {
                         cout << "reducing learning rate scale from " << newbob_scale
                             << " to " << (newbob_scale * newbob_decay)
                             << " (" << trials << " more trials)" << endl;
@@ -1339,7 +1415,9 @@ namespace Learner
                         Eval::NNUE::SetGlobalLearningRateScale(newbob_scale);
                     }
                 }
-                if (trials == 0) {
+                
+                if (trials == 0) 
+                {
                     cout << "converged" << endl;
                     return true;
                 }
@@ -1371,10 +1449,11 @@ namespace Learner
             // Output progress every 10M phase or when all writing is completed
             if (((write_sfen_count % buffer_size) == 0) ||
                 (write_sfen_count == total_sfen_count))
+            {
                 cout << write_sfen_count << " / " << total_sfen_count << endl;
+            }
         };
 
-
         cout << endl << "write : " << output_file_name << endl;
 
         fstream fs(output_file_name, ios::out | ios::binary);
@@ -1453,9 +1532,7 @@ namespace Learner
 
         auto write_buffer = [&](uint64_t size)
         {
-            // shuffle from buf[0] to buf[size-1]
-            for (uint64_t i = 0; i < size; ++i)
-                swap(buf[i], buf[(uint64_t)(prng.rand(size - i) + i)]);
+            Algo::shuffle(buf, prng);
 
             // write to a file
             fstream fs;
@@ -1533,13 +1610,8 @@ namespace Learner
             auto& fs = afs[i];
 
             fs.open(filename, ios::in | ios::binary);
-            fs.seekg(0, fstream::end);
-            uint64_t eofPos = (uint64_t)fs.tellg();
-            fs.clear(); // Otherwise, the next seek may fail.
-            fs.seekg(0, fstream::beg);
-            uint64_t begPos = (uint64_t)fs.tellg();
-            uint64_t file_size = eofPos - begPos;
-            uint64_t sfen_count = file_size / sizeof(PackedSfenValue);
+            const uint64_t file_size = get_file_size(fs);
+            const uint64_t sfen_count = file_size / sizeof(PackedSfenValue);
             a_count[i] = sfen_count;
 
             // Output the number of sfen stored in each file.
@@ -1578,8 +1650,8 @@ namespace Learner
         PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
         uint64_t size = (uint64_t)buf.size();
         std::cout << "shuffle buf.size() = " << size << std::endl;
-        for (uint64_t i = 0; i < size; ++i)
-            swap(buf[i], buf[(uint64_t)(prng.rand(size - i) + i)]);
+
+        Algo::shuffle(buf, prng);
 
         std::cout << "write : " << output_file_name << endl;
 
diff --git a/src/misc.cpp b/src/misc.cpp
index a23b1205..5ef5ecdc 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -627,18 +627,27 @@ void* aligned_malloc(size_t size, size_t align)
     return p;
 }
 
+std::uint64_t get_file_size(std::fstream& fs)
+{
+    auto pos = fs.tellg();
+
+    fs.seekg(0, fstream::end);
+    const uint64_t eofPos = (uint64_t)fs.tellg();
+    fs.clear(); // Otherwise, the next seek may fail.
+    fs.seekg(0, fstream::beg);
+    const uint64_t begPos = (uint64_t)fs.tellg();
+    fs.seekg(pos);
+
+    return eofPos - begPos;
+}
+
 int read_file_to_memory(std::string filename, std::function<void* (uint64_t)> callback_func)
 {
     fstream fs(filename, ios::in | ios::binary);
     if (fs.fail())
         return 1;
 
-    fs.seekg(0, fstream::end);
-    uint64_t eofPos = (uint64_t)fs.tellg();
-    fs.clear(); // Otherwise the next seek may fail.
-    fs.seekg(0, fstream::beg);
-    uint64_t begPos = (uint64_t)fs.tellg();
-    uint64_t file_size = eofPos - begPos;
+    const uint64_t file_size = get_file_size(fs);
     //std::cout << "filename = " << filename << " , file_size = " << file_size << endl;
 
     // I know the file size, so call callback_func to get a buffer for this,
diff --git a/src/misc.h b/src/misc.h
index c918a351..5add3b36 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -26,6 +26,8 @@
 #include <ostream>
 #include <string>
 #include <vector>
+#include <utility>
+#include <cmath>
 
 #include "types.h"
 
@@ -155,6 +157,7 @@ std::string now_string();
 // Also, if the buffer cannot be allocated in the callback function or if the file size is different from the expected file size,
 // Return nullptr. At this time, read_file_to_memory() interrupts reading and returns with an error.
 
+std::uint64_t get_file_size(std::fstream& fs);
 int read_file_to_memory(std::string filename, std::function<void* (uint64_t)> callback_func);
 int write_memory_to_file(std::string filename, void* ptr, uint64_t size);
 
@@ -199,20 +202,37 @@ inline std::ostream& operator<<(std::ostream& os, AsyncPRNG& prng)
 
 // Mathematical function used for progress calculation and learning
 namespace Math {
-	// Sigmoid function
-	// = 1.0 / (1.0 + std::exp(-x))
-	double sigmoid(double x);
+    inline double sigmoid(double x)
+    {
+        return 1.0 / (1.0 + std::exp(-x));
+    }
 
-	// Differentiation of sigmoid function
-	// = sigmoid(x) * (1.0-sigmoid(x))
-	double dsigmoid(double x);
+    inline double dsigmoid(double x)
+    {
+        // Sigmoid function
+        // f(x) = 1/(1+exp(-x))
+        // the first derivative is
+        // f'(x) = df/dx = f(x)・{ 1-f(x)}
+        // becomes
+
+        return sigmoid(x) * (1.0 - sigmoid(x));
+    }
 
 	// Clip v so that it fits between [lo,hi].
 	// * In Stockfish, this function is written in bitboard.h.
 	template<class T> constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
 		return v < lo ? lo : v > hi ? hi : v;
 	}
+}
 
+namespace Algo {
+    template <typename Rng, typename T>
+    void shuffle(std::vector<T>& buf, Rng&& prng)
+    {
+        const auto size = buf.size();
+        for (uint64_t i = 0; i < size; ++i)
+            std::swap(buf[i], buf[prng.rand(size - i) + i]);
+    }
 }
 
 // --------------------

From 1482e5215afa1b457418d45805bb57a25f4529f4 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 7 Sep 2020 23:26:38 +0200
Subject: [PATCH 041/398] A second batch of code reorganization.

---
 src/Makefile              |   1 -
 src/learn/convert.cpp     |  10 +--
 src/learn/gensfen.cpp     |   8 +-
 src/learn/gensfen2019.cpp |   1 -
 src/learn/learn.h         |  56 ++++++-------
 src/learn/learner.cpp     | 170 +++++++++++++++-----------------------
 6 files changed, 96 insertions(+), 150 deletions(-)
 delete mode 100644 src/learn/gensfen2019.cpp

diff --git a/src/Makefile b/src/Makefile
index 9db13e44..ca851dba 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -56,7 +56,6 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	nnue/features/enpassant.cpp \
 	nnue/nnue_test_command.cpp \
 	extra/sfen_packer.cpp \
-	learn/gensfen2019.cpp \
 	learn/learner.cpp \
 	learn/gensfen.cpp \
 	learn/convert.cpp \
diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
index b84dc2f8..9bd9548d 100644
--- a/src/learn/convert.cpp
+++ b/src/learn/convert.cpp
@@ -25,20 +25,12 @@
 #include <chrono>
 #include <random>
 #include <regex>
+#include <filesystem>
 
 #if defined (_OPENMP)
 #include <omp.h>
 #endif
 
-#if defined(_MSC_VER)
-// The C++ filesystem cannot be used unless it is C++17 or later or MSVC.
-// I tried to use windows.h, but with g++ of msys2 I can not get the files in the folder well.
-// Use dirent.h because there is no help for it.
-#include <filesystem>
-#elif defined(__GNUC__)
-#include <dirent.h>
-#endif
-
 using namespace std;
 
 namespace Learner
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 4214233b..b049192e 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -28,18 +28,12 @@
 #include <memory>
 #include <limits>
 #include <optional>
+#include <filesystem>
 
 #if defined (_OPENMP)
 #include <omp.h>
 #endif
 
-#if defined(_MSC_VER)
-// std::filesystem doesn't work on GCC even though it claims to support C++17.
-#include <filesystem>
-#elif defined(__GNUC__)
-#include <dirent.h>
-#endif
-
 #if defined(EVAL_NNUE)
 #include "../nnue/evaluate_nnue_learner.h"
 #include <climits>
diff --git a/src/learn/gensfen2019.cpp b/src/learn/gensfen2019.cpp
deleted file mode 100644
index 01293b9c..00000000
--- a/src/learn/gensfen2019.cpp
+++ /dev/null
@@ -1 +0,0 @@
-// just a place holder
diff --git a/src/learn/learn.h b/src/learn/learn.h
index e29ed74a..1bc39cf9 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -27,30 +27,6 @@
 // SGD looking only at the sign of the gradient. It requires less memory, but the accuracy is...
 // #define SGD_UPDATE
 
-// ----------------------
-// Settings for learning
-// ----------------------
-
-// mini-batch size.
-// Calculate the gradient by combining this number of phases.
-// If you make it smaller, the number of update_weights() will increase and the convergence will be faster. The gradient is incorrect.
-// If you increase it, the number of update_weights() decreases, so the convergence will be slow. The slope will come out accurately.
-// I don't think you need to change this value in most cases.
-
-#define LEARN_MINI_BATCH_SIZE (1000 * 1000 * 1)
-
-// The number of phases to read from the file at one time. After reading this much, shuffle.
-// It is better to have a certain size, but this number x 40 bytes x 3 times as much memory is consumed. 400MB*3 is consumed in the 10M phase.
-// Must be a multiple of THREAD_BUFFER_SIZE(=10000).
-
-#define LEARN_SFEN_READ_SIZE (1000 * 1000 * 10)
-
-// Saving interval of evaluation function at learning. Save each time you learn this number of phases.
-// Needless to say, the longer the saving interval, the shorter the learning time.
-// Folder name is incremented for each save like 0/, 1/, 2/...
-// By default, once every 1 billion phases.
-#define LEARN_EVAL_SAVE_INTERVAL (1000000000ULL)
-
 
 // ----------------------
 // Select the objective function
@@ -79,10 +55,6 @@
 // debug settings for learning
 // ----------------------
 
-// Reduce the output of rmse during learning to 1 for this number of times.
-// rmse calculation is done in one thread, so it takes some time, so reducing the output is effective.
-#define LEARN_RMSE_OUTPUT_INTERVAL 1
-
 
 // ----------------------
 // learning from zero vector
@@ -205,6 +177,34 @@ typedef float LearnFloatType;
 
 namespace Learner
 {
+	// ----------------------
+	// Settings for learning
+	// ----------------------
+
+	// mini-batch size.
+	// Calculate the gradient by combining this number of phases.
+	// If you make it smaller, the number of update_weights() will increase and the convergence will be faster. The gradient is incorrect.
+	// If you increase it, the number of update_weights() decreases, so the convergence will be slow. The slope will come out accurately.
+	// I don't think you need to change this value in most cases.
+
+	constexpr std::size_t LEARN_MINI_BATCH_SIZE = 1000 * 1000 * 1;
+
+	// The number of phases to read from the file at one time. After reading this much, shuffle.
+	// It is better to have a certain size, but this number x 40 bytes x 3 times as much memory is consumed. 400MB*3 is consumed in the 10M phase.
+	// Must be a multiple of THREAD_BUFFER_SIZE(=10000).
+
+	constexpr std::size_t LEARN_SFEN_READ_SIZE = 1000 * 1000 * 10;
+
+	// Saving interval of evaluation function at learning. Save each time you learn this number of phases.
+	// Needless to say, the longer the saving interval, the shorter the learning time.
+	// Folder name is incremented for each save like 0/, 1/, 2/...
+	// By default, once every 1 billion phases.
+	constexpr std::size_t LEARN_EVAL_SAVE_INTERVAL = 1000000000ULL;
+
+	// Reduce the output of rmse during learning to 1 for this number of times.
+	// rmse calculation is done in one thread, so it takes some time, so reducing the output is effective.
+	constexpr std::size_t LEARN_RMSE_OUTPUT_INTERVAL = 1;
+
 	//Structure in which PackedSfen and evaluation value are integrated
 	// If you write different contents for each option, it will be a problem when reusing the teacher game
 	// For the time being, write all the following members regardless of the options.
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 98c8e32e..ddfaff5a 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -45,15 +45,6 @@
 #include <omp.h>
 #endif
 
-#if defined(_MSC_VER)
-// The C++ filesystem cannot be used unless it is C++17 or later or MSVC.
-// I tried to use windows.h, but with g++ of msys2 I can not get the files in the folder well.
-// Use dirent.h because there is no help for it.
-#include <filesystem>
-#elif defined(__GNUC__)
-#include <dirent.h>
-#endif
-
 #if defined(EVAL_NNUE)
 #include "../nnue/evaluate_nnue_learner.h"
 #include <climits>
@@ -62,8 +53,11 @@
 
 using namespace std;
 
-//// This is defined in the search section.
-//extern Book::BookMoveSelector book;
+
+#if defined(USE_BOOK)
+// This is defined in the search section.
+extern Book::BookMoveSelector book;
+#endif
 
 template <typename T>
 T operator +=(std::atomic<T>& x, const T rhs)
@@ -128,9 +122,9 @@ namespace Learner
         constexpr double wdl_total = 1000.0;
         constexpr double draw_score = 0.5;
 
-        double wdl_w = UCI::win_rate_model_double(value, ply);
-        double wdl_l = UCI::win_rate_model_double(-value, ply);
-        double wdl_d = wdl_total - wdl_w - wdl_l;
+        const double wdl_w = UCI::win_rate_model_double(value, ply);
+        const double wdl_l = UCI::win_rate_model_double(-value, ply);
+        const double wdl_d = wdl_total - wdl_w - wdl_l;
 
         return (wdl_w + wdl_d * draw_score) / wdl_total;
     }
@@ -150,16 +144,17 @@ namespace Learner
 
     double calc_cross_entropy_of_winning_percentage(double deep_win_rate, double shallow_eval, int ply)
     {
-        double p = deep_win_rate;
-        double q = winning_percentage(shallow_eval, ply);
+        const double p = deep_win_rate;
+        const double q = winning_percentage(shallow_eval, ply);
         return -p * std::log(q) - (1.0 - p) * std::log(1.0 - q);
     }
 
     double calc_d_cross_entropy_of_winning_percentage(double deep_win_rate, double shallow_eval, int ply)
     {
         constexpr double epsilon = 0.000001;
-        double y1 = calc_cross_entropy_of_winning_percentage(deep_win_rate, shallow_eval, ply);
-        double y2 = calc_cross_entropy_of_winning_percentage(deep_win_rate, shallow_eval + epsilon, ply);
+
+        const double y1 = calc_cross_entropy_of_winning_percentage(deep_win_rate, shallow_eval, ply);
+        const double y2 = calc_cross_entropy_of_winning_percentage(deep_win_rate, shallow_eval + epsilon, ply);
 
         // Divide by the winning_probability_coefficient to match scale with the sigmoidal win rate
         return ((y2 - y1) / epsilon) / winning_probability_coefficient;
@@ -190,8 +185,8 @@ namespace Learner
         // Also, the coefficient of 1/m is unnecessary if you use the update formula that has the automatic gradient adjustment function like Adam and AdaGrad.
         // Therefore, it is not necessary to save it in memory.
 
-        double p = winning_percentage(deep);
-        double q = winning_percentage(shallow);
+        const double p = winning_percentage(deep, psv.gamePly);
+        const double q = winning_percentage(shallow, psv.gamePly);
         return (q - p) * Math::dsigmoid(double(shallow) / 600.0);
     }
 #endif
@@ -216,8 +211,8 @@ namespace Learner
         // = ...
         // = q-p.
 
-        double p = winning_percentage(deep);
-        double q = winning_percentage(shallow);
+        const double p = winning_percentage(deep, psv.gamePly);
+        const double q = winning_percentage(shallow, psv.gamePly);
 
         return q - p;
     }
@@ -270,8 +265,10 @@ namespace Learner
         double p = scaled_teacher_signal;
         if (convert_teacher_signal_to_winning_probability) 
         {
-            p = winning_percentage(scaled_teacher_signal);
+            p = winning_percentage(scaled_teacher_signal, ply);
         }
+
+        return p;
     }
 
     double calculate_lambda(double teacher_signal)
@@ -534,7 +531,7 @@ namespace Learner
                     fs.close();
 
                 // no more
-                if (filenames.size() == 0)
+                if (filenames.empty())
                     return false;
 
                 // Get the next file name.
@@ -543,6 +540,7 @@ namespace Learner
 
                 fs.open(filename, ios::in | ios::binary);
                 cout << "open filename = " << filename << endl;
+
                 assert(fs);
 
                 return true;
@@ -569,16 +567,12 @@ namespace Learner
                     {
                         sfens.push_back(p);
                     }
-                    else
+                    else if(!open_next_file())
                     {
-                        // read failure
-                        if (!open_next_file())
-                        {
-                            // There was no next file. Abon.
-                            cout << "..end of files." << endl;
-                            end_of_files = true;
-                            return;
-                        }
+                        // There was no next file. Abon.
+                        cout << "..end of files." << endl;
+                        end_of_files = true;
+                        return;
                     }
                 }
 
@@ -702,6 +696,7 @@ namespace Learner
             learn_sum_entropy_win = 0.0;
             learn_sum_entropy = 0.0;
 #endif
+
 #if defined(EVAL_NNUE)
             newbob_scale = 1.0;
             newbob_decay = 1.0;
@@ -1213,7 +1208,7 @@ namespace Learner
             //      cout << pos << value << endl;
 
             // Evaluation value of shallow search (qsearch)
-            const auto [shallow_value, pv] = qsearch(pos);
+            const auto [_, pv] = qsearch(pos);
 
             // Evaluation value of deep search
             const auto deep_value = (Value)ps.score;
@@ -1408,9 +1403,11 @@ namespace Learner
 
                     if (--trials > 0 && !is_final) 
                     {
-                        cout << "reducing learning rate scale from " << newbob_scale
+                        cout
+                            << "reducing learning rate scale from " << newbob_scale
                             << " to " << (newbob_scale * newbob_decay)
                             << " (" << trials << " more trials)" << endl;
+
                         newbob_scale *= newbob_decay;
                         Eval::NNUE::SetGlobalLearningRateScale(newbob_scale);
                     }
@@ -1432,10 +1429,10 @@ namespace Learner
     // prng: random number
     // afs: fstream of each teacher phase file
     // a_count: The number of teacher positions inherent in each file.
-    void shuffle_write(const string& output_file_name, PRNG& prng, vector<fstream>& afs, vector<uint64_t>& a_count)
+    void shuffle_write(const string& output_file_name, PRNG& prng, vector<fstream>& sfen_file_streams, vector<uint64_t>& sfen_count_in_file)
     {
         uint64_t total_sfen_count = 0;
-        for (auto c : a_count)
+        for (auto c : sfen_count_in_file)
             total_sfen_count += c;
 
         // number of exported phases
@@ -1459,39 +1456,39 @@ namespace Learner
         fstream fs(output_file_name, ios::out | ios::binary);
 
         // total teacher positions
-        uint64_t sum = 0;
-        for (auto c : a_count)
-            sum += c;
+        uint64_t sfen_count_left = total_sfen_count;
 
-        while (sum != 0)
+        while (sfen_count_left != 0)
         {
-            auto r = prng.rand(sum);
+            auto r = prng.rand(sfen_count_left);
 
             // Aspects stored in fs[0] file ... Aspects stored in fs[1] file ...
             //Think of it as a series like, and determine in which file r is pointing.
             // The contents of the file are shuffled, so you can take the next element from that file.
             // Each file has a_count[x] phases, so this process can be written as follows.
 
-            uint64_t n = 0;
-            while (a_count[n] <= r)
-                r -= a_count[n++];
+            uint64_t i = 0;
+            while (sfen_count_in_file[i] <= r)
+                r -= sfen_count_in_file[i++];
 
             // This confirms n. Before you forget it, reduce the remaining number.
 
-            --a_count[n];
-            --sum;
+            --sfen_count_in_file[i];
+            --sfen_count_left;
 
             PackedSfenValue psv;
             // It's better to read and write all at once until the performance is not so good...
-            if (afs[n].read((char*)&psv, sizeof(PackedSfenValue)))
+            if (sfen_file_streams[i].read((char*)&psv, sizeof(PackedSfenValue)))
             {
                 fs.write((char*)&psv, sizeof(PackedSfenValue));
                 ++write_sfen_count;
                 print_status();
             }
         }
+
         print_status();
         fs.close();
+
         cout << "done!" << endl;
     }
 
@@ -1509,8 +1506,8 @@ namespace Learner
         // There should have been a limit of 512 per process on Windows, so you can open here as 500,
         // The current setting is 500 files x 20M = 10G = 10 billion phases.
 
-        PSVector buf;
-        buf.resize(buffer_size);
+        PSVector buf(buffer_size);
+
         // ↑ buffer, a marker that indicates how much you have used
         uint64_t buf_write_marker = 0;
 
@@ -1537,7 +1534,7 @@ namespace Learner
             // write to a file
             fstream fs;
             fs.open(make_filename(write_file_count++), ios::out | ios::binary);
-            fs.write((char*)&buf[0], size * sizeof(PackedSfenValue));
+            fs.write(reinterpret_cast<char*>(buf.data()), size * sizeof(PackedSfenValue));
             fs.close();
             a_count.push_back(size);
 
@@ -1552,14 +1549,13 @@ namespace Learner
         {
             fstream fs(filename, ios::in | ios::binary);
             cout << endl << "open file = " << filename;
-            while (fs.read((char*)&buf[buf_write_marker], sizeof(PackedSfenValue)))
+            while (fs.read(reinterpret_cast<char*>(&buf[buf_write_marker]), sizeof(PackedSfenValue)))
                 if (++buf_write_marker == buffer_size)
                     write_buffer(buffer_size);
 
             // Read in units of sizeof(PackedSfenValue),
             // Ignore the last remaining fraction. (Fails in fs.read, so exit while)
             // (The remaining fraction seems to be half-finished data that was created because it was stopped halfway during teacher generation.)
-
         }
 
         if (buf_write_marker != 0)
@@ -1599,20 +1595,20 @@ namespace Learner
         size_t file_count = filenames.size();
 
         // Number of teacher positions stored in each file in filenames
-        vector<uint64_t> a_count(file_count);
+        vector<uint64_t> sfen_count_in_file(file_count);
 
         // Count the number of teacher aspects in each file.
-        vector<fstream> afs(file_count);
+        vector<fstream> sfen_file_streams(file_count);
 
         for (size_t i = 0; i < file_count; ++i)
         {
             auto filename = filenames[i];
-            auto& fs = afs[i];
+            auto& fs = sfen_file_streams[i];
 
             fs.open(filename, ios::in | ios::binary);
             const uint64_t file_size = get_file_size(fs);
             const uint64_t sfen_count = file_size / sizeof(PackedSfenValue);
-            a_count[i] = sfen_count;
+            sfen_count_in_file[i] = sfen_count;
 
             // Output the number of sfen stored in each file.
             cout << filename << " = " << sfen_count << " sfens." << endl;
@@ -1624,7 +1620,7 @@ namespace Learner
         // Now you have shuffled.
 
         // Throw to the subcontract function and end.
-        shuffle_write(output_file_name, prng, afs, a_count);
+        shuffle_write(output_file_name, prng, sfen_file_streams, sfen_count_in_file);
     }
 
     // Subcontracting the teacher shuffle "learn shufflem" command.
@@ -1656,7 +1652,10 @@ namespace Learner
         std::cout << "write : " << output_file_name << endl;
 
         // If the file to be written exceeds 2GB, it cannot be written in one shot with fstream::write, so use wrapper.
-        write_memory_to_file(output_file_name, (void*)&buf[0], (uint64_t)sizeof(PackedSfenValue) * (uint64_t)buf.size());
+        write_memory_to_file(
+            output_file_name, 
+            (void*)&buf[0], 
+            sizeof(PackedSfenValue) * buf.size());
 
         std::cout << "..shuffle_on_memory done." << std::endl;
     }
@@ -1664,7 +1663,7 @@ namespace Learner
     // Learning from the generated game record
     void learn(Position&, istringstream& is)
     {
-        auto thread_num = (int)Options["Threads"];
+        const auto thread_num = (int)Options["Threads"];
         SfenReader sr(thread_num);
 
         LearnerThink learn_think(sr);
@@ -1889,13 +1888,6 @@ namespace Learner
         {
             string kif_base_dir = Path::Combine(base_dir, target_dir);
 
-            // Remove this folder. Keep it relative to base_dir.
-#if defined(_MSC_VER)
-        // If you use std::tr2, warning C4996 will appear, so suppress it.
-        // * std::tr2 issued a deprecation warning by default under std:c++14, and was deleted by default in /std:c++17.
-#pragma warning(push)
-#pragma warning(disable:4996)
-
             namespace sys = std::filesystem;
             sys::path p(kif_base_dir); // Origin of enumeration
             std::for_each(sys::directory_iterator(p), sys::directory_iterator(),
@@ -1903,36 +1895,6 @@ namespace Learner
                     if (sys::is_regular_file(p))
                         filenames.push_back(Path::Combine(target_dir, p.filename().generic_string()));
                 });
-#pragma warning(pop)
-
-#elif defined(__GNUC__)
-
-            auto ends_with = [](std::string const& value, std::string const& ending)
-            {
-                if (ending.size() > value.size()) return false;
-                return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
-            };
-
-            // It can't be helped, so read it using dirent.h.
-            DIR* dp; // pointer to directory
-            dirent* entry; // entry point returned by readdir()
-
-            dp = opendir(kif_base_dir.c_str());
-            if (dp != NULL)
-            {
-                do {
-                    entry = readdir(dp);
-                    // Only list files ending with ".bin"
-                    // →I hate this restriction when generating files with serial numbers...
-                    if (entry != NULL && ends_with(entry->d_name, ".bin"))
-                    {
-                        //cout << entry->d_name << endl;
-                        filenames.push_back(Path::Combine(target_dir, entry->d_name));
-                    }
-                } while (entry != NULL);
-                closedir(dp);
-            }
-#endif
         }
 
         cout << "learn from ";
@@ -1990,6 +1952,7 @@ namespace Learner
                 dest_score_max_value,
                 check_invalid_fen,
                 check_illegal_move);
+
             return;
 
         }
@@ -1997,7 +1960,12 @@ namespace Learner
         {
             Eval::init_NNUE();
             cout << "convert_bin_from_pgn-extract.." << endl;
-            convert_bin_from_pgn_extract(filenames, output_file_name, pgn_eval_side_to_move, convert_no_eval_fens_as_score_zero);
+            convert_bin_from_pgn_extract(
+                filenames, 
+                output_file_name, 
+                pgn_eval_side_to_move, 
+                convert_no_eval_fens_as_score_zero);
+
             return;
         }
 
@@ -2154,12 +2122,6 @@ namespace Learner
 #endif
     }
 
-
 } // namespace Learner
 
-#if defined(GENSFEN2019)
-#include "gensfen2019.cpp"
-#endif
-
-
 #endif // EVAL_LEARN

From a0b2d6a01e39627e9ea87b234a18067e4e404faf Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 7 Sep 2020 23:33:32 +0200
Subject: [PATCH 042/398] Note a potential defect in sfen packer.

---
 src/extra/sfen_packer.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/extra/sfen_packer.cpp b/src/extra/sfen_packer.cpp
index ac789ce8..fd013fa2 100644
--- a/src/extra/sfen_packer.cpp
+++ b/src/extra/sfen_packer.cpp
@@ -218,7 +218,7 @@ struct SfenPacker
     PieceType pr = type_of(pc);
     auto c = huffman_table[pr];
     stream.write_n_bit(c.code, c.bits);
- 
+
     if (pc == NO_PIECE)
       return;
 
@@ -249,7 +249,7 @@ struct SfenPacker
 
     // first and second flag
     Color c = (Color)stream.read_one_bit();
-    
+
     return make_piece(c, pr);
   }
 };
@@ -266,7 +266,10 @@ int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thre
 {
 	SfenPacker packer;
 	auto& stream = packer.stream;
-	stream.set_data((uint8_t*)&sfen);
+
+  // TODO: separate streams for writing and reading. Here we actually have to
+  // const_cast which is not safe in the long run.
+	stream.set_data(const_cast<uint8_t*>(&sfen));
 
 	std::memset(this, 0, sizeof(Position));
 	std::memset(si, 0, sizeof(StateInfo));

From 0202218f58467dac447b73b7724158ebec4a221f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 7 Sep 2020 23:34:13 +0200
Subject: [PATCH 043/398] fix cast

---
 src/extra/sfen_packer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/extra/sfen_packer.cpp b/src/extra/sfen_packer.cpp
index fd013fa2..1d82111d 100644
--- a/src/extra/sfen_packer.cpp
+++ b/src/extra/sfen_packer.cpp
@@ -269,7 +269,7 @@ int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thre
 
   // TODO: separate streams for writing and reading. Here we actually have to
   // const_cast which is not safe in the long run.
-	stream.set_data(const_cast<uint8_t*>(&sfen));
+	stream.set_data(const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(&sfen)));
 
 	std::memset(this, 0, sizeof(Position));
 	std::memset(si, 0, sizeof(StateInfo));

From 41b7674aee3920cb72554f8d22eb4e2cb6c57e09 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 7 Sep 2020 23:55:07 +0200
Subject: [PATCH 044/398] Improve comments, break long lines.

---
 src/learn/learner.cpp | 321 ++++++++++++++++++++++++++++--------------
 src/misc.h            |   1 +
 2 files changed, 213 insertions(+), 109 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index ddfaff5a..f9d188b8 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -1,18 +1,24 @@
-﻿// learning routines
+﻿// Learning routines:
 //
-// 1) Automatic generation of game records
+// 1) Automatic generation of game records in .bin format
 // → "gensfen" command
-// 2) Learning evaluation function parameters from the generated game record
+//
+// 2) Learning evaluation function parameters from the generated .bin files
 // → "learn" command
+//
 // → Shuffle in the teacher phase is also an extension of this command.
 // Example) "learn shuffle"
+//
 // 3) Automatic generation of fixed traces
 // → "makebook think" command
 // → implemented in extra/book/book.cpp
+//
 // 4) Post-station automatic review mode
 // → I will not be involved in the engine because it is a problem that the GUI should assist.
 // etc..
 
+#define EVAL_LEARN
+
 #if defined(EVAL_LEARN)
 
 #include "../eval/evaluate_common.h"
@@ -53,7 +59,6 @@
 
 using namespace std;
 
-
 #if defined(USE_BOOK)
 // This is defined in the search section.
 extern Book::BookMoveSelector book;
@@ -63,6 +68,7 @@ template <typename T>
 T operator +=(std::atomic<T>& x, const T rhs)
 {
     T old = x.load(std::memory_order_consume);
+
     // It is allowed that the value is rewritten from other thread at this timing.
     // The idea that the value is not destroyed is good.
     T desired = old + rhs;
@@ -81,7 +87,7 @@ namespace Learner
 
     static double winning_probability_coefficient = 1.0 / PawnValueEg / 4.0 * std::log(10.0);
 
-    // Score scale factors.  ex) If we set src_score_min_value = 0.0,
+    // Score scale factors. ex) If we set src_score_min_value = 0.0,
     // src_score_max_value = 1.0, dest_score_min_value = 0.0,
     // dest_score_max_value = 10000.0, [0.0, 1.0] will be scaled to [0, 10000].
     static double src_score_min_value = 0.0;
@@ -89,8 +95,9 @@ namespace Learner
     static double dest_score_min_value = 0.0;
     static double dest_score_max_value = 1.0;
 
-    // Assume teacher signals are the scores of deep searches, and convert them into winning
-    // probabilities in the trainer. Sometimes we want to use the winning probabilities in the training
+    // Assume teacher signals are the scores of deep searches, 
+    // and convert them into winning probabilities in the trainer. 
+    // Sometimes we want to use the winning probabilities in the training
     // data directly. In those cases, we set false to this variable.
     static bool convert_teacher_signal_to_winning_probability = true;
 
@@ -100,13 +107,9 @@ namespace Learner
     // This CANNOT be static since it's used elsewhere.
     bool use_raw_nnue_eval = false;
 
-    // Using WDL with win rate model instead of sigmoid
+    // Using stockfish's WDL with win rate model instead of sigmoid
     static bool use_wdl = false;
 
-    // -----------------------------------
-    // command to learn from the generated game (learn)
-    // -----------------------------------
-
     // A function that converts the evaluation value to the winning rate [0,1]
     double winning_percentage(double value)
     {
@@ -142,21 +145,31 @@ namespace Learner
         }
     }
 
-    double calc_cross_entropy_of_winning_percentage(double deep_win_rate, double shallow_eval, int ply)
+    double calc_cross_entropy_of_winning_percentage(
+        double deep_win_rate, 
+        double shallow_eval, 
+        int ply)
     {
         const double p = deep_win_rate;
         const double q = winning_percentage(shallow_eval, ply);
         return -p * std::log(q) - (1.0 - p) * std::log(1.0 - q);
     }
 
-    double calc_d_cross_entropy_of_winning_percentage(double deep_win_rate, double shallow_eval, int ply)
+    double calc_d_cross_entropy_of_winning_percentage(
+        double deep_win_rate, 
+        double shallow_eval, 
+        int ply)
     {
         constexpr double epsilon = 0.000001;
 
-        const double y1 = calc_cross_entropy_of_winning_percentage(deep_win_rate, shallow_eval, ply);
-        const double y2 = calc_cross_entropy_of_winning_percentage(deep_win_rate, shallow_eval + epsilon, ply);
+        const double y1 = calc_cross_entropy_of_winning_percentage(
+            deep_win_rate, shallow_eval, ply);
 
-        // Divide by the winning_probability_coefficient to match scale with the sigmoidal win rate
+        const double y2 = calc_cross_entropy_of_winning_percentage(
+            deep_win_rate, shallow_eval + epsilon, ply);
+
+        // Divide by the winning_probability_coefficient to 
+        // match scale with the sigmoidal win rate
         return ((y2 - y1) / epsilon) / winning_probability_coefficient;
     }
 
@@ -167,9 +180,12 @@ namespace Learner
     {
         // The square of the win rate difference minimizes it in the objective function.
         // Objective function J = 1/2m Σ (win_rate(shallow)-win_rate(deep) )^2
-        // However, σ is a sigmoid function that converts the evaluation value into the difference in the winning percentage.
-        // m is the number of samples. shallow is the evaluation value for a shallow search (qsearch()). deep is the evaluation value for deep search.
-        // If W is the feature vector (parameter of the evaluation function) and Xi and Yi are teachers
+        // However, σ is a sigmoid function that converts the 
+        // evaluation value into the difference in the winning percentage.
+        // m is the number of samples. shallow is the evaluation value 
+        // for a shallow search (qsearch()). deep is the evaluation value for deep search.
+        // If W is the feature vector (parameter of the evaluation function) 
+        // and Xi and Yi are teachers
         // shallow = W*Xi // * is the Hadamard product, transposing W and meaning X
         // f(Xi) = win_rate(W*Xi)
         // If σ(i th deep) = Yi,
@@ -179,10 +195,12 @@ namespace Learner
         // ∂J/∂Wj = ∂J/∂f ・∂f/∂W ・∂W/∂Wj
         // = 1/m Σ (f(Xi)-y) ・f'(Xi) ・ 1
 
-        // 1/m will be multiplied later, but the contents of Σ can be retained in the array as the value of the gradient.
+        // 1/m will be multiplied later, but the contents of Σ can 
+        // be retained in the array as the value of the gradient.
         // f'(Xi) = win_rate'(shallow) = sigmoid'(shallow/600) = dsigmoid(shallow / 600) / 600
         // This /600 at the end is adjusted by the learning rate, so do not write it..
-        // Also, the coefficient of 1/m is unnecessary if you use the update formula that has the automatic gradient adjustment function like Adam and AdaGrad.
+        // Also, the coefficient of 1/m is unnecessary if you use the update 
+        // formula that has the automatic gradient adjustment function like Adam and AdaGrad.
         // Therefore, it is not necessary to save it in memory.
 
         const double p = winning_percentage(deep, psv.gamePly);
@@ -202,7 +220,9 @@ namespace Learner
         // Refer to etc.
 
         // Objective function design)
-        // We want to make the distribution of p closer to the distribution of q → Think of it as the problem of minimizing the cross entropy between the probability distributions of p and q.
+        // We want to make the distribution of p closer to the distribution of q 
+        // → Think of it as the problem of minimizing the cross entropy 
+        // between the probability distributions of p and q.
         // J = H(p,q) =-Σ p(x) log(q(x)) = -p log q-(1-p) log(1-q)
         // x
 
@@ -222,7 +242,8 @@ namespace Learner
     double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
     {
         // Version that does not pass the winning percentage function
-        // This, unless EVAL_LIMIT is set low, trying to match the evaluation value with the shape of the end stage
+        // This, unless EVAL_LIMIT is set low, trying to 
+        // match the evaluation value with the shape of the end stage
         // eval may exceed the range of eval.
         return shallow - deep;
     }
@@ -261,7 +282,6 @@ namespace Learner
     {
         const double scaled_teacher_signal = get_scaled_signal(teacher_signal);
 
-        // Teacher winning probability.
         double p = scaled_teacher_signal;
         if (convert_teacher_signal_to_winning_probability) 
         {
@@ -273,7 +293,8 @@ namespace Learner
 
     double calculate_lambda(double teacher_signal)
     {
-        // If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT, apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
+        // If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT
+        // then apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
         const double lambda =
             (std::abs(teacher_signal) >= ELMO_LAMBDA_LIMIT)
             ? ELMO_LAMBDA2
@@ -284,7 +305,8 @@ namespace Learner
 
     double calculate_t(int game_result)
     {
-        // Use 1 as the correction term if the expected win rate is 1, 0 if you lose, and 0.5 if you draw.
+        // Use 1 as the correction term if the expected win rate is 1, 
+        // 0 if you lose, and 0.5 if you draw.
         // game_result = 1,0,-1 so add 1 and divide by 2.
         const double t = double(game_result + 1) * 0.5;
 
@@ -318,7 +340,9 @@ namespace Learner
     }
 
     // Calculate cross entropy during learning
-    // The individual cross entropy of the win/loss term and win rate term of the elmo expression is returned to the arguments cross_entropy_eval and cross_entropy_win.
+    // The individual cross entropy of the win/loss term and win 
+    // rate term of the elmo expression is returned 
+    // to the arguments cross_entropy_eval and cross_entropy_win.
     void calc_cross_entropy(
         Value teacher_signal, 
         Value shallow, 
@@ -356,11 +380,7 @@ namespace Learner
     }
 
 #endif
-
-
-    // Other variations may be prepared as the objective function..
-
-
+    // Other objective functions may be considered in the future...
     double calc_grad(Value shallow, const PackedSfenValue& psv) 
     {
         return calc_grad((Value)psv.score, shallow, psv);
@@ -369,15 +389,17 @@ namespace Learner
     // Sfen reader
     struct SfenReader
     {
-        // number of phases used for calculation such as mse
+        // Number of phases used for calculation such as mse
         // mini-batch size = 1M is standard, so 0.2% of that should be negligible in terms of time.
-        //Since search() is performed with depth = 1 in calculation of move match rate, simple comparison is not possible...
+        // Since search() is performed with depth = 1 in calculation of 
+        // move match rate, simple comparison is not possible...
         static constexpr uint64_t sfen_for_mse_size = 2000;
 
         // Number of phases buffered by each thread 0.1M phases. 4M phase at 40HT
         static constexpr size_t THREAD_BUFFER_SIZE = 10 * 1000;
 
-        // Buffer for reading files (If this is made larger, the shuffle becomes larger and the phases may vary.
+        // Buffer for reading files (If this is made larger, 
+        // the shuffle becomes larger and the phases may vary.
         // If it is too large, the memory consumption will increase.
         // SFEN_READ_SIZE is a multiple of THREAD_BUFFER_SIZE.
         static constexpr const size_t SFEN_READ_SIZE = LEARN_SFEN_READ_SIZE;
@@ -387,7 +409,8 @@ namespace Learner
         // It must be 2**N because it will be used as the mask to calculate hash_index.
         static constexpr uint64_t READ_SFEN_HASH_SIZE = 64 * 1024 * 1024;
 
-        // Do not use std::random_device().  Because it always the same integers on MinGW.
+        // Do not use std::random_device().
+        // Because it always the same integers on MinGW.
         SfenReader(int thread_num) : 
             prng(std::chrono::system_clock::now().time_since_epoch().count())
         {
@@ -460,16 +483,20 @@ namespace Learner
         // [ASYNC] Thread returns one aspect. Otherwise returns false.
         bool read_to_thread_buffer(size_t thread_id, PackedSfenValue& ps)
         {
-            // If there are any positions left in the thread buffer, retrieve one and return it.
+            // If there are any positions left in the thread buffer
+            // then retrieve one and return it.
             auto& thread_ps = packed_sfens[thread_id];
 
-            // Fill the read buffer if there is no remaining buffer, but if it doesn't even exist, finish.
-            if ((thread_ps == nullptr || thread_ps->empty()) // If the buffer is empty, fill it.
+            // Fill the read buffer if there is no remaining buffer, 
+            // but if it doesn't even exist, finish.
+            // If the buffer is empty, fill it.
+            if ((thread_ps == nullptr || thread_ps->empty())
                 && !read_to_thread_buffer_impl(thread_id))
                 return false;
 
             // read_to_thread_buffer_impl() returned true,
-            // Since the filling of the thread buffer with the phase has been completed successfully
+            // Since the filling of the thread buffer with the 
+            // phase has been completed successfully
             // thread_ps->rbegin() is alive.
 
             ps = thread_ps->back();
@@ -511,6 +538,7 @@ namespace Learner
 
                 // Waiting for file worker to fill packed_sfens_pool.
                 // The mutex isn't locked, so it should fill up soon.
+                // Poor man's condition variable.
                 sleep(1);
             }
 
@@ -519,14 +547,14 @@ namespace Learner
         // Start a thread that loads the phase file in the background.
         void start_file_read_worker()
         {
-            file_worker_thread = std::thread([&] { this->file_read_worker(); });
+            file_worker_thread = std::thread([&] { 
+                this->file_read_worker(); 
+                });
         }
 
-        // for file read-only threads
         void file_read_worker()
         {
-            auto open_next_file = [&]()
-            {
+            auto open_next_file = [&]() {
                 if (fs.is_open())
                     fs.close();
 
@@ -569,7 +597,7 @@ namespace Learner
                     }
                     else if(!open_next_file())
                     {
-                        // There was no next file. Abon.
+                        // There was no next file. Abort.
                         cout << "..end of files." << endl;
                         end_of_files = true;
                         return;
@@ -577,8 +605,6 @@ namespace Learner
                 }
 
                 // Shuffle the read phase data.
-                // random shuffle by Fisher-Yates algorithm
-
                 if (!no_shuffle)
                 {
                     Algo::shuffle(sfens, prng);
@@ -597,17 +623,19 @@ namespace Learner
                     // Delete this pointer on the receiving side.
                     auto buf = std::make_unique<PSVector>();
                     buf->resize(THREAD_BUFFER_SIZE);
-                    memcpy(buf->data(), &sfens[i * THREAD_BUFFER_SIZE], sizeof(PackedSfenValue) * THREAD_BUFFER_SIZE);
+                    memcpy(
+                        buf->data(), 
+                        &sfens[i * THREAD_BUFFER_SIZE], 
+                        sizeof(PackedSfenValue) * THREAD_BUFFER_SIZE);
 
                     buffers.emplace_back(std::move(buf));
                 }
 
-                // Since sfens is ready, look at the occasion and copy
                 {
                     std::unique_lock<std::mutex> lk(mutex);
 
-                    // You can ignore this time because you just copy the pointer...
-                    // The mutex lock is required because the contents of packed_sfens_pool are changed.
+                    // The mutex lock is required because the 
+                    // contents of packed_sfens_pool are changed.
 
                     for (auto& buf : buffers)
                         packed_sfens_pool.emplace_back(std::move(buf));
@@ -644,7 +672,7 @@ namespace Learner
 
         bool stop_flag;
 
-        vector<Key> hash; // 64MB*8 = 512MB
+        vector<Key> hash;
 
         // test phase for mse calculation
         PSVector sfen_for_mse;
@@ -660,7 +688,6 @@ namespace Learner
         // Did you read the files and reached the end?
         atomic<bool> end_of_files;
 
-
         // handle of sfen file
         std::fstream fs;
 
@@ -727,7 +754,7 @@ namespace Learner
         uint64_t epoch = 0;
 
         // Mini batch size size. Be sure to set it on the side that uses this class.
-        uint64_t mini_batch_size = 1000 * 1000;
+        uint64_t mini_batch_size = LEARN_MINI_BATCH_SIZE;
 
         bool stop_flag;
 
@@ -740,7 +767,8 @@ namespace Learner
         // Option not to learn kk/kkp/kpp/kppp
         std::array<bool, 4> freeze;
 
-        // If the absolute value of the evaluation value of the deep search of the teacher phase exceeds this value, discard the teacher phase.
+        // If the absolute value of the evaluation value of the deep search 
+        // of the teacher phase exceeds this value, discard the teacher phase.
         int eval_limit;
 
         // Flag whether to dig a folder each time the evaluation function is saved.
@@ -811,7 +839,8 @@ namespace Learner
 
     void LearnerThink::calc_loss(size_t thread_id, uint64_t done)
     {
-        // There is no point in hitting the replacement table, so at this timing the generation of the replacement table is updated.
+        // There is no point in hitting the replacement table, 
+        // so at this timing the generation of the replacement table is updated.
         // It doesn't matter if you have disabled the substitution table.
         TT.new_search();
 
@@ -845,7 +874,8 @@ namespace Learner
         sum_norm = 0;
 #endif
 
-        // The number of times the pv first move of deep search matches the pv first move of search(1).
+        // The number of times the pv first move of deep 
+        // search matches the pv first move of search(1).
         atomic<int> move_accord_count;
         move_accord_count = 0;
 
@@ -856,7 +886,8 @@ namespace Learner
         pos.set(StartFEN, false, &si, th);
         std::cout << "hirate eval = " << Eval::evaluate(pos);
 
-        // It's better to parallelize here, but it's a bit troublesome because the search before slave has not finished.
+        // It's better to parallelize here, but it's a bit 
+        // troublesome because the search before slave has not finished.
         // I created a mechanism to call task, so I will use it.
 
         // The number of tasks to do.
@@ -869,7 +900,8 @@ namespace Learner
         {
             // Assign work to each thread using TaskDispatcher.
             // A task definition for that.
-            // It is not possible to capture pos used in ↑, so specify the variables you want to capture one by one.
+            // It is not possible to capture pos used in ↑, 
+            // so specify the variables you want to capture one by one.
             auto task =
                 [
                     this,
@@ -899,7 +931,8 @@ namespace Learner
                 // Evaluation value of deep search
                 auto deep_value = (Value)ps.score;
 
-                // Note) This code does not consider when eval_limit is specified in the learn command.
+                // Note) This code does not consider when 
+                //       eval_limit is specified in the learn command.
 
                 // --- error calculation
 
@@ -975,14 +1008,16 @@ namespace Learner
             << " , eval mae = " << eval_mae;
 #endif
 
-#if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
+#if defined(LOSS_FUNCTION_IS_ELMO_METHOD)
 #if defined(EVAL_NNUE)
         latest_loss_sum += test_sum_cross_entropy - test_sum_entropy;
         latest_loss_count += sr.sfen_for_mse.size();
 #endif
 
-        // learn_cross_entropy may be called train cross entropy in the world of machine learning,
-        // When omitting the acronym, it is nice to be able to distinguish it from test cross entropy(tce) by writing it as lce.
+        // learn_cross_entropy may be called train cross 
+        // entropy in the world of machine learning,
+        // When omitting the acronym, it is nice to be able to 
+        // distinguish it from test cross entropy(tce) by writing it as lce.
 
         if (sr.sfen_for_mse.size() && done)
         {
@@ -1074,7 +1109,9 @@ namespace Learner
                     // Output the current time. Output every time.
                     std::cout << sr.total_done << " sfens , at " << now_string() << std::endl;
 
-                    // Reflect the gradient in the weight array at this timing. The calculation of the gradient is just right for each 1M phase in terms of mini-batch.
+                    // Reflect the gradient in the weight array at this timing. 
+                    // The calculation of the gradient is just right for 
+                    // each 1M phase in terms of mini-batch.
                     Eval::update_weights(epoch, freeze);
 
                     // Display epoch and current eta for debugging.
@@ -1090,14 +1127,13 @@ namespace Learner
 #endif
                     ++epoch;
 
-                    // Save once every 1 billion phases.
-
                     // However, the elapsed time during update_weights() and calc_rmse() is ignored.
                     if (++sr.save_count * mini_batch_size >= eval_save_interval)
                     {
                         sr.save_count = 0;
 
-                        // During this time, as the gradient calculation proceeds, the value becomes too large and I feel annoyed, so stop other threads.
+                        // During this time, as the gradient calculation proceeds, 
+                        // the value becomes too large and I feel annoyed, so stop other threads.
                         const bool converged = save();
                         if (converged)
                         {
@@ -1109,7 +1145,6 @@ namespace Learner
 
                     // Calculate rmse. This is done for samples of 10,000 phases.
                     // If you do with 40 cores, update_weights every 1 million phases
-                    // I don't think it's so good to be tiring.
                     static uint64_t loss_output_count = 0;
                     if (++loss_output_count * mini_batch_size >= loss_output_interval)
                     {
@@ -1129,10 +1164,12 @@ namespace Learner
                         sr.last_done = sr.total_done;
                     }
 
-                    // Next time, I want you to do this series of processing again when you process only mini_batch_size.
+                    // Next time, I want you to do this series of 
+                    // processing again when you process only mini_batch_size.
                     sr.next_update_weights += mini_batch_size;
 
-                    // Since I was waiting for the update of this sr.next_update_weights except the main thread,
+                    // Since I was waiting for the update of this 
+                    // sr.next_update_weights except the main thread,
                     // Once this value is updated, it will start moving again.
                 }
             }
@@ -1173,7 +1210,8 @@ namespace Learner
             if (pos.set_from_packed_sfen(ps.sfen, &si, th, mirror) != 0)
             {
                 // I got a strange sfen. Should be debugged!
-                // Since it is an illegal sfen, it may not be displayed with pos.sfen(), but it is better than not.
+                // Since it is an illegal sfen, it may not be 
+                // displayed with pos.sfen(), but it is better than not.
                 cout << "Error! : illigal packed sfen = " << pos.fen() << endl;
                 goto RETRY_READ;
             }
@@ -1198,9 +1236,11 @@ namespace Learner
 #endif
 
             // There is a possibility that all the pieces are blocked and stuck.
-            // Also, the declaration win phase is excluded from learning because you cannot go to leaf with PV moves.
-            // (shouldn't write out such teacher aspect itself, but may have written it out with an old generation routine)
-        // Skip the position if there are no legal moves (=checkmated or stalemate).
+            // Also, the declaration win phase is excluded from 
+            // learning because you cannot go to leaf with PV moves.
+            // (shouldn't write out such teacher aspect itself, 
+            // but may have written it out with an old generation routine)
+            // Skip the position if there are no legal moves (=checkmated or stalemate).
             if (MoveList<LEGAL>(pos).size() == 0)
                 goto RETRY_READ;
 
@@ -1214,7 +1254,8 @@ namespace Learner
             const auto deep_value = (Value)ps.score;
 
             // I feel that the mini batch has a better gradient.
-            // Go to the leaf node as it is, add only to the gradient array, and later try AdaGrad at the time of rmse aggregation.
+            // Go to the leaf node as it is, add only to the gradient array, 
+            // and later try AdaGrad at the time of rmse aggregation.
 
             const auto rootColor = pos.side_to_move();
 
@@ -1223,23 +1264,25 @@ namespace Learner
             // It may be better not to study where the difference in evaluation values ​​is too large.
 
 #if 0
-        // If you do this, about 13% of the phases will be excluded from the learning target. Good and bad are subtle.
+            // If you do this, about 13% of the phases will be excluded 
+            // from the learning target. Good and bad are subtle.
             if (pv.size() >= 1 && (uint16_t)pv[0] != ps.move)
             {
-                // dbg_hit_on(false);
+                //dbg_hit_on(false);
                 continue;
             }
 #endif
 
 #if 0
             // It may be better not to study where the difference in evaluation values ​​is too large.
-            // → It's okay because it passes the win rate function... About 30% of the phases are out of the scope of learning...
+            // → It's okay because it passes the win rate function... 
+            // About 30% of the phases are out of the scope of learning...
             if (abs((int16_t)r.first - ps.score) >= Eval::PawnValue * 4)
             {
-                //          dbg_hit_on(false);
+                //dbg_hit_on(false);
                 continue;
             }
-            //      dbg_hit_on(true);
+            //dbg_hit_on(true);
 #endif
 
             int ply = 0;
@@ -1248,9 +1291,12 @@ namespace Learner
             auto pos_add_grad = [&]() {
                 // Use the value of evaluate in leaf as shallow_value.
                 // Using the return value of qsearch() as shallow_value,
-                // If PV is interrupted in the middle, the phase where evaluate() is called to calculate the gradient, and
-                // I don't think this is a very desirable property, as the aspect that gives that gradient will be different.
-                // I have turned off the substitution table, but since the pv array has not been updated due to one stumbling block etc...
+                // If PV is interrupted in the middle, the phase where 
+                // evaluate() is called to calculate the gradient, 
+                // and I don't think this is a very desirable property, 
+                // as the aspect that gives that gradient will be different.
+                // I have turned off the substitution table, but since 
+                // the pv array has not been updated due to one stumbling block etc...
 
                 const Value shallow_value = 
                     (rootColor == pos.side_to_move()) 
@@ -1284,7 +1330,8 @@ namespace Learner
                 // Slope
                 double dj_dw = calc_grad(deep_value, shallow_value, ps);
 
-                // Add jd_dw as the gradient (∂J/∂Wj) for the feature vector currently appearing in the leaf node.
+                // Add jd_dw as the gradient (∂J/∂Wj) for the 
+                // feature vector currently appearing in the leaf node.
 
                 // If it is not PV termination, apply a discount rate.
                 if (discount_rate != 0 && ply != (int)pv.size())
@@ -1330,7 +1377,7 @@ namespace Learner
 
             if (illegal_move) 
             {
-                sync_cout << "An illical move was detected... Excluded the position from the learning data..." << sync_endl;
+                sync_cout << "An illegal move was detected... Excluded the position from the learning data..." << sync_endl;
                 continue;
             }
 
@@ -1343,7 +1390,11 @@ namespace Learner
 
 #if 0
             // When adding the gradient to the root phase
-            shallow_value = (rootColor == pos.side_to_move()) ? Eval::evaluate(pos) : -Eval::evaluate(pos);
+            shallow_value = 
+                (rootColor == pos.side_to_move()) 
+                ? Eval::evaluate(pos) 
+                : -Eval::evaluate(pos);
+
             dj_dw = calc_grad(deep_value, shallow_value, ps);
             Eval::add_grad(pos, rootColor, dj_dw, without_kpp);
 #endif
@@ -1426,10 +1477,14 @@ namespace Learner
 
     // Shuffle_files(), shuffle_files_quick() subcontracting, writing part.
     // output_file_name: Name of the file to write
-    // prng: random number
-    // afs: fstream of each teacher phase file
-    // a_count: The number of teacher positions inherent in each file.
-    void shuffle_write(const string& output_file_name, PRNG& prng, vector<fstream>& sfen_file_streams, vector<uint64_t>& sfen_count_in_file)
+    // prng: random number generator
+    // sfen_file_streams: fstream of each teacher phase file
+    // sfen_count_in_file: The number of teacher positions present in each file.
+    void shuffle_write(
+        const string& output_file_name, 
+        PRNG& prng, 
+        vector<fstream>& sfen_file_streams, 
+        vector<uint64_t>& sfen_count_in_file)
     {
         uint64_t total_sfen_count = 0;
         for (auto c : sfen_count_in_file)
@@ -1502,7 +1557,8 @@ namespace Learner
         // Temporary file is written to tmp/ folder for each buffer_size phase.
         // For example, if buffer_size = 20M, you need a buffer of 20M*40bytes = 800MB.
         // In a PC with a small memory, it would be better to reduce this.
-        // However, if the number of files increases too much, it will not be possible to open at the same time due to OS restrictions.
+        // However, if the number of files increases too much, 
+        // it will not be possible to open at the same time due to OS restrictions.
         // There should have been a limit of 512 per process on Windows, so you can open here as 500,
         // The current setting is 500 files x 20M = 10G = 10 billion phases.
 
@@ -1555,19 +1611,23 @@ namespace Learner
 
             // Read in units of sizeof(PackedSfenValue),
             // Ignore the last remaining fraction. (Fails in fs.read, so exit while)
-            // (The remaining fraction seems to be half-finished data that was created because it was stopped halfway during teacher generation.)
+            // (The remaining fraction seems to be half-finished data 
+            // that was created because it was stopped halfway during teacher generation.)
         }
 
         if (buf_write_marker != 0)
             write_buffer(buf_write_marker);
 
         // Only shuffled files have been written write_file_count.
-        // As a second pass, if you open all of them at the same time, select one at random and load one phase at a time
+        // As a second pass, if you open all of them at the same time, 
+        // select one at random and load one phase at a time
         // Now you have shuffled.
 
-        // Original file for shirt full + tmp file + file to write requires 3 times the storage capacity of the original file.
+        // Original file for shirt full + tmp file + file to write 
+        // requires 3 times the storage capacity of the original file.
         // 1 billion SSD is not enough for shuffling because it is 400GB for 10 billion phases.
-        // If you want to delete (or delete by hand) the original file at this point after writing to tmp,
+        // If you want to delete (or delete by hand) the 
+        // original file at this point after writing to tmp,
         // The storage capacity is about twice that of the original file.
         // So, maybe we should have an option to delete the original file.
 
@@ -1592,7 +1652,7 @@ namespace Learner
         PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
 
         // number of files
-        size_t file_count = filenames.size();
+        const size_t file_count = filenames.size();
 
         // Number of teacher positions stored in each file in filenames
         vector<uint64_t> sfen_count_in_file(file_count);
@@ -1651,7 +1711,8 @@ namespace Learner
 
         std::cout << "write : " << output_file_name << endl;
 
-        // If the file to be written exceeds 2GB, it cannot be written in one shot with fstream::write, so use wrapper.
+        // If the file to be written exceeds 2GB, it cannot be 
+        // written in one shot with fstream::write, so use wrapper.
         write_memory_to_file(
             output_file_name, 
             (void*)&buf[0], 
@@ -1703,9 +1764,11 @@ namespace Learner
         uint64_t buffer_size = 20000000;
         // fast shuffling assuming each file is shuffled
         bool shuffle_quick = false;
-        // A function to read the entire file in memory and shuffle it. (Requires file size memory)
+        // A function to read the entire file in memory and shuffle it. 
+        // (Requires file size memory)
         bool shuffle_on_memory = false;
-        // Conversion of packed sfen. In plain, it consists of sfen(string), evaluation value (integer), move (eg 7g7f, string), result (loss-1, win 1, draw 0)
+        // Conversion of packed sfen. In plain, it consists of sfen(string), 
+        // evaluation value (integer), move (eg 7g7f, string), result (loss-1, win 1, draw 0)
         bool use_convert_plain = false;
         // convert plain format teacher to Yaneura King's bin
         bool use_convert_bin = false;
@@ -1721,13 +1784,16 @@ namespace Learner
         // File name to write in those cases (default is "shuffled_sfen.bin")
         string output_file_name = "shuffled_sfen.bin";
 
-        // If the absolute value of the evaluation value in the deep search of the teacher phase exceeds this value, that phase is discarded.
+        // If the absolute value of the evaluation value 
+        // in the deep search of the teacher phase exceeds this value, 
+        // that phase is discarded.
         int eval_limit = 32000;
 
         // Flag to save the evaluation function file only once near the end.
         bool save_only_once = false;
 
-        // Shuffle about what you are pre-reading on the teacher aspect. (Shuffle of about 10 million phases)
+        // Shuffle about what you are pre-reading on the teacher aspect. 
+        // (Shuffle of about 10 million phases)
         // Turn on if you want to pass a pre-shuffled file.
         bool no_shuffle = false;
 
@@ -1738,7 +1804,9 @@ namespace Learner
         ELMO_LAMBDA_LIMIT = 32000;
 #endif
 
-        // Discount rate. If this is set to a value other than 0, the slope will be added even at other than the PV termination. (At that time, apply this discount rate)
+        // Discount rate. If this is set to a value other than 0, 
+        // the slope will be added even at other than the PV termination. 
+        // (At that time, apply this discount rate)
         double discount_rate = 0;
 
         // if (gamePly <rand(reduction_gameply)) continue;
@@ -1797,15 +1865,27 @@ namespace Learner
             else if (option == "eta3")       is >> eta3;
             else if (option == "eta1_epoch") is >> eta1_epoch;
             else if (option == "eta2_epoch") is >> eta2_epoch;
+
             // Accept also the old option name.
-            else if (option == "use_draw_in_training" || option == "use_draw_games_in_training") is >> use_draw_games_in_training;
+            else if (option == "use_draw_in_training" 
+                  || option == "use_draw_games_in_training") 
+                is >> use_draw_games_in_training;
+
             // Accept also the old option name.
-            else if (option == "use_draw_in_validation" || option == "use_draw_games_in_validation") is >> use_draw_games_in_validation;
+            else if (option == "use_draw_in_validation" 
+                  || option == "use_draw_games_in_validation") 
+                is >> use_draw_games_in_validation;
+
             // Accept also the old option name.
-            else if (option == "use_hash_in_training" || option == "skip_duplicated_positions_in_training") is >> skip_duplicated_positions_in_training;
+            else if (option == "use_hash_in_training" 
+                  || option == "skip_duplicated_positions_in_training") 
+                is >> skip_duplicated_positions_in_training;
+
             else if (option == "winning_probability_coefficient") is >> winning_probability_coefficient;
+
             // Discount rate
             else if (option == "discount_rate") is >> discount_rate;
+
             // Using WDL with win rate model instead of sigmoid
             else if (option == "use_wdl") is >> use_wdl;
 
@@ -1873,8 +1953,11 @@ namespace Learner
             else
                 filenames.push_back(option);
         }
+
         if (loss_output_interval == 0)
+        {
             loss_output_interval = LEARN_RMSE_OUTPUT_INTERVAL * mini_batch_size;
+        }
 
         cout << "learn command , ";
 
@@ -1900,6 +1983,7 @@ namespace Learner
         cout << "learn from ";
         for (auto s : filenames)
             cout << s << " , ";
+
         cout << endl;
         if (!validation_set_file_name.empty())
         {
@@ -1917,18 +2001,21 @@ namespace Learner
             shuffle_files(filenames, output_file_name, buffer_size);
             return;
         }
+
         if (shuffle_quick)
         {
             cout << "quick shuffle mode.." << endl;
             shuffle_files_quick(filenames, output_file_name);
             return;
         }
+
         if (shuffle_on_memory)
         {
             cout << "shuffle on memory.." << endl;
             shuffle_files_on_memory(filenames, output_file_name);
             return;
         }
+
         if (use_convert_plain)
         {
             Eval::init_NNUE();
@@ -1936,6 +2023,7 @@ namespace Learner
             convert_plain(filenames, output_file_name);
             return;
         }
+
         if (use_convert_bin)
         {
             Eval::init_NNUE();
@@ -1956,6 +2044,7 @@ namespace Learner
             return;
 
         }
+
         if (use_convert_bin_from_pgn_extract)
         {
             Eval::init_NNUE();
@@ -1976,15 +2065,21 @@ namespace Learner
 
         // Insert the file name for the number of loops.
         for (int i = 0; i < loop; ++i)
-            // sfen reader, I'll read it in reverse order so I'll reverse it here. I'm sorry.
+        {
+            // sfen reader, I'll read it in reverse 
+            // order so I'll reverse it here. I'm sorry.
             for (auto it = filenames.rbegin(); it != filenames.rend(); ++it)
+            {
                 sr.filenames.push_back(Path::Combine(base_dir, *it));
+            }
+        }
 
 #if !defined(EVAL_NNUE)
         cout << "Gradient Method   : " << LEARN_UPDATE << endl;
 #endif
         cout << "Loss Function     : " << LOSS_FUNCTION << endl;
         cout << "mini-batch size   : " << mini_batch_size << endl;
+
 #if defined(EVAL_NNUE)
         cout << "nn_batch_size     : " << nn_batch_size << endl;
         cout << "nn_options        : " << nn_options << endl;
@@ -1994,6 +2089,7 @@ namespace Learner
         cout << "use_draw_games_in_training : " << use_draw_games_in_training << endl;
         cout << "use_draw_games_in_validation : " << use_draw_games_in_validation << endl;
         cout << "skip_duplicated_positions_in_training : " << skip_duplicated_positions_in_training << endl;
+
 #if defined(EVAL_NNUE)
         if (newbob_decay != 1.0) {
             cout << "scheduling        : newbob with decay = " << newbob_decay
@@ -2003,6 +2099,7 @@ namespace Learner
             cout << "scheduling        : default" << endl;
         }
 #endif
+
         cout << "discount rate     : " << discount_rate << endl;
 
         // If reduction_gameply is set to 0, rand(0) will be divided by 0, so correct it to 1.
@@ -2014,6 +2111,7 @@ namespace Learner
         cout << "LAMBDA2           : " << ELMO_LAMBDA2 << endl;
         cout << "LAMBDA_LIMIT      : " << ELMO_LAMBDA_LIMIT << endl;
 #endif
+
         cout << "mirror_percentage : " << mirror_percentage << endl;
         cout << "eval_save_interval  : " << eval_save_interval << " sfens" << endl;
         cout << "loss_output_interval: " << loss_output_interval << " sfens" << endl;
@@ -2071,11 +2169,13 @@ namespace Learner
         learn_think.sr.no_shuffle = no_shuffle;
         learn_think.freeze = freeze;
         learn_think.reduction_gameply = reduction_gameply;
+
 #if defined(EVAL_NNUE)
         learn_think.newbob_scale = 1.0;
         learn_think.newbob_decay = newbob_decay;
         learn_think.newbob_num_trials = newbob_num_trials;
 #endif
+
         learn_think.eval_save_interval = eval_save_interval;
         learn_think.loss_output_interval = loss_output_interval;
         learn_think.mirror_percentage = mirror_percentage;
@@ -2086,16 +2186,19 @@ namespace Learner
 
         learn_think.mini_batch_size = mini_batch_size;
 
-        if (validation_set_file_name.empty()) {
+        if (validation_set_file_name.empty()) 
+        {
             // Get about 10,000 data for mse calculation.
             sr.read_for_mse();
         }
-        else {
+        else 
+        {
             sr.read_validation_set(validation_set_file_name, eval_limit);
         }
 
         // Calculate rmse once at this point (timing of 0 sfen)
         // sr.calc_rmse();
+
 #if defined(EVAL_NNUE)
         if (newbob_decay != 1.0) {
             learn_think.calc_loss(0, -1);
diff --git a/src/misc.h b/src/misc.h
index 5add3b36..4c04d3f0 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -226,6 +226,7 @@ namespace Math {
 }
 
 namespace Algo {
+    // Fisher-Yates
     template <typename Rng, typename T>
     void shuffle(std::vector<T>& buf, Rng&& prng)
     {

From fc27d158c012341593518a05abf51903ecbcb495 Mon Sep 17 00:00:00 2001
From: syzygy1 <3028851+syzygy1@users.noreply.github.com>
Date: Sun, 6 Sep 2020 17:29:12 +0200
Subject: [PATCH 045/398] Bug fix in do_null_move() and NNUE simplification.

This fixes #3108 and removes some NNUE code that is currently not used.

At the moment, do_null_move() copies the accumulator from the previous
state into the new state, which is correct. It then clears the "computed_score"
flag because the side to move has changed, and with the other side to move
NNUE will return a completely different evaluation (normally with changed
sign but also with different NNUE-internal tempo bonus).

The problem is that do_null_move() clears the wrong flag. It clears the
computed_score flag of the old state, not of the new state. It turns out
that this almost never affects the search. For example, fixing it does not
change the current bench (but it does change the previous bench). This is
because the search code usually avoids calling evaluate() after a null move.

This PR corrects do_null_move() by removing the computed_score flag altogether.
The flag is not needed because nnue_evaluate() is never called twice on a position.

This PR also removes some unnecessary {}s and inserts a few blank lines
in the modified NNUE files in line with SF coding style.

Resulf ot STC non-regression test:
LLR: 2.95 (-2.94,2.94) {-1.25,0.25}
Total: 26328 W: 3118 L: 3012 D: 20198
Ptnml(0-2): 126, 2208, 8397, 2300, 133
https://tests.stockfishchess.org/tests/view/5f553ccc2d02727c56b36db1

closes https://github.com/official-stockfish/Stockfish/pull/3109

bench: 4109324
---
 src/nnue/evaluate_nnue.cpp          | 38 ++-----------------
 src/nnue/nnue_accumulator.h         |  2 -
 src/nnue/nnue_feature_transformer.h | 58 +++++++++++++----------------
 src/position.cpp                    |  2 -
 4 files changed, 29 insertions(+), 71 deletions(-)

diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index d6ac9894..ed138881 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -115,31 +115,16 @@ namespace Eval::NNUE {
     return stream && stream.peek() == std::ios::traits_type::eof();
   }
 
-  // Proceed with the difference calculation if possible
-  static void UpdateAccumulatorIfPossible(const Position& pos) {
-
-    feature_transformer->UpdateAccumulatorIfPossible(pos);
-  }
-
-  // Calculate the evaluation value
-  static Value ComputeScore(const Position& pos, bool refresh) {
-
-    auto& accumulator = pos.state()->accumulator;
-    if (!refresh && accumulator.computed_score) {
-      return accumulator.score;
-    }
+  // Evaluation function. Perform differential calculation.
+  Value evaluate(const Position& pos) {
 
     alignas(kCacheLineSize) TransformedFeatureType
         transformed_features[FeatureTransformer::kBufferSize];
-    feature_transformer->Transform(pos, transformed_features, refresh);
+    feature_transformer->Transform(pos, transformed_features);
     alignas(kCacheLineSize) char buffer[Network::kBufferSize];
     const auto output = network->Propagate(transformed_features, buffer);
 
-    auto score = static_cast<Value>(output[0] / FV_SCALE);
-
-    accumulator.score = score;
-    accumulator.computed_score = true;
-    return accumulator.score;
+    return static_cast<Value>(output[0] / FV_SCALE);
   }
 
   // Load eval, from a file stream or a memory stream
@@ -150,19 +135,4 @@ namespace Eval::NNUE {
     return ReadParameters(stream);
   }
 
-  // Evaluation function. Perform differential calculation.
-  Value evaluate(const Position& pos) {
-    return ComputeScore(pos, false);
-  }
-
-  // Evaluation function. Perform full calculation.
-  Value compute_eval(const Position& pos) {
-    return ComputeScore(pos, true);
-  }
-
-  // Proceed with the difference calculation if possible
-  void update_eval(const Position& pos) {
-    UpdateAccumulatorIfPossible(pos);
-  }
-
 } // namespace Eval::NNUE
diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h
index 69dfaad2..26370710 100644
--- a/src/nnue/nnue_accumulator.h
+++ b/src/nnue/nnue_accumulator.h
@@ -29,9 +29,7 @@ namespace Eval::NNUE {
   struct alignas(kCacheLineSize) Accumulator {
     std::int16_t
         accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
-    Value score;
     bool computed_accumulation;
-    bool computed_score;
   };
 
 }  // namespace Eval::NNUE
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 43707610..2b6259c3 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -50,11 +50,13 @@ namespace Eval::NNUE {
 
     // Hash value embedded in the evaluation file
     static constexpr std::uint32_t GetHashValue() {
+
       return RawFeatures::kHashValue ^ kOutputDimensions;
     }
 
     // Read network parameters
     bool ReadParameters(std::istream& stream) {
+
       for (std::size_t i = 0; i < kHalfDimensions; ++i)
         biases_[i] = read_little_endian<BiasType>(stream);
       for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
@@ -64,23 +66,26 @@ namespace Eval::NNUE {
 
     // Proceed with the difference calculation if possible
     bool UpdateAccumulatorIfPossible(const Position& pos) const {
+
       const auto now = pos.state();
-      if (now->accumulator.computed_accumulation) {
+      if (now->accumulator.computed_accumulation)
         return true;
-      }
+
       const auto prev = now->previous;
       if (prev && prev->accumulator.computed_accumulation) {
         UpdateAccumulator(pos);
         return true;
       }
+
       return false;
     }
 
     // Convert input features
-    void Transform(const Position& pos, OutputType* output, bool refresh) const {
-      if (refresh || !UpdateAccumulatorIfPossible(pos)) {
+    void Transform(const Position& pos, OutputType* output) const {
+
+      if (!UpdateAccumulatorIfPossible(pos))
         RefreshAccumulator(pos);
-      }
+
       const auto& accumulation = pos.state()->accumulator.accumulation;
 
   #if defined(USE_AVX2)
@@ -177,6 +182,7 @@ namespace Eval::NNUE {
    private:
     // Calculate cumulative value without using difference calculation
     void RefreshAccumulator(const Position& pos) const {
+
       auto& accumulator = pos.state()->accumulator;
       IndexType i = 0;
       Features::IndexList active_indices[2];
@@ -216,9 +222,8 @@ namespace Eval::NNUE {
               &accumulator.accumulation[perspective][i][0]);
           auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
           constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j) {
+          for (IndexType j = 0; j < kNumChunks; ++j)
             accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
-          }
 
   #elif defined(USE_NEON)
           auto accumulation = reinterpret_cast<int16x8_t*>(
@@ -240,11 +245,11 @@ namespace Eval::NNUE {
   #endif
 
       accumulator.computed_accumulation = true;
-      accumulator.computed_score = false;
     }
 
     // Calculate cumulative value using difference calculation
     void UpdateAccumulator(const Position& pos) const {
+
       const auto prev_accumulator = pos.state()->previous->accumulator;
       auto& accumulator = pos.state()->accumulator;
       IndexType i = 0;
@@ -288,33 +293,27 @@ namespace Eval::NNUE {
 
   #if defined(USE_AVX2)
             auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
+            for (IndexType j = 0; j < kNumChunks; ++j)
               accumulation[j] = _mm256_sub_epi16(accumulation[j], column[j]);
-            }
 
   #elif defined(USE_SSE2)
             auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
+            for (IndexType j = 0; j < kNumChunks; ++j)
               accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
-            }
 
   #elif defined(USE_MMX)
             auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
+            for (IndexType j = 0; j < kNumChunks; ++j)
               accumulation[j] = _mm_sub_pi16(accumulation[j], column[j]);
-            }
 
   #elif defined(USE_NEON)
             auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
+            for (IndexType j = 0; j < kNumChunks; ++j)
               accumulation[j] = vsubq_s16(accumulation[j], column[j]);
-            }
 
   #else
-            for (IndexType j = 0; j < kHalfDimensions; ++j) {
-              accumulator.accumulation[perspective][i][j] -=
-                  weights_[offset + j];
-            }
+            for (IndexType j = 0; j < kHalfDimensions; ++j)
+              accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
   #endif
 
           }
@@ -325,33 +324,27 @@ namespace Eval::NNUE {
 
   #if defined(USE_AVX2)
             auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
+            for (IndexType j = 0; j < kNumChunks; ++j)
               accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
-            }
 
   #elif defined(USE_SSE2)
             auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
+            for (IndexType j = 0; j < kNumChunks; ++j)
               accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
-            }
 
   #elif defined(USE_MMX)
             auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
+            for (IndexType j = 0; j < kNumChunks; ++j)
               accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
-            }
 
   #elif defined(USE_NEON)
             auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j) {
+            for (IndexType j = 0; j < kNumChunks; ++j)
               accumulation[j] = vaddq_s16(accumulation[j], column[j]);
-            }
 
   #else
-            for (IndexType j = 0; j < kHalfDimensions; ++j) {
-              accumulator.accumulation[perspective][i][j] +=
-                  weights_[offset + j];
-            }
+            for (IndexType j = 0; j < kHalfDimensions; ++j)
+              accumulator.accumulation[perspective][i][j] += weights_[offset + j];
   #endif
 
           }
@@ -362,7 +355,6 @@ namespace Eval::NNUE {
   #endif
 
       accumulator.computed_accumulation = true;
-      accumulator.computed_score = false;
     }
 
     using BiasType = std::int16_t;
diff --git a/src/position.cpp b/src/position.cpp
index fe89b753..e6a760d2 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -704,7 +704,6 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
 
   // Used by NNUE
   st->accumulator.computed_accumulation = false;
-  st->accumulator.computed_score = false;
   auto& dp = st->dirtyPiece;
   dp.dirty_num = 1;
 
@@ -1000,7 +999,6 @@ void Position::do_null_move(StateInfo& newSt) {
   if (Eval::useNNUE)
   {
       std::memcpy(&newSt, st, sizeof(StateInfo));
-      st->accumulator.computed_score = false;
   }
   else
       std::memcpy(&newSt, st, offsetof(StateInfo, accumulator));

From d2562cde12cdcc3df654279d6d632ae74c5f71af Mon Sep 17 00:00:00 2001
From: Gian-Carlo Pascutto <gcp@sjeng.org>
Date: Tue, 8 Sep 2020 15:37:53 +0200
Subject: [PATCH 046/398] Always re-enable NNUE after "bench".

Restore the default NNUE setting (enabled) after a bench command.
This also makes the resulting program settings independent of the
number of FENs that are being benched.

Fixes issue #3112.

closes https://github.com/official-stockfish/Stockfish/pull/3113

No functional change.
---
 src/benchmark.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/benchmark.cpp b/src/benchmark.cpp
index 806e9840..ffb631a2 100644
--- a/src/benchmark.cpp
+++ b/src/benchmark.cpp
@@ -164,5 +164,7 @@ vector<string> setup_bench(const Position& current, istream& is) {
           ++posCounter;
       }
 
+  list.emplace_back("setoption name Use NNUE value true");
+
   return list;
 }

From 0405f3540366cc16245d51531881c55d3726c8b5 Mon Sep 17 00:00:00 2001
From: SFisGOD <jonathandumale@gmail.com>
Date: Mon, 7 Sep 2020 04:54:26 +0800
Subject: [PATCH 047/398] Double probability of using classical eval

This patch doubles the moderate imbalance threshold and probability of using classical eval.
So now if imbalance is greater than PawnValueMg / 4 then there is a 1/8 chance of using classical eval.

STC:
LLR: 2.93 (-2.94,2.94) {-0.25,1.25}
Total: 10984 W: 1303 L: 1140 D: 8541
Ptnml(0-2): 58, 867, 3489, 1010, 68
https://tests.stockfishchess.org/tests/view/5f554c9f97da2d5437d3813e

LTC:
LLR: 2.95 (-2.94,2.94) {0.25,1.25}
Total: 43064 W: 2476 L: 2276 D: 38312
Ptnml(0-2): 37, 1985, 17308, 2145, 57
https://tests.stockfishchess.org/tests/view/5f55690a00a0aa2ca79f0a43

closes https://github.com/official-stockfish/Stockfish/pull/3114

Bench: 4161067
---
 src/evaluate.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index db8379da..faf71d27 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -1015,10 +1015,13 @@ make_v:
 
 Value Eval::evaluate(const Position& pos) {
 
+  // Use classical eval if there is a large imbalance
+  // If there is a moderate imbalance, use classical eval with probability (1/8),
+  // as derived from the node counter.
   bool useClassical = abs(eg_value(pos.psq_score())) * 16 > NNUEThreshold1 * (16 + pos.rule50_count());
   bool classical = !Eval::useNNUE
                 ||  useClassical
-                || (abs(eg_value(pos.psq_score())) > PawnValueMg / 8 && !(pos.this_thread()->nodes & 0xF));
+                || (abs(eg_value(pos.psq_score())) > PawnValueMg / 4 && !(pos.this_thread()->nodes & 0xB));
   Value v = classical ? Evaluation<NO_TRACE>(pos).value()
                       : NNUE::evaluate(pos) * 5 / 4 + Tempo;
 

From d21424c8d3af0f63e6317ebd0a727114442248e0 Mon Sep 17 00:00:00 2001
From: noobpwnftw <guo.bojun@gmail.com>
Date: Tue, 8 Sep 2020 09:35:53 +0800
Subject: [PATCH 048/398] test

---
 README.md                              |   5 +-
 src/Makefile                           |   3 +-
 src/evaluate.cpp                       |  52 ++++----
 src/learn/gensfen.cpp                  | 170 +++----------------------
 src/learn/gensfen2019.cpp              |   1 -
 src/learn/learner.cpp                  |  25 ----
 src/nnue/features/enpassant.cpp        |   2 +-
 src/nnue/features/half_kp.cpp          |   4 +-
 src/nnue/features/half_relative_kp.cpp |   4 +-
 src/nnue/features/k.cpp                |   4 +-
 src/nnue/features/p.cpp                |   4 +-
 src/nnue/nnue_common.h                 |   2 +-
 src/search.cpp                         |  17 +--
 src/tt.cpp                             |   4 +-
 src/ucioption.cpp                      |   2 +-
 15 files changed, 61 insertions(+), 238 deletions(-)
 delete mode 100644 src/learn/gensfen2019.cpp

diff --git a/README.md b/README.md
index 6d28a998..0dcce0a6 100644
--- a/README.md
+++ b/README.md
@@ -17,12 +17,10 @@ setoption name Threads value x
 setoption name Hash value y
 setoption name SyzygyPath value path
 isready
-gensfen depth a loop b use_draw_in_training_data_generation 1 eval_limit 32000 use_raw_nnue_eval 0
+gensfen depth a loop b use_draw_in_training_data_generation 1 eval_limit 32000
 ```
 Specify how many threads and how much memory you would like to use with the x and y values. The option SyzygyPath is not necessary, but if you would like to use it, you must first have Syzygy endgame tablebases on your computer, which you can find [here](http://oics.olympuschess.com/tracker/index.php). You will need to have a torrent client to download these tablebases, as that is probably the fastest way to obtain them. The path is the path to the folder containing those tablebases. It does not have to be surrounded in quotes.
 
-use_raw_nnue_eval controls if the training data generator or trainer uses raw NNUE eval values.  Don't forget to set use_raw_nnue_eval 0 when initial training data are generated.  Otherwise, the gensfen command will crash.
-
 This will save a file named "generated_kifu.bin" in the same folder as the binary. Once generation is done, rename the file to something like "1billiondepth12.bin" to remember the depth and quantity of the positions and move it to a folder named "trainingdata" in the same directory as the binaries.
 #### Generation Parameters
 - Depth is the searched depth per move, or how far the engine looks forward. This value is an integer.
@@ -34,6 +32,7 @@ Use the "learn" binary. Create an empty folder named "evalsave" in the same dire
 ```
 uci
 setoption name SkipLoadingEval value true
+setoption name Training value true
 setoption name Use NNUE value true
 setoption name Threads value x
 isready
diff --git a/src/Makefile b/src/Makefile
index 9db13e44..4f8801ee 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -56,7 +56,6 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	nnue/features/enpassant.cpp \
 	nnue/nnue_test_command.cpp \
 	extra/sfen_packer.cpp \
-	learn/gensfen2019.cpp \
 	learn/learner.cpp \
 	learn/gensfen.cpp \
 	learn/convert.cpp \
@@ -908,7 +907,7 @@ learn: config-sanity
 	EXTRALDFLAGS=' $(BLASLDFLAGS) -fopenmp  ' \
 	all
 
-profile-learn: net config-sanity objclean profileclean
+profile-learn: config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 8edc9bb8..9dd83e1f 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -32,13 +32,6 @@
 #include "thread.h"
 #include "uci.h"
 
-#ifdef EVAL_LEARN
-namespace Learner
-{
-    extern bool use_raw_nnue_eval;
-}
-#endif
-
 namespace Eval {
 
   bool useNNUE;
@@ -947,27 +940,32 @@ make_v:
 /// evaluation of the position from the point of view of the side to move.
 
 Value Eval::evaluate(const Position& pos) {
-#ifdef EVAL_LEARN
-  if (Learner::use_raw_nnue_eval) {
-      return NNUE::evaluate(pos);
+  if (Options["Training"]) {
+    Value v = NNUE::evaluate(pos);
+    // Damp down the evaluation linearly when shuffling
+    v = v * (100 - pos.rule50_count()) / 100;
+
+    // Guarantee evaluation does not hit the tablebase range
+    v = std::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1);
+
+    return v;
+  } else {
+    bool classical = !Eval::useNNUE
+                  ||  abs(eg_value(pos.psq_score())) * 16 > NNUEThreshold1 * (16 + pos.rule50_count());
+    Value v = classical ? Evaluation<NO_TRACE>(pos).value()
+                        : NNUE::evaluate(pos) * 5 / 4 + Tempo;
+
+    if (classical && Eval::useNNUE && abs(v) * 16 < NNUEThreshold2 * (16 + pos.rule50_count()))
+        v = NNUE::evaluate(pos) * 5 / 4 + Tempo;
+
+    // Damp down the evaluation linearly when shuffling
+    v = v * (100 - pos.rule50_count()) / 100;
+
+    // Guarantee evaluation does not hit the tablebase range
+    v = std::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1);
+
+    return v;
   }
-#endif
-
-  bool classical = !Eval::useNNUE
-                ||  abs(eg_value(pos.psq_score())) * 16 > NNUEThreshold1 * (16 + pos.rule50_count());
-  Value v = classical ? Evaluation<NO_TRACE>(pos).value()
-                      : NNUE::evaluate(pos) * 5 / 4 + Tempo;
-
-  if (classical && Eval::useNNUE && abs(v) * 16 < NNUEThreshold2 * (16 + pos.rule50_count()))
-      v = NNUE::evaluate(pos) * 5 / 4 + Tempo;
-
-  // Damp down the evaluation linearly when shuffling
-  v = v * (100 - pos.rule50_count()) / 100;
-
-  // Guarantee evaluation does not hit the tablebase range
-  v = std::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1);
-
-  return v;
 }
 
 /// trace() is like evaluate(), but instead of returning a value, it returns
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 6c8c455e..8526bc40 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -11,10 +11,6 @@
 #include "../uci.h"
 #include "../syzygy/tbprobe.h"
 
-#if defined(USE_BOOK)
-#include "../extra/book/book.h"
-#endif
-
 #include <chrono>
 #include <random>
 #include <regex>
@@ -54,11 +50,7 @@ namespace Learner
     static bool detect_draw_by_consecutive_low_score = false;
     static bool detect_draw_by_insufficient_mating_material = false;
 
-    // Use raw NNUE eval value in the Eval::evaluate().
-    // If hybrid eval is enabled, training data
-    // generation and training don't work well.
-    // https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
-    static bool use_raw_nnue_eval = true;
+    static std::vector<std::string> bookStart;
 
     // Helper class for exporting Sfen
     struct SfenWriter
@@ -313,13 +305,6 @@ namespace Learner
             int ply,
             int& random_move_c);
 
-        Value evaluate_leaf(
-            Position& pos,
-            std::vector<StateInfo, AlignedAllocator<StateInfo>>& states,
-            int ply,
-            int depth,
-            vector<Move>& pv);
-
         // Min and max depths for search during gensfen
         int search_depth_min;
         int search_depth_max;
@@ -674,69 +659,6 @@ namespace Learner
         return random_move_flag;
     }
 
-    Value MultiThinkGenSfen::evaluate_leaf(
-        Position& pos,
-        std::vector<StateInfo, AlignedAllocator<StateInfo>>& states,
-        int ply,
-        int depth,
-        vector<Move>& pv)
-    {
-        auto rootColor = pos.side_to_move();
-
-        for (auto m : pv)
-        {
-#if 1
-            // There should be no illegal move. This is as a debugging precaution.
-            if (!pos.pseudo_legal(m) || !pos.legal(m))
-            {
-                cout << "Error! : " << pos.fen() << m << endl;
-            }
-#endif
-            pos.do_move(m, states[ply++]);
-
-            // Because the difference calculation of evaluate() cannot be
-            // performed unless each node evaluate() is called!
-            // If the depth is 8 or more, it seems
-            // faster not to calculate this difference.
-#if defined(EVAL_NNUE)
-            if (depth < 8)
-            {
-                Eval::NNUE::update_eval(pos);
-            }
-#endif  // defined(EVAL_NNUE)
-        }
-
-        // Reach leaf
-        Value v;
-        if (pos.checkers())
-        {
-            // Sometime a king is checked.  An example is a case that a checkmate is
-            // found in the search.  If Eval::evaluate() is called whne a king is
-            // checked, classic eval crashes by an assertion. To avoid crashes, return
-            // VALUE_NONE and let the caller assign a value to the position.
-            v = VALUE_NONE;
-        }
-        else
-        {
-            v = Eval::evaluate(pos);
-
-            // evaluate() returns the evaluation value on the turn side, so
-            // If it's a turn different from root_color, you must invert v and return it.
-            if (rootColor != pos.side_to_move())
-            {
-                v = -v;
-            }
-        }
-
-        // Rewind the pv moves.
-        for (auto it = pv.rbegin(); it != pv.rend(); ++it)
-        {
-            pos.undo_move(*it);
-        }
-
-        return v;
-    }
-
     // thread_id = 0..Threads.size()-1
     void MultiThinkGenSfen::thread_worker(size_t thread_id)
     {
@@ -760,12 +682,7 @@ namespace Learner
             auto th = Threads[thread_id];
 
             auto& pos = th->rootPos;
-            pos.set(StartFEN, false, &si, th);
-
-#if defined(USE_BOOK)
-            // Refer to the members of BookMoveSelector defined in the search section.
-            auto& book = ::book;
-#endif
+            pos.set(bookStart[prng.rand(bookStart.size())], false, &si, th);
 
             // Vector for holding the sfens in the current simulated game.
             PSVector a_psv;
@@ -800,35 +717,6 @@ namespace Learner
                     flush_psv(result.value());
                     break;
                 }
-#if defined(USE_BOOK)
-                if ((next_move = book.probe(pos)) != MOVE_NONE)
-                {
-                    // Hit the constant track.
-                    // The move was stored in next_move.
-
-                    // Do not use the fixed phase for learning.
-                    sfens.clear();
-
-                    if (random_move_minply != -1)
-                    {
-                        // Random move is performed with a certain
-                        // probability even in the constant phase.
-                        goto RANDOM_MOVE;
-                    }
-                    else
-                    {
-                        // When -1 is specified as random_move_minply,
-                        // it points according to the standard until
-                        // it goes out of the standard.
-                        // Prepare an innumerable number of situations
-                        // that have left the constant as
-                        // ConsiderationBookMoveCount true using a huge constant
-                        // Used for purposes such as performing
-                        // a random move 5 times from there.
-                        goto DO_MOVE;
-                    }
-                }
-#endif
                 {
                     auto [search_value, search_pv] = search(pos, depth, 1, nodes);
 
@@ -916,18 +804,7 @@ namespace Learner
 
                         // Get the value of evaluate() as seen from the
                         // root color on the leaf node of the PV line.
-                        // I don't know the goodness and badness of using the
-                        // return value of search() as it is.
-                        // TODO: Consider using search value instead of evaluate_leaf.
-                        //       Maybe give it as an option.
-
-                        // Use PV moves to reach the leaf node and use the value
-                        // that evaluated() is called on that leaf node.
-                        const auto leaf_value = evaluate_leaf(pos, states, ply, depth, search_pv);
-
-                        // If for some reason the leaf node couldn't yield an eval
-                        // we fallback to search value.
-                        psv.score = leaf_value == VALUE_NONE ? search_value : leaf_value;
+                        psv.score = search_value;
 
                         psv.gamePly = ply;
 
@@ -948,9 +825,6 @@ namespace Learner
                     // Update the next move according to best search result.
                     next_move = search_pv[0];
                 }
-
-            RANDOM_MOVE:;
-
                 auto random_move = choose_random_move(pos, random_move_flag, ply, actual_random_move_count);
                 if (random_move.has_value())
                 {
@@ -962,13 +836,7 @@ namespace Learner
                     {
                         break;
                     }
-
-                    // Clear the sfens that were written before the random move.
-                    // (???) why?
-                    a_psv.clear();
                 }
-
-            DO_MOVE:;
                 pos.do_move(next_move, states[ply]);
 
                 // Call node evaluate() for each difference calculation.
@@ -1095,18 +963,10 @@ namespace Learner
                 is >> detect_draw_by_consecutive_low_score;
             else if (token == "detect_draw_by_insufficient_mating_material")
                 is >> detect_draw_by_insufficient_mating_material;
-            else if (token == "use_raw_nnue_eval")
-                is >> use_raw_nnue_eval;
             else
                 cout << "Error! : Illegal token " << token << endl;
         }
 
-#if defined(USE_GLOBAL_OPTIONS)
-        // Save it for later restore.
-        auto oldGlobalOptions = GlobalOptions;
-        GlobalOptions.use_eval_hash = use_eval_hash;
-#endif
-
         // If search depth2 is not set, leave it the same as search depth.
         if (search_depth_max == INT_MIN)
             search_depth_max = search_depth_min;
@@ -1130,15 +990,26 @@ namespace Learner
             output_file_name = output_file_name + "_" + to_hex(r.rand<uint64_t>()) + to_hex(r.rand<uint64_t>());
         }
 
+        bookStart.clear();
+        {
+          std::string line;
+          std::ifstream myfile ("3moves_v2.epd");
+          if (myfile.is_open())
+          {
+            while (getline(myfile,line))
+            {
+                bookStart.push_back(line);
+            }
+            myfile.close();
+          }
+        }
         std::cout << "gensfen : " << endl
             << "  search_depth_min = " << search_depth_min << " to " << search_depth_max << endl
             << "  nodes = " << nodes << endl
             << "  loop_max = " << loop_max << endl
             << "  eval_limit = " << eval_limit << endl
-            << "  thread_num (set by USI setoption) = " << thread_num << endl
-#if defined(USE_BOOK)
-            << "  book_moves (set by USI setoption) = " << Options["BookMoves"] << endl
-#endif
+            << "  thread_num             = " << thread_num << endl
+            << "  bookStart              = " << bookStart.size() << endl
             << "  random_move_minply     = " << random_move_minply << endl
             << "  random_move_maxply     = " << random_move_maxply << endl
             << "  random_move_count      = " << random_move_count << endl
@@ -1188,11 +1059,6 @@ namespace Learner
 
         std::cout << "gensfen finished." << endl;
 
-#if defined(USE_GLOBAL_OPTIONS)
-        // Restore Global Options.
-        GlobalOptions = oldGlobalOptions;
-#endif
-
     }
 }
 #endif
diff --git a/src/learn/gensfen2019.cpp b/src/learn/gensfen2019.cpp
deleted file mode 100644
index 01293b9c..00000000
--- a/src/learn/gensfen2019.cpp
+++ /dev/null
@@ -1 +0,0 @@
-// just a place holder
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 7021fd7f..a8724892 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -98,12 +98,6 @@ namespace Learner
     // data directly. In those cases, we set false to this variable.
     static bool convert_teacher_signal_to_winning_probability = true;
 
-    // Use raw NNUE eval value in the Eval::evaluate(). If hybrid eval is enabled, training data
-    // generation and training don't work well.
-    // https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
-    // This CANNOT be static since it's used elsewhere.
-    bool use_raw_nnue_eval = true;
-
     // Using WDL with win rate model instead of sigmoid
     static bool use_wdl = false;
 
@@ -1616,15 +1610,6 @@ namespace Learner
         uint64_t eta1_epoch = 0; // eta2 is not applied by default
         uint64_t eta2_epoch = 0; // eta3 is not applied by default
 
-#if defined(USE_GLOBAL_OPTIONS)
-    // Save it for later restore.
-        auto oldGlobalOptions = GlobalOptions;
-        // If you hit the eval hash, you can not calculate rmse etc. so turn it off.
-        GlobalOptions.use_eval_hash = false;
-        // If you hit the replacement table, pruning may occur at the previous evaluation value, so turn it off.
-        GlobalOptions.use_hash_probe = false;
-#endif
-
         // --- Function that only shuffles the teacher aspect
 
         // normal shuffle
@@ -1796,7 +1781,6 @@ namespace Learner
             else if (option == "dest_score_min_value") is >> dest_score_min_value;
             else if (option == "dest_score_max_value") is >> dest_score_max_value;
             else if (option == "convert_teacher_signal_to_winning_probability") is >> convert_teacher_signal_to_winning_probability;
-            else if (option == "use_raw_nnue_eval") is >> use_raw_nnue_eval;
 
             // Otherwise, it's a filename.
             else
@@ -2076,18 +2060,9 @@ namespace Learner
         // Save once at the end.
         learn_think.save(true);
 
-#if defined(USE_GLOBAL_OPTIONS)
-        // Restore Global Options.
-        GlobalOptions = oldGlobalOptions;
-#endif
     }
 
 
 } // namespace Learner
 
-#if defined(GENSFEN2019)
-#include "gensfen2019.cpp"
-#endif
-
-
 #endif // EVAL_LEARN
diff --git a/src/nnue/features/enpassant.cpp b/src/nnue/features/enpassant.cpp
index ea70529a..ed877322 100644
--- a/src/nnue/features/enpassant.cpp
+++ b/src/nnue/features/enpassant.cpp
@@ -23,7 +23,7 @@ namespace Eval {
         }
 
         if (perspective == BLACK) {
-          epSquare = rotate180(epSquare);
+          epSquare = flip_rank(epSquare);
         }
 
         auto file = file_of(epSquare);
diff --git a/src/nnue/features/half_kp.cpp b/src/nnue/features/half_kp.cpp
index 88e384a3..ff20a00a 100644
--- a/src/nnue/features/half_kp.cpp
+++ b/src/nnue/features/half_kp.cpp
@@ -23,9 +23,9 @@
 
 namespace Eval::NNUE::Features {
 
-  // Orient a square according to perspective (rotates by 180 for black)
+  // Orient a square according to perspective (flip rank for black)
   inline Square orient(Color perspective, Square s) {
-    return Square(int(s) ^ (bool(perspective) * 63));
+    return Square(int(s) ^ (bool(perspective) * SQ_A8));
   }
 
   // Find the index of the feature quantity from the king position and PieceSquare
diff --git a/src/nnue/features/half_relative_kp.cpp b/src/nnue/features/half_relative_kp.cpp
index 015ecb73..efe85035 100644
--- a/src/nnue/features/half_relative_kp.cpp
+++ b/src/nnue/features/half_relative_kp.cpp
@@ -11,9 +11,9 @@ namespace NNUE {
 
 namespace Features {
 
-// Orient a square according to perspective (rotates by 180 for black)
+// Orient a square according to perspective (flip rank for black)
 inline Square orient(Color perspective, Square s) {
-  return Square(int(s) ^ (bool(perspective) * 63));
+  return Square(int(s) ^ (bool(perspective) * SQ_A8));
 }
 
 // Find the index of the feature quantity from the ball position and PieceSquare
diff --git a/src/nnue/features/k.cpp b/src/nnue/features/k.cpp
index 314b1338..1bb28c53 100644
--- a/src/nnue/features/k.cpp
+++ b/src/nnue/features/k.cpp
@@ -11,9 +11,9 @@ namespace NNUE {
 
 namespace Features {
 
-// Orient a square according to perspective (rotates by 180 for black)
+// Orient a square according to perspective (flip rank for black)
 inline Square orient(Color perspective, Square s) {
-  return Square(int(s) ^ (bool(perspective) * 63));
+  return Square(int(s) ^ (bool(perspective) * SQ_A8));
 }
 
 // Index of a feature for a given king position.
diff --git a/src/nnue/features/p.cpp b/src/nnue/features/p.cpp
index b4a6faf9..7e008fdc 100644
--- a/src/nnue/features/p.cpp
+++ b/src/nnue/features/p.cpp
@@ -11,9 +11,9 @@ namespace NNUE {
 
 namespace Features {
 
-// Orient a square according to perspective (rotates by 180 for black)
+// Orient a square according to perspective (flip rank for black)
 inline Square orient(Color perspective, Square s) {
-  return Square(int(s) ^ (bool(perspective) * 63));
+  return Square(int(s) ^ (bool(perspective) * SQ_A8));
 }
 
 // Find the index of the feature quantity from the king position and PieceSquare
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index d7ffa21a..cc54378b 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -69,7 +69,7 @@
 namespace Eval::NNUE {
 
   // Version of the evaluation file
-  constexpr std::uint32_t kVersion = 0x7AF32F16u;
+  constexpr std::uint32_t kVersion = 0x7AF32F17u;
 
   // Constant used in evaluation value calculation
   constexpr int FV_SCALE = 16;
diff --git a/src/search.cpp b/src/search.cpp
index 8f258ae4..c01247bd 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -68,8 +68,6 @@ namespace {
     return Value(223 * (d - improving));
   }
 
-  bool training;
-
   // Reductions lookup table, initialized at startup
   int Reductions[MAX_MOVES]; // [depth or moveNumber]
 
@@ -195,8 +193,6 @@ void Search::init() {
 
   for (int i = 1; i < MAX_MOVES; ++i)
       Reductions[i] = int((22.0 + std::log(Threads.size())) * std::log(i));
-
-  training = Options["Training"];
 }
 
 
@@ -1011,7 +1007,7 @@ moves_loop: // When in check, search starts from here
 
       // Step 12. Pruning at shallow depth (~200 Elo)
       if (  !rootNode
-          && !(training && PvNode)
+          && !(Options["Training"] && PvNode)
           && pos.non_pawn_material(us)
           && bestValue > VALUE_TB_LOSS_IN_MAX_PLY)
       {
@@ -2070,17 +2066,6 @@ namespace Learner
         rootMoves.push_back(Search::RootMove(m));
 
       assert(!rootMoves.empty());
-
-      //#if defined(USE_GLOBAL_OPTIONS)
-      // Since the generation of the substitution table for each search thread should be managed,
-      // Increase the generation of the substitution table for this thread because it is a new search.
-            //TT.new_search(th->thread_id());
-
-            // ª If you call new_search here, it may be a loss because you can't use the previous search result.
-            // Do not do this here, but caller should do TT.new_search(th->thread_id()) for each station ...
-
-            // ¨Because we want to avoid reaching the same final diagram, use the substitution table commonly for all threads when generating teachers.
-      //#endif
     }
   }
 
diff --git a/src/tt.cpp b/src/tt.cpp
index 60a3a5f1..5e1f53d2 100644
--- a/src/tt.cpp
+++ b/src/tt.cpp
@@ -115,7 +115,9 @@ void TranspositionTable::clear() {
 /// TTEntry t2 if its replace value is greater than that of t2.
 
 TTEntry* TranspositionTable::probe(const Key key, bool& found) const {
-
+  if (Options["Training"]) {
+    return found = false, first_entry(0);
+  }
   TTEntry* const tte = first_entry(key);
   const uint16_t key16 = (uint16_t)key;  // Use the low 16 bits as key inside the cluster
 
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 0007b559..1517326e 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -82,7 +82,7 @@ void init(OptionsMap& o) {
   o["Use NNUE"]              << Option(true, on_use_NNUE);
   // The default must follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work.
-  o["EvalFile"]              << Option("nn-82215d0fd0df.nnue", on_eval_file);
+  o["EvalFile"]              << Option("nn.bin", on_eval_file);
 #ifdef EVAL_NNUE
   // When the evaluation function is loaded at the ucinewgame timing, it is necessary to convert the new evaluation function.
   // I want to hit the test eval convert command, but there is no new evaluation function

From a6013557f2cb5d13c21a2d406a02d504a643c885 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Tue, 8 Sep 2020 22:45:29 +0900
Subject: [PATCH 049/398] Removed EVAL_NNUE macro.

---
 src/Makefile                                  |   6 +-
 src/eval/evaluate_common.h                    |   3 -
 src/learn/gensfen.cpp                         |  40 +++---
 src/learn/learner.cpp                         | 133 +++---------------
 src/nnue/evaluate_nnue.cpp                    |   2 -
 src/nnue/evaluate_nnue_learner.cpp            |   4 +-
 src/nnue/evaluate_nnue_learner.h              |   4 +-
 src/nnue/features/castling_right.cpp          |   4 -
 src/nnue/features/castling_right.h            |   4 -
 src/nnue/features/enpassant.cpp               |   4 -
 src/nnue/features/enpassant.h                 |   4 -
 src/nnue/features/half_relative_kp.cpp        |   4 -
 src/nnue/features/half_relative_kp.h          |   4 -
 src/nnue/features/k.cpp                       |   4 -
 src/nnue/features/k.h                         |   4 -
 src/nnue/features/p.cpp                       |   4 -
 src/nnue/features/p.h                         |   4 -
 src/nnue/layers/sum.h                         |   4 -
 src/nnue/nnue_test_command.cpp                |   4 +-
 src/nnue/nnue_test_command.h                  |   4 +-
 src/nnue/trainer/features/factorizer.h        |   4 -
 .../trainer/features/factorizer_feature_set.h |   4 -
 .../trainer/features/factorizer_half_kp.h     |   4 -
 src/nnue/trainer/trainer.h                    |   4 +-
 src/nnue/trainer/trainer_affine_transform.h   |   4 +-
 src/nnue/trainer/trainer_clipped_relu.h       |   4 +-
 .../trainer/trainer_feature_transformer.h     |   4 +-
 src/nnue/trainer/trainer_input_slice.h        |   4 +-
 src/nnue/trainer/trainer_sum.h                |   4 +-
 src/uci.cpp                                   |   6 +-
 src/ucioption.cpp                             |   2 -
 31 files changed, 65 insertions(+), 223 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index ca851dba..a07e1251 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -903,7 +903,7 @@ icc-profile-use:
 
 learn: config-sanity
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
-	EXTRACXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	EXTRACXXFLAGS=' -DEVAL_LEARN -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
 	EXTRALDFLAGS=' $(BLASLDFLAGS) -fopenmp  ' \
 	all
 
@@ -911,7 +911,7 @@ profile-learn: net config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
-	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	LEARNCXXFLAGS=' -DEVAL_LEARN -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
 	LEARNLDFLAGS='  $(BLASLDFLAGS) -fopenmp '
 	@echo ""
 	@echo "Step 2/4. Running benchmark for pgo-build ..."
@@ -920,7 +920,7 @@ profile-learn: net config-sanity objclean profileclean
 	@echo "Step 3/4. Building optimized executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use) \
-	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	LEARNCXXFLAGS=' -DEVAL_LEARN -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
 	LEARNLDFLAGS=' $(BLASLDFLAGS) -fopenmp '
 	@echo ""
 	@echo "Step 4/4. Deleting profile data ..."
diff --git a/src/eval/evaluate_common.h b/src/eval/evaluate_common.h
index dacbd2ba..3fb161ab 100644
--- a/src/eval/evaluate_common.h
+++ b/src/eval/evaluate_common.h
@@ -3,7 +3,6 @@
 
 // A common header-like function for modern evaluation functions (EVAL_KPPT and EVAL_KPP_KKPT).
 
-#if defined(EVAL_NNUE) || defined(EVAL_LEARN)
 #include <functional>
 
 // KK file name
@@ -79,6 +78,4 @@ namespace Eval
 
 }
 
-#endif  // defined(EVAL_NNUE) || defined(EVAL_LEARN)
-
 #endif // _EVALUATE_KPPT_COMMON_H_
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index b049192e..9ae83174 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -1,45 +1,41 @@
 ﻿#if defined(EVAL_LEARN)
 
 #include "../eval/evaluate_common.h"
-
-#include "learn.h"
-#include "multi_think.h"
 #include "../misc.h"
-#include "../thread.h"
+#include "../nnue/evaluate_nnue_learner.h"
 #include "../position.h"
+#include "../syzygy/tbprobe.h"
+#include "../thread.h"
 #include "../tt.h"
 #include "../uci.h"
-#include "../syzygy/tbprobe.h"
+#include "learn.h"
+#include "multi_think.h"
 
 #if defined(USE_BOOK)
 #include "../extra/book/book.h"
 #endif
 
 #include <chrono>
-#include <random>
-#include <regex>
-#include <sstream>
-#include <fstream>
-#include <unordered_set>
-#include <iomanip>
-#include <list>
+#include <climits>
 #include <cmath>
 #include <cstring>
-#include <memory>
-#include <limits>
-#include <optional>
 #include <filesystem>
+#include <fstream>
+#include <iomanip>
+#include <limits>
+#include <list>
+#include <memory>
+#include <optional>
+#include <random>
+#include <regex>
+#include <shared_mutex>
+#include <sstream>
+#include <unordered_set>
 
 #if defined (_OPENMP)
 #include <omp.h>
 #endif
 
-#if defined(EVAL_NNUE)
-#include "../nnue/evaluate_nnue_learner.h"
-#include <climits>
-#include <shared_mutex>
-#endif
-
 using namespace std;
 
 namespace Learner
@@ -692,12 +688,10 @@ namespace Learner
             // performed unless each node evaluate() is called!
             // If the depth is 8 or more, it seems
             // faster not to calculate this difference.
-#if defined(EVAL_NNUE)
             if (depth < 8)
             {
                 Eval::NNUE::update_eval(pos);
             }
-#endif  // defined(EVAL_NNUE)
         }
 
         // Reach leaf
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index f9d188b8..358848ec 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -17,45 +17,40 @@
 // → I will not be involved in the engine because it is a problem that the GUI should assist.
 // etc..
 
-#define EVAL_LEARN
-
 #if defined(EVAL_LEARN)
 
 #include "../eval/evaluate_common.h"
-
+#include "../misc.h"
+#include "../nnue/evaluate_nnue_learner.h"
+#include "../position.h"
+#include "../syzygy/tbprobe.h"
+#include "../thread.h"
+#include "../tt.h"
+#include "../uci.h"
 #include "learn.h"
 #include "multi_think.h"
-#include "../uci.h"
-#include "../syzygy/tbprobe.h"
-#include "../misc.h"
-#include "../thread.h"
-#include "../position.h"
-#include "../tt.h"
 
 #include <chrono>
-#include <random>
-#include <regex>
-#include <sstream>
-#include <fstream>
-#include <unordered_set>
-#include <iomanip>
-#include <list>
+#include <climits>
 #include <cmath>    // std::exp(),std::pow(),std::log()
 #include <cstring>  // memcpy()
-#include <memory>
-#include <limits>
-#include <optional>
 #include <filesystem>
+#include <fstream>
+#include <iomanip>
+#include <limits>
+#include <list>
+#include <memory>
+#include <optional>
+#include <random>
+#include <regex>
+#include <shared_mutex>
+#include <sstream>
+#include <unordered_set>
 
 #if defined (_OPENMP)
 #include <omp.h>
 #endif
 
-#if defined(EVAL_NNUE)
-#include "../nnue/evaluate_nnue_learner.h"
-#include <climits>
-#include <shared_mutex>
-#endif
 
 using namespace std;
 
@@ -724,14 +719,12 @@ namespace Learner
             learn_sum_entropy = 0.0;
 #endif
 
-#if defined(EVAL_NNUE)
             newbob_scale = 1.0;
             newbob_decay = 1.0;
             newbob_num_trials = 2;
             best_loss = std::numeric_limits<double>::infinity();
             latest_loss_sum = 0.0;
             latest_loss_count = 0;
-#endif
         }
 
         virtual void thread_worker(size_t thread_id);
@@ -787,7 +780,6 @@ namespace Learner
         atomic<double> learn_sum_entropy;
 #endif
 
-#if defined(EVAL_NNUE)
         shared_timed_mutex nn_mutex;
         double newbob_scale;
         double newbob_decay;
@@ -796,7 +788,6 @@ namespace Learner
         double latest_loss_sum;
         uint64_t latest_loss_count;
         std::string best_nn_directory;
-#endif
 
         uint64_t eval_save_interval;
         uint64_t loss_output_interval;
@@ -844,13 +835,10 @@ namespace Learner
         // It doesn't matter if you have disabled the substitution table.
         TT.new_search();
 
-
-#if defined(EVAL_NNUE)
         std::cout << "PROGRESS: " << now_string() << ", ";
         std::cout << sr.total_done << " sfens";
         std::cout << ", iteration " << epoch;
         std::cout << ", eta = " << Eval::get_eta() << ", ";
-#endif
 
 #if !defined(LOSS_FUNCTION_IS_ELMO_METHOD)
         double sum_error = 0;
@@ -1009,10 +997,8 @@ namespace Learner
 #endif
 
 #if defined(LOSS_FUNCTION_IS_ELMO_METHOD)
-#if defined(EVAL_NNUE)
         latest_loss_sum += test_sum_cross_entropy - test_sum_entropy;
         latest_loss_count += sr.sfen_for_mse.size();
-#endif
 
         // learn_cross_entropy may be called train cross 
         // entropy in the world of machine learning,
@@ -1074,14 +1060,10 @@ namespace Learner
             // display mse (this is sometimes done only for thread 0)
             // Immediately after being read from the file...
 
-#if defined(EVAL_NNUE)
         // Lock the evaluation function so that it is not used during updating.
             shared_lock<shared_timed_mutex> read_lock(nn_mutex, defer_lock);
             if (sr.next_update_weights <= sr.total_done ||
                 (thread_id != 0 && !read_lock.try_lock()))
-#else
-            if (sr.next_update_weights <= sr.total_done)
-#endif
             {
                 if (thread_id != 0)
                 {
@@ -1105,18 +1087,6 @@ namespace Learner
                         continue;
                     }
 
-#if !defined(EVAL_NNUE)
-                    // Output the current time. Output every time.
-                    std::cout << sr.total_done << " sfens , at " << now_string() << std::endl;
-
-                    // Reflect the gradient in the weight array at this timing. 
-                    // The calculation of the gradient is just right for 
-                    // each 1M phase in terms of mini-batch.
-                    Eval::update_weights(epoch, freeze);
-
-                    // Display epoch and current eta for debugging.
-                    std::cout << "epoch = " << epoch << " , eta = " << Eval::get_eta() << std::endl;
-#else
                     {
                         // update parameters
 
@@ -1124,7 +1094,7 @@ namespace Learner
                         lock_guard<shared_timed_mutex> write_lock(nn_mutex);
                         Eval::NNUE::UpdateParameters(epoch);
                     }
-#endif
+
                     ++epoch;
 
                     // However, the elapsed time during update_weights() and calc_rmse() is ignored.
@@ -1156,9 +1126,7 @@ namespace Learner
                         // loss calculation
                         calc_loss(thread_id, done);
 
-#if defined(EVAL_NNUE)
                         Eval::NNUE::CheckHealth();
-#endif
 
                         // Make a note of how far you have totaled.
                         sr.last_done = sr.total_done;
@@ -1216,25 +1184,6 @@ namespace Learner
                 goto RETRY_READ;
             }
 
-#if !defined(EVAL_NNUE)
-            if (skip_duplicated_positions_in_training)
-            {
-                const auto key = pos.key();
-
-                // Exclude the phase used for rmse calculation.
-                if (sr.is_for_rmse(key))
-                    goto RETRY_READ;
-
-                // Exclude the most recently used aspect.
-                const auto hash_index = size_t(key & (sr.READ_SFEN_HASH_SIZE - 1));
-                const auto key2 = sr.hash[hash_index];
-                if (key == key2)
-                    goto RETRY_READ;
-
-                sr.hash[hash_index] = key; // Replace with the current key.
-            }
-#endif
-
             // There is a possibility that all the pieces are blocked and stuck.
             // Also, the declaration win phase is excluded from 
             // learning because you cannot go to leaf with PV moves.
@@ -1326,25 +1275,9 @@ namespace Learner
                 learn_sum_entropy += learn_entropy;
 #endif
 
-#if !defined(EVAL_NNUE)
-                // Slope
-                double dj_dw = calc_grad(deep_value, shallow_value, ps);
-
-                // Add jd_dw as the gradient (∂J/∂Wj) for the 
-                // feature vector currently appearing in the leaf node.
-
-                // If it is not PV termination, apply a discount rate.
-                if (discount_rate != 0 && ply != (int)pv.size())
-                    dj_dw *= discount_rate;
-
-                // Since we have reached leaf, add the gradient to the features that appear in this phase.
-                // Update based on gradient later.
-                Eval::add_grad(pos, rootColor, dj_dw, freeze);
-#else
                 const double example_weight =
                     (discount_rate != 0 && ply != (int)pv.size()) ? discount_rate : 1.0;
                 Eval::NNUE::AddExample(pos, rootColor, ps, example_weight);
-#endif
 
                 // Since the processing is completed, the counter of the processed number is incremented
                 sr.total_done++;
@@ -1425,7 +1358,6 @@ namespace Learner
             const std::string dir_name = std::to_string(dir_number++);
             Eval::save_eval(dir_name);
 
-#if defined(EVAL_NNUE)
             if (newbob_decay != 1.0 && latest_loss_count > 0) {
                 static int trials = newbob_num_trials;
                 const double latest_loss = latest_loss_sum / latest_loss_count;
@@ -1470,7 +1402,6 @@ namespace Learner
                     return true;
                 }
             }
-#endif
         }
         return false;
     }
@@ -1817,12 +1748,10 @@ namespace Learner
         // Optional item that does not let you learn KK/KKP/KPP/KPPP
         array<bool, 4> freeze = {};
 
-#if defined(EVAL_NNUE)
         uint64_t nn_batch_size = 1000;
         double newbob_decay = 1.0;
         int newbob_num_trials = 2;
         string nn_options;
-#endif
 
         uint64_t eval_save_interval = LEARN_EVAL_SAVE_INTERVAL;
         uint64_t loss_output_interval = 0;
@@ -1922,12 +1851,11 @@ namespace Learner
             else if (option == "save_only_once") save_only_once = true;
             else if (option == "no_shuffle") no_shuffle = true;
 
-#if defined(EVAL_NNUE)
             else if (option == "nn_batch_size") is >> nn_batch_size;
             else if (option == "newbob_decay") is >> newbob_decay;
             else if (option == "newbob_num_trials") is >> newbob_num_trials;
             else if (option == "nn_options") is >> nn_options;
-#endif
+
             else if (option == "eval_save_interval") is >> eval_save_interval;
             else if (option == "loss_output_interval") is >> loss_output_interval;
             else if (option == "mirror_percentage") is >> mirror_percentage;
@@ -2074,23 +2002,18 @@ namespace Learner
             }
         }
 
-#if !defined(EVAL_NNUE)
-        cout << "Gradient Method   : " << LEARN_UPDATE << endl;
-#endif
         cout << "Loss Function     : " << LOSS_FUNCTION << endl;
         cout << "mini-batch size   : " << mini_batch_size << endl;
 
-#if defined(EVAL_NNUE)
         cout << "nn_batch_size     : " << nn_batch_size << endl;
         cout << "nn_options        : " << nn_options << endl;
-#endif
+
         cout << "learning rate     : " << eta1 << " , " << eta2 << " , " << eta3 << endl;
         cout << "eta_epoch         : " << eta1_epoch << " , " << eta2_epoch << endl;
         cout << "use_draw_games_in_training : " << use_draw_games_in_training << endl;
         cout << "use_draw_games_in_validation : " << use_draw_games_in_validation << endl;
         cout << "skip_duplicated_positions_in_training : " << skip_duplicated_positions_in_training << endl;
 
-#if defined(EVAL_NNUE)
         if (newbob_decay != 1.0) {
             cout << "scheduling        : newbob with decay = " << newbob_decay
                 << ", " << newbob_num_trials << " trials" << endl;
@@ -2098,7 +2021,6 @@ namespace Learner
         else {
             cout << "scheduling        : default" << endl;
         }
-#endif
 
         cout << "discount rate     : " << discount_rate << endl;
 
@@ -2133,12 +2055,6 @@ namespace Learner
         // Read evaluation function parameters
         Eval::init_NNUE();
 
-#if !defined(EVAL_NNUE)
-        cout << "init_grad.." << endl;
-
-        // Initialize gradient array of merit function parameters
-        Eval::init_grad(eta1, eta1_epoch, eta2, eta2_epoch, eta3);
-#else
         cout << "init_training.." << endl;
         Eval::NNUE::InitializeTraining(eta1, eta1_epoch, eta2, eta2_epoch, eta3);
         Eval::NNUE::SetBatchSize(nn_batch_size);
@@ -2146,7 +2062,6 @@ namespace Learner
         if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
             learn_think.best_nn_directory = std::string(Options["EvalDir"]);
         }
-#endif
 
 #if 0
         // A test to give a gradient of 1.0 to the initial stage of Hirate.
@@ -2170,11 +2085,9 @@ namespace Learner
         learn_think.freeze = freeze;
         learn_think.reduction_gameply = reduction_gameply;
 
-#if defined(EVAL_NNUE)
         learn_think.newbob_scale = 1.0;
         learn_think.newbob_decay = newbob_decay;
         learn_think.newbob_num_trials = newbob_num_trials;
-#endif
 
         learn_think.eval_save_interval = eval_save_interval;
         learn_think.loss_output_interval = loss_output_interval;
@@ -2199,7 +2112,6 @@ namespace Learner
         // Calculate rmse once at this point (timing of 0 sfen)
         // sr.calc_rmse();
 
-#if defined(EVAL_NNUE)
         if (newbob_decay != 1.0) {
             learn_think.calc_loss(0, -1);
             learn_think.best_loss = learn_think.latest_loss_sum / learn_think.latest_loss_count;
@@ -2207,7 +2119,6 @@ namespace Learner
             learn_think.latest_loss_count = 0;
             cout << "initial loss: " << learn_think.best_loss << endl;
         }
-#endif
 
         // -----------------------------------
         // start learning evaluation function parameters
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index 5c8cee71..a2845c96 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -184,13 +184,11 @@ namespace Eval::NNUE {
 
     Initialize();
 
-#if defined(EVAL_NNUE)
     if (Options["SkipLoadingEval"])
     {
       std::cout << "info string SkipLoadingEval set to true, Net not loaded!" << std::endl;
       return true;
     }
-#endif
 
     fileName = evalFile;
 
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 13d9d578..7be06832 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -1,6 +1,6 @@
 ﻿// Code for learning NNUE evaluation function
 
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#if defined(EVAL_LEARN)
 
 #include <random>
 #include <fstream>
@@ -229,4 +229,4 @@ double get_eta() {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#endif  // defined(EVAL_LEARN)
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 1e4a463e..0e5fbcd2 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -3,7 +3,7 @@
 #ifndef _EVALUATE_NNUE_LEARNER_H_
 #define _EVALUATE_NNUE_LEARNER_H_
 
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#if defined(EVAL_LEARN)
 
 #include "../learn/learn.h"
 
@@ -41,6 +41,6 @@ void CheckHealth();
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#endif  // defined(EVAL_LEARN)
 
 #endif
diff --git a/src/nnue/features/castling_right.cpp b/src/nnue/features/castling_right.cpp
index ee7b6576..47fbd986 100644
--- a/src/nnue/features/castling_right.cpp
+++ b/src/nnue/features/castling_right.cpp
@@ -1,7 +1,5 @@
 //Definition of input feature quantity K of NNUE evaluation function
 
-#if defined(EVAL_NNUE)
-
 #include "castling_right.h"
 #include "index_list.h"
 
@@ -69,5 +67,3 @@ namespace Eval {
   }  // namespace NNUE
 
 }  // namespace Eval
-
-#endif  // defined(EVAL_NNUE)
diff --git a/src/nnue/features/castling_right.h b/src/nnue/features/castling_right.h
index 3af5b074..27074080 100644
--- a/src/nnue/features/castling_right.h
+++ b/src/nnue/features/castling_right.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_FEATURES_CASTLING_RIGHT_H_
 #define _NNUE_FEATURES_CASTLING_RIGHT_H_
 
-#if defined(EVAL_NNUE)
-
 #include "../../evaluate.h"
 #include "features_common.h"
 
@@ -43,6 +41,4 @@ namespace Eval {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_NNUE)
-
 #endif
diff --git a/src/nnue/features/enpassant.cpp b/src/nnue/features/enpassant.cpp
index ea70529a..77bc936e 100644
--- a/src/nnue/features/enpassant.cpp
+++ b/src/nnue/features/enpassant.cpp
@@ -1,7 +1,5 @@
 //Definition of input feature quantity K of NNUE evaluation function
 
-#if defined(EVAL_NNUE)
-
 #include "enpassant.h"
 #include "index_list.h"
 
@@ -43,5 +41,3 @@ namespace Eval {
   }  // namespace NNUE
 
 }  // namespace Eval
-
-#endif  // defined(EVAL_NNUE)
diff --git a/src/nnue/features/enpassant.h b/src/nnue/features/enpassant.h
index f77f9c4f..70a8eb5a 100644
--- a/src/nnue/features/enpassant.h
+++ b/src/nnue/features/enpassant.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_FEATURES_ENPASSANT_H_
 #define _NNUE_FEATURES_ENPASSANT_H_
 
-#if defined(EVAL_NNUE)
-
 #include "../../evaluate.h"
 #include "features_common.h"
 
@@ -43,6 +41,4 @@ namespace Eval {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_NNUE)
-
 #endif
diff --git a/src/nnue/features/half_relative_kp.cpp b/src/nnue/features/half_relative_kp.cpp
index 015ecb73..597d65fb 100644
--- a/src/nnue/features/half_relative_kp.cpp
+++ b/src/nnue/features/half_relative_kp.cpp
@@ -1,7 +1,5 @@
 ﻿//Definition of input features HalfRelativeKP of NNUE evaluation function
 
-#if defined(EVAL_NNUE)
-
 #include "half_relative_kp.h"
 #include "index_list.h"
 
@@ -74,5 +72,3 @@ template class HalfRelativeKP<Side::kEnemy>;
 }  // namespace NNUE
 
 }  // namespace Eval
-
-#endif  // defined(EVAL_NNUE)
diff --git a/src/nnue/features/half_relative_kp.h b/src/nnue/features/half_relative_kp.h
index 2d4182e4..1b384c14 100644
--- a/src/nnue/features/half_relative_kp.h
+++ b/src/nnue/features/half_relative_kp.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_FEATURES_HALF_RELATIVE_KP_H_
 #define _NNUE_FEATURES_HALF_RELATIVE_KP_H_
 
-#if defined(EVAL_NNUE)
-
 #include "../../evaluate.h"
 #include "features_common.h"
 
@@ -60,6 +58,4 @@ class HalfRelativeKP {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_NNUE)
-
 #endif
diff --git a/src/nnue/features/k.cpp b/src/nnue/features/k.cpp
index 314b1338..38ec9997 100644
--- a/src/nnue/features/k.cpp
+++ b/src/nnue/features/k.cpp
@@ -1,7 +1,5 @@
 ﻿//Definition of input feature quantity K of NNUE evaluation function
 
-#if defined(EVAL_NNUE)
-
 #include "k.h"
 #include "index_list.h"
 
@@ -54,5 +52,3 @@ void K::AppendChangedIndices(
 }  // namespace NNUE
 
 }  // namespace Eval
-
-#endif  // defined(EVAL_NNUE)
diff --git a/src/nnue/features/k.h b/src/nnue/features/k.h
index 0c394f4e..9a0be4bb 100644
--- a/src/nnue/features/k.h
+++ b/src/nnue/features/k.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_FEATURES_K_H_
 #define _NNUE_FEATURES_K_H_
 
-#if defined(EVAL_NNUE)
-
 #include "../../evaluate.h"
 #include "features_common.h"
 
@@ -47,6 +45,4 @@ private:
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_NNUE)
-
 #endif
diff --git a/src/nnue/features/p.cpp b/src/nnue/features/p.cpp
index b4a6faf9..0c1b7d50 100644
--- a/src/nnue/features/p.cpp
+++ b/src/nnue/features/p.cpp
@@ -1,7 +1,5 @@
 ﻿//Definition of input feature P of NNUE evaluation function
 
-#if defined(EVAL_NNUE)
-
 #include "p.h"
 #include "index_list.h"
 
@@ -52,5 +50,3 @@ void P::AppendChangedIndices(
 }  // namespace NNUE
 
 }  // namespace Eval
-
-#endif  // defined(EVAL_NNUE)
diff --git a/src/nnue/features/p.h b/src/nnue/features/p.h
index b3d4191e..07d88952 100644
--- a/src/nnue/features/p.h
+++ b/src/nnue/features/p.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_FEATURES_P_H_
 #define _NNUE_FEATURES_P_H_
 
-#if defined(EVAL_NNUE)
-
 #include "../../evaluate.h"
 #include "features_common.h"
 
@@ -47,6 +45,4 @@ class P {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_NNUE)
-
 #endif
diff --git a/src/nnue/layers/sum.h b/src/nnue/layers/sum.h
index d8c7bf93..419ced89 100644
--- a/src/nnue/layers/sum.h
+++ b/src/nnue/layers/sum.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_LAYERS_SUM_H_
 #define _NNUE_LAYERS_SUM_H_
 
-#if defined(EVAL_NNUE)
-
 #include "../nnue_common.h"
 
 namespace Eval {
@@ -158,6 +156,4 @@ class Sum<PreviousLayer> {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_NNUE)
-
 #endif
diff --git a/src/nnue/nnue_test_command.cpp b/src/nnue/nnue_test_command.cpp
index 311c5ded..b8346693 100644
--- a/src/nnue/nnue_test_command.cpp
+++ b/src/nnue/nnue_test_command.cpp
@@ -1,6 +1,6 @@
 ﻿// USI extended command for NNUE evaluation function
 
-#if defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+#if defined(ENABLE_TEST_CMD)
 
 #include "../thread.h"
 #include "../uci.h"
@@ -198,4 +198,4 @@ void TestCommand(Position& pos, std::istream& stream) {
 
 }  // namespace Eval
 
-#endif  // defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+#endif  // defined(ENABLE_TEST_CMD)
diff --git a/src/nnue/nnue_test_command.h b/src/nnue/nnue_test_command.h
index 570ef01b..30854fd2 100644
--- a/src/nnue/nnue_test_command.h
+++ b/src/nnue/nnue_test_command.h
@@ -3,7 +3,7 @@
 #ifndef _NNUE_TEST_COMMAND_H_
 #define _NNUE_TEST_COMMAND_H_
 
-#if defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+#if defined(ENABLE_TEST_CMD)
 
 namespace Eval {
 
@@ -16,6 +16,6 @@ void TestCommand(Position& pos, std::istream& stream);
 
 }  // namespace Eval
 
-#endif  // defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+#endif  // defined(ENABLE_TEST_CMD)
 
 #endif
diff --git a/src/nnue/trainer/features/factorizer.h b/src/nnue/trainer/features/factorizer.h
index 148ee8ec..43950de2 100644
--- a/src/nnue/trainer/features/factorizer.h
+++ b/src/nnue/trainer/features/factorizer.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_H_
 
-#if defined(EVAL_NNUE)
-
 #include "../../nnue_common.h"
 #include "../trainer.h"
 
@@ -105,6 +103,4 @@ constexpr std::size_t GetArrayLength(const T (&/*array*/)[N]) {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_NNUE)
-
 #endif
diff --git a/src/nnue/trainer/features/factorizer_feature_set.h b/src/nnue/trainer/features/factorizer_feature_set.h
index af524719..caf6608b 100644
--- a/src/nnue/trainer/features/factorizer_feature_set.h
+++ b/src/nnue/trainer/features/factorizer_feature_set.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
 
-#if defined(EVAL_NNUE)
-
 #include "../../features/feature_set.h"
 #include "factorizer.h"
 
@@ -99,6 +97,4 @@ public:
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_NNUE)
-
 #endif
diff --git a/src/nnue/trainer/features/factorizer_half_kp.h b/src/nnue/trainer/features/factorizer_half_kp.h
index 955894e8..70a6acca 100644
--- a/src/nnue/trainer/features/factorizer_half_kp.h
+++ b/src/nnue/trainer/features/factorizer_half_kp.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
 
-#if defined(EVAL_NNUE)
-
 #include "../../features/half_kp.h"
 #include "../../features/p.h"
 #include "../../features/half_relative_kp.h"
@@ -98,6 +96,4 @@ constexpr FeatureProperties Factorizer<HalfKP<AssociatedKing>>::kProperties[];
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_NNUE)
-
 #endif
diff --git a/src/nnue/trainer/trainer.h b/src/nnue/trainer/trainer.h
index 4b467041..d526557a 100644
--- a/src/nnue/trainer/trainer.h
+++ b/src/nnue/trainer/trainer.h
@@ -3,7 +3,7 @@
 #ifndef _NNUE_TRAINER_H_
 #define _NNUE_TRAINER_H_
 
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#if defined(EVAL_LEARN)
 
 #include "../nnue_common.h"
 #include "../features/index_list.h"
@@ -120,6 +120,6 @@ std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments) {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#endif  // defined(EVAL_LEARN)
 
 #endif
diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index da11ca29..4b5ddee6 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -3,7 +3,7 @@
 #ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_
 #define _NNUE_TRAINER_AFFINE_TRANSFORM_H_
 
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#if defined(EVAL_LEARN)
 
 #include "../../learn/learn.h"
 #include "../layers/affine_transform.h"
@@ -296,6 +296,6 @@ class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#endif  // defined(EVAL_LEARN)
 
 #endif
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index bd59a02d..72575bf8 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -3,7 +3,7 @@
 #ifndef _NNUE_TRAINER_CLIPPED_RELU_H_
 #define _NNUE_TRAINER_CLIPPED_RELU_H_
 
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#if defined(EVAL_LEARN)
 
 #include "../../learn/learn.h"
 #include "../layers/clipped_relu.h"
@@ -137,6 +137,6 @@ class Trainer<Layers::ClippedReLU<PreviousLayer>> {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#endif  // defined(EVAL_LEARN)
 
 #endif
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 97dbeff4..6b94d952 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -3,7 +3,7 @@
 #ifndef _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
 #define _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
 
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#if defined(EVAL_LEARN)
 
 #include "../../learn/learn.h"
 #include "../nnue_feature_transformer.h"
@@ -372,6 +372,6 @@ class Trainer<FeatureTransformer> {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#endif  // defined(EVAL_LEARN)
 
 #endif
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index 7d9e76c3..b6d6635b 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -3,7 +3,7 @@
 #ifndef _NNUE_TRAINER_INPUT_SLICE_H_
 #define _NNUE_TRAINER_INPUT_SLICE_H_
 
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#if defined(EVAL_LEARN)
 
 #include "../../learn/learn.h"
 #include "../layers/input_slice.h"
@@ -246,6 +246,6 @@ class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#endif  // defined(EVAL_LEARN)
 
 #endif
diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h
index f7bf3b3d..0b7abe36 100644
--- a/src/nnue/trainer/trainer_sum.h
+++ b/src/nnue/trainer/trainer_sum.h
@@ -3,7 +3,7 @@
 #ifndef _NNUE_TRAINER_SUM_H_
 #define _NNUE_TRAINER_SUM_H_
 
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#if defined(EVAL_LEARN)
 
 #include "../../learn/learn.h"
 #include "../layers/sum.h"
@@ -185,6 +185,6 @@ class Trainer<Layers::Sum<PreviousLayer>> {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#endif  // defined(EVAL_LEARN)
 
 #endif
diff --git a/src/uci.cpp b/src/uci.cpp
index d6745d19..5be2afbb 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -32,7 +32,7 @@
 #include "uci.h"
 #include "syzygy/tbprobe.h"
 
-#if defined(EVAL_NNUE) && defined(ENABLE_TEST_CMD)
+#if defined(ENABLE_TEST_CMD)
 #include "nnue/nnue_test_command.h"
 #endif
 
@@ -67,7 +67,7 @@ namespace Learner
 }
 #endif
 
-#if defined(EVAL_NNUE) && defined(ENABLE_TEST_CMD)
+#if defined(ENABLE_TEST_CMD)
 void test_cmd(Position& pos, istringstream& is)
 {
     // Initialize as it may be searched.
@@ -373,7 +373,7 @@ void UCI::loop(int argc, char* argv[]) {
 
 #endif
 
-#if defined(EVAL_NNUE) && defined(ENABLE_TEST_CMD)
+#if defined(ENABLE_TEST_CMD)
       // test command
       else if (token == "test") test_cmd(pos, is);
 #endif
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 0007b559..4f9fab5e 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -83,7 +83,6 @@ void init(OptionsMap& o) {
   // The default must follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work.
   o["EvalFile"]              << Option("nn-82215d0fd0df.nnue", on_eval_file);
-#ifdef EVAL_NNUE
   // When the evaluation function is loaded at the ucinewgame timing, it is necessary to convert the new evaluation function.
   // I want to hit the test eval convert command, but there is no new evaluation function
   // It ends abnormally before executing this command.
@@ -92,7 +91,6 @@ void init(OptionsMap& o) {
   o["SkipLoadingEval"]       << Option(false);
   // how many moves to use a fixed move
   // o["BookMoves"] << Option(16, 0, 10000);
-#endif
 #if defined(EVAL_LEARN)
   // When learning the evaluation function, you can change the folder to save the evaluation function.
   // Evalsave by default. This folder shall be prepared in advance.

From e6a6ba52213290d0996913ec6367a8364c5199ec Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Tue, 8 Sep 2020 22:49:55 +0900
Subject: [PATCH 050/398] Removed USE_BOOK macro.

---
 src/learn/gensfen.cpp | 40 ----------------------------------------
 src/learn/learner.cpp |  5 -----
 2 files changed, 45 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 9ae83174..589d9559 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -11,10 +11,6 @@
 #include "learn.h"
 #include "multi_think.h"
 
-#if defined(USE_BOOK)
-#include "../extra/book/book.h"
-#endif
-
 #include <chrono>
 #include <climits>
 #include <cmath>
@@ -750,11 +746,6 @@ namespace Learner
             auto& pos = th->rootPos;
             pos.set(StartFEN, false, &si, th);
 
-#if defined(USE_BOOK)
-            // Refer to the members of BookMoveSelector defined in the search section.
-            auto& book = ::book;
-#endif
-
             // Vector for holding the sfens in the current simulated game.
             PSVector a_psv;
             a_psv.reserve(write_maxply + MAX_PLY);
@@ -788,35 +779,7 @@ namespace Learner
                     flush_psv(result.value());
                     break;
                 }
-#if defined(USE_BOOK)
-                if ((next_move = book.probe(pos)) != MOVE_NONE)
-                {
-                    // Hit the constant track.
-                    // The move was stored in next_move.
 
-                    // Do not use the fixed phase for learning.
-                    sfens.clear();
-
-                    if (random_move_minply != -1)
-                    {
-                        // Random move is performed with a certain
-                        // probability even in the constant phase.
-                        goto RANDOM_MOVE;
-                    }
-                    else
-                    {
-                        // When -1 is specified as random_move_minply,
-                        // it points according to the standard until
-                        // it goes out of the standard.
-                        // Prepare an innumerable number of situations
-                        // that have left the constant as
-                        // ConsiderationBookMoveCount true using a huge constant
-                        // Used for purposes such as performing
-                        // a random move 5 times from there.
-                        goto DO_MOVE;
-                    }
-                }
-#endif
                 {
                     auto [search_value, search_pv] = search(pos, depth, 1, nodes);
 
@@ -1124,9 +1087,6 @@ namespace Learner
             << "  loop_max = " << loop_max << endl
             << "  eval_limit = " << eval_limit << endl
             << "  thread_num (set by USI setoption) = " << thread_num << endl
-#if defined(USE_BOOK)
-            << "  book_moves (set by USI setoption) = " << Options["BookMoves"] << endl
-#endif
             << "  random_move_minply     = " << random_move_minply << endl
             << "  random_move_maxply     = " << random_move_maxply << endl
             << "  random_move_count      = " << random_move_count << endl
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 358848ec..e7f021fe 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -54,11 +54,6 @@
 
 using namespace std;
 
-#if defined(USE_BOOK)
-// This is defined in the search section.
-extern Book::BookMoveSelector book;
-#endif
-
 template <typename T>
 T operator +=(std::atomic<T>& x, const T rhs)
 {

From 21cfead52c2a77abc4e9eed21739ccc3df9826c0 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Tue, 8 Sep 2020 22:52:46 +0900
Subject: [PATCH 051/398] Removed unused OMP_ macro.

---
 src/learn/convert.cpp        | 4 ----
 src/learn/gensfen.cpp        | 4 ----
 src/learn/learning_tools.cpp | 3 ---
 3 files changed, 11 deletions(-)

diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
index 9bd9548d..d07fc00c 100644
--- a/src/learn/convert.cpp
+++ b/src/learn/convert.cpp
@@ -27,10 +27,6 @@
 #include <regex>
 #include <filesystem>
 
-#if defined (_OPENMP)
-#include <omp.h>
-#endif
-
 using namespace std;
 
 namespace Learner
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 589d9559..65e64177 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -28,10 +28,6 @@
 #include <sstream>
 #include <unordered_set>
 
-#if defined (_OPENMP)
-#include <omp.h>
-#endif
-
 using namespace std;
 
 namespace Learner
diff --git a/src/learn/learning_tools.cpp b/src/learn/learning_tools.cpp
index de6da9c5..eca11c47 100644
--- a/src/learn/learning_tools.cpp
+++ b/src/learn/learning_tools.cpp
@@ -2,9 +2,6 @@
 
 #if defined (EVAL_LEARN)
 
-#if defined(_OPENMP)
-#include <omp.h>
-#endif
 #include "../misc.h"
 
 using namespace Eval;

From 1d00d002412e11505430a9da32297b81e11b6801 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Tue, 8 Sep 2020 22:59:57 +0900
Subject: [PATCH 052/398] Removed ENABLE_TEST_CMD macro.

---
 src/Makefile                   |  6 +++---
 src/nnue/nnue_test_command.cpp |  4 ----
 src/nnue/nnue_test_command.h   |  4 ----
 src/uci.cpp                    | 11 ++---------
 4 files changed, 5 insertions(+), 20 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index a07e1251..49c6c1b3 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -903,7 +903,7 @@ icc-profile-use:
 
 learn: config-sanity
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
-	EXTRACXXFLAGS=' -DEVAL_LEARN -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	EXTRACXXFLAGS=' -DEVAL_LEARN -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
 	EXTRALDFLAGS=' $(BLASLDFLAGS) -fopenmp  ' \
 	all
 
@@ -911,7 +911,7 @@ profile-learn: net config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
-	LEARNCXXFLAGS=' -DEVAL_LEARN -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	LEARNCXXFLAGS=' -DEVAL_LEARN -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
 	LEARNLDFLAGS='  $(BLASLDFLAGS) -fopenmp '
 	@echo ""
 	@echo "Step 2/4. Running benchmark for pgo-build ..."
@@ -920,7 +920,7 @@ profile-learn: net config-sanity objclean profileclean
 	@echo "Step 3/4. Building optimized executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use) \
-	LEARNCXXFLAGS=' -DEVAL_LEARN -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	LEARNCXXFLAGS=' -DEVAL_LEARN -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
 	LEARNLDFLAGS=' $(BLASLDFLAGS) -fopenmp '
 	@echo ""
 	@echo "Step 4/4. Deleting profile data ..."
diff --git a/src/nnue/nnue_test_command.cpp b/src/nnue/nnue_test_command.cpp
index b8346693..c3a53c7d 100644
--- a/src/nnue/nnue_test_command.cpp
+++ b/src/nnue/nnue_test_command.cpp
@@ -1,7 +1,5 @@
 ﻿// USI extended command for NNUE evaluation function
 
-#if defined(ENABLE_TEST_CMD)
-
 #include "../thread.h"
 #include "../uci.h"
 #include "evaluate_nnue.h"
@@ -197,5 +195,3 @@ void TestCommand(Position& pos, std::istream& stream) {
 }  // namespace NNUE
 
 }  // namespace Eval
-
-#endif  // defined(ENABLE_TEST_CMD)
diff --git a/src/nnue/nnue_test_command.h b/src/nnue/nnue_test_command.h
index 30854fd2..75d33e82 100644
--- a/src/nnue/nnue_test_command.h
+++ b/src/nnue/nnue_test_command.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TEST_COMMAND_H_
 #define _NNUE_TEST_COMMAND_H_
 
-#if defined(ENABLE_TEST_CMD)
-
 namespace Eval {
 
 namespace NNUE {
@@ -16,6 +14,4 @@ void TestCommand(Position& pos, std::istream& stream);
 
 }  // namespace Eval
 
-#endif  // defined(ENABLE_TEST_CMD)
-
 #endif
diff --git a/src/uci.cpp b/src/uci.cpp
index 5be2afbb..1454e4e0 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -24,17 +24,14 @@
 
 #include "evaluate.h"
 #include "movegen.h"
+#include "nnue/nnue_test_command.h"
 #include "position.h"
 #include "search.h"
+#include "syzygy/tbprobe.h"
 #include "thread.h"
 #include "timeman.h"
 #include "tt.h"
 #include "uci.h"
-#include "syzygy/tbprobe.h"
-
-#if defined(ENABLE_TEST_CMD)
-#include "nnue/nnue_test_command.h"
-#endif
 
 using namespace std;
 
@@ -67,7 +64,6 @@ namespace Learner
 }
 #endif
 
-#if defined(ENABLE_TEST_CMD)
 void test_cmd(Position& pos, istringstream& is)
 {
     // Initialize as it may be searched.
@@ -78,7 +74,6 @@ void test_cmd(Position& pos, istringstream& is)
 
     if (param == "nnue") Eval::NNUE::TestCommand(pos, is);
 }
-#endif
 
 namespace {
 
@@ -373,10 +368,8 @@ void UCI::loop(int argc, char* argv[]) {
 
 #endif
 
-#if defined(ENABLE_TEST_CMD)
       // test command
       else if (token == "test") test_cmd(pos, is);
-#endif
       else
           sync_cout << "Unknown command: " << cmd << sync_endl;
 

From 458771a18199d4f64f4190521bea4aa91460c462 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Tue, 8 Sep 2020 23:02:31 +0900
Subject: [PATCH 053/398] Removed GENSFEN2019 macro.

---
 src/uci.cpp | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/src/uci.cpp b/src/uci.cpp
index 1454e4e0..6675f2e0 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -50,11 +50,6 @@ namespace Learner
   // Learning from the generated game record
   void learn(Position& pos, istringstream& is);
 
-#if defined(GENSFEN2019)
-  // Automatic generation command of teacher phase under development
-  void gen_sfen2019(Position& pos, istringstream& is);
-#endif
-
   // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
   typedef std::pair<Value, std::vector<Move> > ValueAndPV;
 
@@ -358,10 +353,6 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "gensfen") Learner::gen_sfen(pos, is);
       else if (token == "learn") Learner::learn(pos, is);
 
-#if defined (GENSFEN2019)
-	  // Command to generate teacher phase under development
-      else if (token == "gensfen2019") Learner::gen_sfen2019(pos, is);
-#endif
       // Command to call qsearch(),search() directly for testing
       else if (token == "qsearch") qsearch_cmd(pos);
       else if (token == "search") search_cmd(pos, is);

From 04a9a951b8611d6f176d49c9edd24d22ec5ba457 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Tue, 8 Sep 2020 23:08:39 +0900
Subject: [PATCH 054/398] Removed "#if 0" and "#if 1".

---
 src/learn/gensfen.cpp | 24 +++------------------
 src/learn/learner.cpp | 50 -------------------------------------------
 2 files changed, 3 insertions(+), 71 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 65e64177..ec3de570 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -166,7 +166,7 @@ namespace Learner
                         output_file_stream.write(reinterpret_cast<const char*>(buf->data()), sizeof(PackedSfenValue) * buf->size());
 
                         sfen_write_count += buf->size();
-#if 1
+
                         // Add the processed number here, and if it exceeds save_every,
                         // change the file name and reset this counter.
                         sfen_write_count_current_file += buf->size();
@@ -186,7 +186,7 @@ namespace Learner
                             output_file_stream.open(new_filename, ios::out | ios::binary | ios::app);
                             cout << endl << "output sfen file = " << new_filename << endl;
                         }
-#endif
+
                         // Output '.' every time when writing a game record.
                         std::cout << ".";
 
@@ -519,10 +519,6 @@ namespace Learner
         {
             // Write out one sfen.
             sfen_writer.write(thread_id, *it);
-#if 0
-            pos.set_from_packed_sfen(it->sfen);
-            cout << pos << "Win : " << it->is_win << " , " << it->score << endl;
-#endif
         }
 
         return quit;
@@ -667,13 +663,12 @@ namespace Learner
 
         for (auto m : pv)
         {
-#if 1
             // There should be no illegal move. This is as a debugging precaution.
             if (!pos.pseudo_legal(m) || !pos.legal(m))
             {
                 cout << "Error! : " << pos.fen() << m << endl;
             }
-#endif
+
             pos.do_move(m, states[ply++]);
 
             // Because the difference calculation of evaluate() cannot be
@@ -803,19 +798,6 @@ namespace Learner
                     // Save the move score for adjudication.
                     move_hist_scores.push_back(search_value);
 
-#if 0
-                    dbg_hit_on(search_value == leaf_value);
-                    // gensfen depth 3 eval_limit 32000
-                    // Total 217749 Hits 203579 hit rate (%) 93.490
-                    // gensfen depth 6 eval_limit 32000
-                    // Total 78407 Hits 69190 hit rate (%) 88.245
-                    // gensfen depth 6 eval_limit 3000
-                    // Total 53879 Hits 43713 hit rate (%) 81.132
-
-                    // Problems such as pruning with moves in the substitution table.
-                    // This is a little uncomfortable as a teacher...
-#endif
-
                     // If depth 0, pv is not obtained, so search again at depth 2.
                     if (search_depth_min <= 0)
                     {
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index e7f021fe..2f1d27b2 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -1163,11 +1163,6 @@ namespace Learner
             if (ps.gamePly < prng.rand(reduction_gameply))
                 goto RETRY_READ;
 
-#if 0
-            auto sfen = pos.sfen_unpack(ps.data);
-            pos.set(sfen);
-#endif
-            // ↑ Since it is slow when passing through sfen, I made a dedicated function.
             StateInfo si;
             const bool mirror = prng.rand(100) < mirror_percentage;
             if (pos.set_from_packed_sfen(ps.sfen, &si, th, mirror) != 0)
@@ -1207,28 +1202,6 @@ namespace Learner
             // If it is the result of searching a completely different place, it may become noise.
             // It may be better not to study where the difference in evaluation values ​​is too large.
 
-#if 0
-            // If you do this, about 13% of the phases will be excluded 
-            // from the learning target. Good and bad are subtle.
-            if (pv.size() >= 1 && (uint16_t)pv[0] != ps.move)
-            {
-                //dbg_hit_on(false);
-                continue;
-            }
-#endif
-
-#if 0
-            // It may be better not to study where the difference in evaluation values ​​is too large.
-            // → It's okay because it passes the win rate function... 
-            // About 30% of the phases are out of the scope of learning...
-            if (abs((int16_t)r.first - ps.score) >= Eval::PawnValue * 4)
-            {
-                //dbg_hit_on(false);
-                continue;
-            }
-            //dbg_hit_on(true);
-#endif
-
             int ply = 0;
 
             // A helper function that adds the gradient to the current phase.
@@ -1315,17 +1288,6 @@ namespace Learner
             // rewind the phase
             for (auto it = pv.rbegin(); it != pv.rend(); ++it)
                 pos.undo_move(*it);
-
-#if 0
-            // When adding the gradient to the root phase
-            shallow_value = 
-                (rootColor == pos.side_to_move()) 
-                ? Eval::evaluate(pos) 
-                : -Eval::evaluate(pos);
-
-            dj_dw = calc_grad(deep_value, shallow_value, ps);
-            Eval::add_grad(pos, rootColor, dj_dw, without_kpp);
-#endif
         }
 
     }
@@ -2058,18 +2020,6 @@ namespace Learner
             learn_think.best_nn_directory = std::string(Options["EvalDir"]);
         }
 
-#if 0
-        // A test to give a gradient of 1.0 to the initial stage of Hirate.
-        pos.set_hirate();
-        cout << Eval::evaluate(pos) << endl;
-        //Eval::print_eval_stat(pos);
-        Eval::add_grad(pos, BLACK, 32.0, false);
-        Eval::update_weights(1);
-        pos.state()->sum.p[2][0] = VALUE_NOT_EVALUATED;
-        cout << Eval::evaluate(pos) << endl;
-        //Eval::print_eval_stat(pos);
-#endif
-
         cout << "init done." << endl;
 
         // Reflect other option settings.

From ec96409176fa8f2cdc2e8a003150fcabf037f85c Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Tue, 8 Sep 2020 23:30:57 +0900
Subject: [PATCH 055/398] Replaced DNDEBUG macro to _DEBUG macro.

---
 src/learn/gensfen.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index ec3de570..0232e5d4 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -70,7 +70,7 @@ namespace Learner
             file_worker_thread.join();
             output_file_stream.close();
 
-#if !defined(DNDEBUG)
+#if defined(_DEBUG)
             {
                 // All buffers should be empty since file_worker_thread
                 // should have written everything before exiting.

From aa2452caf39446fded3c0ee79c18c3ecb43369b3 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Tue, 8 Sep 2020 23:45:19 +0900
Subject: [PATCH 056/398] Removed #if for USE_EVAL_HASH.

---
 src/eval/evaluate_common.h |  6 ------
 src/learn/gensfen.cpp      | 10 ----------
 src/learn/learner.cpp      |  2 --
 3 files changed, 18 deletions(-)

diff --git a/src/eval/evaluate_common.h b/src/eval/evaluate_common.h
index 3fb161ab..927783cd 100644
--- a/src/eval/evaluate_common.h
+++ b/src/eval/evaluate_common.h
@@ -18,12 +18,6 @@
 
 namespace Eval
 {
-
-#if defined(USE_EVAL_HASH)
-	// prefetch function
-	void prefetch_evalhash(const Key key);
-#endif
-
 	// An operator that applies the function f to each parameter of the evaluation function.
 	// Used for parameter analysis etc.
 	// type indicates the survey target.
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 0232e5d4..4050d983 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -956,12 +956,6 @@ namespace Learner
 
         string token;
 
-        // When hit to eval hash, as a evaluation value near the initial stage, if a hash collision occurs and a large value is written
-        // When eval_limit is set small, eval_limit will be exceeded every time in the initial phase, and phase generation will not proceed.
-        // Therefore, eval hash needs to be disabled.
-        // After that, when the hash of the eval hash collides, the evaluation value of a strange value is used, and it may be unpleasant to use it for the teacher.
-        bool use_eval_hash = false;
-
         // Save to file in this unit.
         // File names are serialized like file_1.bin, file_2.bin.
         uint64_t save_every = UINT64_MAX;
@@ -1010,8 +1004,6 @@ namespace Learner
                 is >> write_minply;
             else if (token == "write_maxply")
                 is >> write_maxply;
-            else if (token == "use_eval_hash")
-                is >> use_eval_hash;
             else if (token == "save_every")
                 is >> save_every;
             else if (token == "random_file_name")
@@ -1033,7 +1025,6 @@ namespace Learner
 #if defined(USE_GLOBAL_OPTIONS)
         // Save it for later restore.
         auto oldGlobalOptions = GlobalOptions;
-        GlobalOptions.use_eval_hash = use_eval_hash;
 #endif
 
         // If search depth2 is not set, leave it the same as search depth.
@@ -1075,7 +1066,6 @@ namespace Learner
             << "  write_minply           = " << write_minply << endl
             << "  write_maxply           = " << write_maxply << endl
             << "  output_file_name       = " << output_file_name << endl
-            << "  use_eval_hash          = " << use_eval_hash << endl
             << "  save_every             = " << save_every << endl
             << "  random_file_name       = " << random_file_name << endl
             << "  write_out_draw_game_in_training_data_generation = " << write_out_draw_game_in_training_data_generation << endl
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 2f1d27b2..9e6f10cb 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -1639,8 +1639,6 @@ namespace Learner
 #if defined(USE_GLOBAL_OPTIONS)
     // Save it for later restore.
         auto oldGlobalOptions = GlobalOptions;
-        // If you hit the eval hash, you can not calculate rmse etc. so turn it off.
-        GlobalOptions.use_eval_hash = false;
         // If you hit the replacement table, pruning may occur at the previous evaluation value, so turn it off.
         GlobalOptions.use_hash_probe = false;
 #endif

From 82dc68ba9ffe1d5fe849eef1f0fcc565ef810512 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Tue, 8 Sep 2020 23:47:04 +0900
Subject: [PATCH 057/398] Removed #if for USE_GLOBAL_OPTIONS.

---
 src/learn/gensfen.cpp | 11 -----------
 src/learn/learner.cpp | 12 ------------
 src/search.cpp        | 11 -----------
 3 files changed, 34 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 4050d983..3d015acf 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -1022,11 +1022,6 @@ namespace Learner
                 cout << "Error! : Illegal token " << token << endl;
         }
 
-#if defined(USE_GLOBAL_OPTIONS)
-        // Save it for later restore.
-        auto oldGlobalOptions = GlobalOptions;
-#endif
-
         // If search depth2 is not set, leave it the same as search depth.
         if (search_depth_max == INT_MIN)
             search_depth_max = search_depth_min;
@@ -1103,12 +1098,6 @@ namespace Learner
         }
 
         std::cout << "gensfen finished." << endl;
-
-#if defined(USE_GLOBAL_OPTIONS)
-        // Restore Global Options.
-        GlobalOptions = oldGlobalOptions;
-#endif
-
     }
 }
 #endif
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 9e6f10cb..daea9594 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -1636,13 +1636,6 @@ namespace Learner
         uint64_t eta1_epoch = 0; // eta2 is not applied by default
         uint64_t eta2_epoch = 0; // eta3 is not applied by default
 
-#if defined(USE_GLOBAL_OPTIONS)
-    // Save it for later restore.
-        auto oldGlobalOptions = GlobalOptions;
-        // If you hit the replacement table, pruning may occur at the previous evaluation value, so turn it off.
-        GlobalOptions.use_hash_probe = false;
-#endif
-
         // --- Function that only shuffles the teacher aspect
 
         // normal shuffle
@@ -2072,11 +2065,6 @@ namespace Learner
 
         // Save once at the end.
         learn_think.save(true);
-
-#if defined(USE_GLOBAL_OPTIONS)
-        // Restore Global Options.
-        GlobalOptions = oldGlobalOptions;
-#endif
     }
 
 } // namespace Learner
diff --git a/src/search.cpp b/src/search.cpp
index 8f258ae4..67348a2b 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -2070,17 +2070,6 @@ namespace Learner
         rootMoves.push_back(Search::RootMove(m));
 
       assert(!rootMoves.empty());
-
-      //#if defined(USE_GLOBAL_OPTIONS)
-      // Since the generation of the substitution table for each search thread should be managed,
-      // Increase the generation of the substitution table for this thread because it is a new search.
-            //TT.new_search(th->thread_id());
-
-            // ª If you call new_search here, it may be a loss because you can't use the previous search result.
-            // Do not do this here, but caller should do TT.new_search(th->thread_id()) for each station ...
-
-            // ¨Because we want to avoid reaching the same final diagram, use the substitution table commonly for all threads when generating teachers.
-      //#endif
     }
   }
 

From 05d26499b42878447a21b6d721f4868151357665 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Tue, 8 Sep 2020 23:57:51 +0900
Subject: [PATCH 058/398] Removed LEARN_ELMO_METHOD macro.

---
 src/learn/learn.h | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/src/learn/learn.h b/src/learn/learn.h
index 1bc39cf9..7285f61a 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -5,18 +5,6 @@
 
 #include <vector>
 
-// =====================
-// Settings for learning
-// =====================
-
-// If you select one of the following, the details after that will be automatically selected.
-// If you don't select any of them, you need to set the subsequent details one by one.
-
-// Learning setting by elmo method. This is the default setting.
-// To make a standard squeeze diaphragm, specify "lambda 1" with the learn command.
-#define LEARN_ELMO_METHOD
-
-
 // ----------------------
 // update formula
 // ----------------------
@@ -147,10 +135,8 @@ typedef float LearnFloatType;
 // Learning with the method of elmo (WCSC27)
 // ----------------------
 
-#if defined( LEARN_ELMO_METHOD )
 #define LOSS_FUNCTION_IS_ELMO_METHOD
 #define ADA_GRAD_UPDATE
-#endif
 
 // Character string according to update formula. (Output for debugging.)
 // Implemented various update expressions, but concluded that AdaGrad is the best in terms of speed and memory.

From 0271d707759117af6557beb93319aa51c07280aa Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 00:01:53 +0900
Subject: [PATCH 059/398] Removed ADA_GRAD_UPDATE macro.

---
 src/learn/learn.h          | 10 +------
 src/learn/learning_tools.h | 54 +-------------------------------------
 2 files changed, 2 insertions(+), 62 deletions(-)

diff --git a/src/learn/learn.h b/src/learn/learn.h
index 7285f61a..8fb6217f 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -9,9 +9,6 @@
 // update formula
 // ----------------------
 
-// Ada Grad. Recommended because it is stable.
-// #define ADA_GRAD_UPDATE
-
 // SGD looking only at the sign of the gradient. It requires less memory, but the accuracy is...
 // #define SGD_UPDATE
 
@@ -136,13 +133,8 @@ typedef float LearnFloatType;
 // ----------------------
 
 #define LOSS_FUNCTION_IS_ELMO_METHOD
-#define ADA_GRAD_UPDATE
 
-// Character string according to update formula. (Output for debugging.)
-// Implemented various update expressions, but concluded that AdaGrad is the best in terms of speed and memory.
-#if defined(ADA_GRAD_UPDATE)
-#define LEARN_UPDATE "AdaGrad"
-#elif defined(SGD_UPDATE)
+#if defined(SGD_UPDATE)
 #define LEARN_UPDATE "SGD"
 #endif
 
diff --git a/src/learn/learning_tools.h b/src/learn/learning_tools.h
index 3c4be08a..854133e4 100644
--- a/src/learn/learning_tools.h
+++ b/src/learn/learning_tools.h
@@ -76,59 +76,7 @@ namespace EvalLearningTools
 
 		template <typename T> void updateFV(T& v) { updateFV(v, 1.0); }
 
-#if defined (ADA_GRAD_UPDATE)
-
-		// Since the maximum value that can be accurately calculated with float is INT16_MAX*256-1
-		// Keep the small value as a marker.
-		const LearnFloatType V0_NOT_INIT = (INT16_MAX * 128);
-
-		// What holds v internally. The previous implementation kept a fixed decimal with only a fractional part to save memory,
-		// Since it is doubtful in accuracy and the visibility is bad, it was abolished.
-		LearnFloatType v0 = LearnFloatType(V0_NOT_INIT);
-
-		// AdaGrad g2
-		LearnFloatType g2 = LearnFloatType(0);
-
-		// update with AdaGrad
-		// When executing this function, the value of g and the member do not change
-		// Guaranteed by the caller. It does not have to be an atomic operation.
-		// k is a coefficient for eta. 1.0 is usually sufficient. If you want to lower eta for your turn item, set this to 1/8.0 etc.
-		template <typename T>
-		void updateFV(T& v,double k)
-		{
-			// AdaGrad update formula
-			// Gradient vector is g, vector to be updated is v, η(eta) is a constant,
-			//     g2 = g2 + g^2
-			//     v = v - ηg/sqrt(g2)
-
-			constexpr double epsilon = 0.000001;
-
-			if (g == LearnFloatType(0))
-				return;
-
-			g2 += g * g;
-
-			// If v0 is V0_NOT_INIT, it means that the value is not initialized with the value of KK/KKP/KPP array,
-			// In this case, read the value of v from the one passed in the argument.
-			double V = (v0 == V0_NOT_INIT) ? v : v0;
-
-			V -= k * eta * (double)g / sqrt((double)g2 + epsilon);
-
-			// Limit the value of V to be within the range of types.
-			// By the way, windows.h defines the min and max macros, so to avoid it,
-			// Here, it is enclosed in parentheses so that it is not treated as a function-like macro.
-			V = (std::min)((double)(std::numeric_limits<T>::max)() , V);
-			V = (std::max)((double)(std::numeric_limits<T>::min)() , V);
-
-			v0 = (LearnFloatType)V;
-			v = (T)round(V);
-
-			// Clear g because one update of mini-batch for this element is over
-			// g[i] = 0;
-			// → There is a problem of dimension reduction, so this will be done by the caller.
-		}
-
-#elif defined(SGD_UPDATE)
+#if defined(SGD_UPDATE)
 
 		// See only the sign of the gradient Update with SGD
 		// When executing this function, the value of g and the member do not change

From f3a158725d573753cf4b81fc5866c0f3bbdb1e88 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 00:07:09 +0900
Subject: [PATCH 060/398] Removed SGD_UPDATE macro.

---
 src/learn/learn.h          | 12 ---------
 src/learn/learning_tools.h | 51 ++------------------------------------
 2 files changed, 2 insertions(+), 61 deletions(-)

diff --git a/src/learn/learn.h b/src/learn/learn.h
index 8fb6217f..91b40213 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -5,14 +5,6 @@
 
 #include <vector>
 
-// ----------------------
-// update formula
-// ----------------------
-
-// SGD looking only at the sign of the gradient. It requires less memory, but the accuracy is...
-// #define SGD_UPDATE
-
-
 // ----------------------
 // Select the objective function
 // ----------------------
@@ -134,10 +126,6 @@ typedef float LearnFloatType;
 
 #define LOSS_FUNCTION_IS_ELMO_METHOD
 
-#if defined(SGD_UPDATE)
-#define LEARN_UPDATE "SGD"
-#endif
-
 #if defined(LOSS_FUNCTION_IS_WINNING_PERCENTAGE)
 #define LOSS_FUNCTION "WINNING_PERCENTAGE"
 #elif defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY)
diff --git a/src/learn/learning_tools.h b/src/learn/learning_tools.h
index 854133e4..348105b6 100644
--- a/src/learn/learning_tools.h
+++ b/src/learn/learning_tools.h
@@ -4,13 +4,12 @@
 // A set of machine learning tools related to the weight array used for machine learning of evaluation functions
 
 #include "learn.h"
+
 #if defined (EVAL_LEARN)
-#include <array>
 
-#if defined(SGD_UPDATE) || defined(USE_KPPP_MIRROR_WRITE)
 #include "../misc.h"  // PRNG , my_insertion_sort
-#endif
 
+#include <array>
 #include <cmath>	// std::sqrt()
 
 namespace EvalLearningTools
@@ -29,14 +28,6 @@ namespace EvalLearningTools
 		// cumulative value of one mini-batch gradient
 		LearnFloatType g = LearnFloatType(0);
 
-		// When ADA_GRAD_UPDATE. LearnFloatType == float,
-		// total 4*2 + 4*2 + 1*2 = 18 bytes
-		// It suffices to secure a Weight array that is 4.5 times the size of the evaluation function parameter of 1GB.
-		// However, sizeof(Weight)==20 code is generated if the structure alignment is in 4-byte units, so
-		// Specify pragma pack(2).
-
-		// For SGD_UPDATE, this structure is reduced by 10 bytes to 8 bytes.
-
 		// Learning rate η(eta) such as AdaGrad.
 		// It is assumed that eta1,2,3,eta1_epoch,eta2_epoch have been set by the time updateFV() is called.
 		// The epoch of update_weights() gradually changes from eta1 to eta2 until eta1_epoch.
@@ -76,44 +67,6 @@ namespace EvalLearningTools
 
 		template <typename T> void updateFV(T& v) { updateFV(v, 1.0); }
 
-#if defined(SGD_UPDATE)
-
-		// See only the sign of the gradient Update with SGD
-		// When executing this function, the value of g and the member do not change
-		// Guaranteed by the caller. It does not have to be an atomic operation.
-		template <typename T>
-		void updateFV(T & v , double k)
-		{
-			if (g == 0)
-				return;
-
-			// See only the sign of g and update.
-			// If g <0, add v a little.
-			// If g> 0, subtract v slightly.
-
-			// Since we only add integers, no decimal part is required.
-
-			// It's a good idea to move around 0-5.
-			// It is better to have a Gaussian distribution, so generate a 5-bit random number (each bit has a 1/2 probability of 1),
-			// Pop_count() it. At this time, it has a binomial distribution.
-			//int16_t diff = (int16_t)POPCNT32((u32)prng.rand(31));
-			// → If I do this with 80 threads, this AsyncPRNG::rand() locks, so I slowed down. This implementation is not good.
-			int16_t diff = 1;
-
-			double V = v;
-			if (g > 0.0)
-				V-= diff;
-			else
-				V+= diff;
-
-			V = (std::min)((double)(std::numeric_limits<T>::max)(), V);
-			V = (std::max)((double)(std::numeric_limits<T>::min)(), V);
-
-			v = (T)V;
-		}
-
-#endif
-
 		// grad setting
 		template <typename T> void set_grad(const T& g_) { g = g_; }
 

From d37eb63581ce2de8fd1a8406a9bc06b6377d2176 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 00:08:56 +0900
Subject: [PATCH 061/398] Removed LOSS_FUNCTION_IS_WINNING_PERCENTAGE macro.

---
 src/learn/learn.h     |  9 +--------
 src/learn/learner.cpp | 36 ------------------------------------
 2 files changed, 1 insertion(+), 44 deletions(-)

diff --git a/src/learn/learn.h b/src/learn/learn.h
index 91b40213..9d783986 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -9,11 +9,6 @@
 // Select the objective function
 // ----------------------
 
-// The objective function is the sum of squares of the difference in winning percentage
-// See learner.cpp for more information.
-
-//#define LOSS_FUNCTION_IS_WINNING_PERCENTAGE
-
 // Objective function is cross entropy
 // See learner.cpp for more information.
 // So-called ordinary "rag cloth squeezer"
@@ -126,9 +121,7 @@ typedef float LearnFloatType;
 
 #define LOSS_FUNCTION_IS_ELMO_METHOD
 
-#if defined(LOSS_FUNCTION_IS_WINNING_PERCENTAGE)
-#define LOSS_FUNCTION "WINNING_PERCENTAGE"
-#elif defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY)
+#if defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY)
 #define LOSS_FUNCTION "CROSS_ENTOROPY"
 #elif defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE)
 #define LOSS_FUNCTION "CROSS_ENTOROPY_FOR_VALUE"
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index daea9594..e9658da6 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -163,42 +163,6 @@ namespace Learner
         return ((y2 - y1) / epsilon) / winning_probability_coefficient;
     }
 
-    // When the objective function is the sum of squares of the difference in winning percentage
-#if defined (LOSS_FUNCTION_IS_WINNING_PERCENTAGE)
-// function to calculate the gradient
-    double calc_grad(Value deep, Value shallow, PackedSfenValue& psv)
-    {
-        // The square of the win rate difference minimizes it in the objective function.
-        // Objective function J = 1/2m Σ (win_rate(shallow)-win_rate(deep) )^2
-        // However, σ is a sigmoid function that converts the 
-        // evaluation value into the difference in the winning percentage.
-        // m is the number of samples. shallow is the evaluation value 
-        // for a shallow search (qsearch()). deep is the evaluation value for deep search.
-        // If W is the feature vector (parameter of the evaluation function) 
-        // and Xi and Yi are teachers
-        // shallow = W*Xi // * is the Hadamard product, transposing W and meaning X
-        // f(Xi) = win_rate(W*Xi)
-        // If σ(i th deep) = Yi,
-        // J = m/2 Σ (f(Xi)-Yi )^2
-        // becomes a common expression.
-        // W is a vector, and if we write the jth element as Wj, from the chain rule
-        // ∂J/∂Wj = ∂J/∂f ・∂f/∂W ・∂W/∂Wj
-        // = 1/m Σ (f(Xi)-y) ・f'(Xi) ・ 1
-
-        // 1/m will be multiplied later, but the contents of Σ can 
-        // be retained in the array as the value of the gradient.
-        // f'(Xi) = win_rate'(shallow) = sigmoid'(shallow/600) = dsigmoid(shallow / 600) / 600
-        // This /600 at the end is adjusted by the learning rate, so do not write it..
-        // Also, the coefficient of 1/m is unnecessary if you use the update 
-        // formula that has the automatic gradient adjustment function like Adam and AdaGrad.
-        // Therefore, it is not necessary to save it in memory.
-
-        const double p = winning_percentage(deep, psv.gamePly);
-        const double q = winning_percentage(shallow, psv.gamePly);
-        return (q - p) * Math::dsigmoid(double(shallow) / 600.0);
-    }
-#endif
-
 #if defined (LOSS_FUNCTION_IS_CROSS_ENTOROPY)
     double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
     {

From f52fbf8006174023fa137feda1d7db67a884ac2e Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 00:10:04 +0900
Subject: [PATCH 062/398] Removed LOSS_FUNCTION_IS_CROSS_ENTOROPY macro.

---
 src/learn/learn.h     |  9 +--------
 src/learn/learner.cpp | 29 -----------------------------
 2 files changed, 1 insertion(+), 37 deletions(-)

diff --git a/src/learn/learn.h b/src/learn/learn.h
index 9d783986..da542d67 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -9,11 +9,6 @@
 // Select the objective function
 // ----------------------
 
-// Objective function is cross entropy
-// See learner.cpp for more information.
-// So-called ordinary "rag cloth squeezer"
-//#define LOSS_FUNCTION_IS_CROSS_ENTOROPY
-
 // A version in which the objective function is cross entropy, but the win rate function is not passed
 // #define LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE
 
@@ -121,9 +116,7 @@ typedef float LearnFloatType;
 
 #define LOSS_FUNCTION_IS_ELMO_METHOD
 
-#if defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY)
-#define LOSS_FUNCTION "CROSS_ENTOROPY"
-#elif defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE)
+#if defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE)
 #define LOSS_FUNCTION "CROSS_ENTOROPY_FOR_VALUE"
 #elif defined(LOSS_FUNCTION_IS_ELMO_METHOD)
 #define LOSS_FUNCTION "ELMO_METHOD(WCSC27)"
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index e9658da6..66835ce5 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -163,35 +163,6 @@ namespace Learner
         return ((y2 - y1) / epsilon) / winning_probability_coefficient;
     }
 
-#if defined (LOSS_FUNCTION_IS_CROSS_ENTOROPY)
-    double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
-    {
-        // Objective function with cross entropy
-
-        // For the concept and nature of cross entropy,
-        // http://nnadl-ja.github.io/nnadl_site_ja/chap3.html#the_cross-entropy_cost_function
-        // http://postd.cc/visual-information-theory-3/
-        // Refer to etc.
-
-        // Objective function design)
-        // We want to make the distribution of p closer to the distribution of q 
-        // → Think of it as the problem of minimizing the cross entropy 
-        // between the probability distributions of p and q.
-        // J = H(p,q) =-Σ p(x) log(q(x)) = -p log q-(1-p) log(1-q)
-        // x
-
-        // p is a constant and q is a Wi function (q = σ(W・Xi) ).
-        // ∂J/∂Wi = -p・q'/q-(1-p)(1-q)'/(1-q)
-        // = ...
-        // = q-p.
-
-        const double p = winning_percentage(deep, psv.gamePly);
-        const double q = winning_percentage(shallow, psv.gamePly);
-
-        return q - p;
-    }
-#endif
-
 #if defined ( LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE )
     double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
     {

From ef1601218db703b42e31b34d8c324f0ec3001f83 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 00:11:11 +0900
Subject: [PATCH 063/398] Removed LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE
 macro.

---
 src/learn/learn.h     |  7 +------
 src/learn/learner.cpp | 11 -----------
 2 files changed, 1 insertion(+), 17 deletions(-)

diff --git a/src/learn/learn.h b/src/learn/learn.h
index da542d67..d2477277 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -9,9 +9,6 @@
 // Select the objective function
 // ----------------------
 
-// A version in which the objective function is cross entropy, but the win rate function is not passed
-// #define LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE
-
 // elmo (WCSC27) method
 // #define LOSS_FUNCTION_IS_ELMO_METHOD
 
@@ -116,9 +113,7 @@ typedef float LearnFloatType;
 
 #define LOSS_FUNCTION_IS_ELMO_METHOD
 
-#if defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE)
-#define LOSS_FUNCTION "CROSS_ENTOROPY_FOR_VALUE"
-#elif defined(LOSS_FUNCTION_IS_ELMO_METHOD)
+#if defined(LOSS_FUNCTION_IS_ELMO_METHOD)
 #define LOSS_FUNCTION "ELMO_METHOD(WCSC27)"
 #endif
 
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 66835ce5..82bcfa09 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -163,17 +163,6 @@ namespace Learner
         return ((y2 - y1) / epsilon) / winning_probability_coefficient;
     }
 
-#if defined ( LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE )
-    double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
-    {
-        // Version that does not pass the winning percentage function
-        // This, unless EVAL_LIMIT is set low, trying to 
-        // match the evaluation value with the shape of the end stage
-        // eval may exceed the range of eval.
-        return shallow - deep;
-    }
-#endif
-
 #if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
 
     // A constant used in elmo (WCSC27). Adjustment required.

From dbad9d96e0fc2923edfdbef37162ecd5b0645d50 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 00:17:02 +0900
Subject: [PATCH 064/398] Removed LOSS_FUNCTION_IS_ELMO_METHOD macro.

---
 src/learn/learn.h     | 19 ---------------
 src/learn/learner.cpp | 54 +------------------------------------------
 2 files changed, 1 insertion(+), 72 deletions(-)

diff --git a/src/learn/learn.h b/src/learn/learn.h
index d2477277..2ee2f8d6 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -5,21 +5,6 @@
 
 #include <vector>
 
-// ----------------------
-// Select the objective function
-// ----------------------
-
-// elmo (WCSC27) method
-// #define LOSS_FUNCTION_IS_ELMO_METHOD
-
-// ※ Other things may be added.
-
-
-// ----------------------
-// debug settings for learning
-// ----------------------
-
-
 // ----------------------
 // learning from zero vector
 // ----------------------
@@ -111,11 +96,7 @@ typedef float LearnFloatType;
 // Learning with the method of elmo (WCSC27)
 // ----------------------
 
-#define LOSS_FUNCTION_IS_ELMO_METHOD
-
-#if defined(LOSS_FUNCTION_IS_ELMO_METHOD)
 #define LOSS_FUNCTION "ELMO_METHOD(WCSC27)"
-#endif
 
 // ----------------------
 // Definition of struct used in Learner
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 82bcfa09..84cade5c 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -163,8 +163,6 @@ namespace Learner
         return ((y2 - y1) / epsilon) / winning_probability_coefficient;
     }
 
-#if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
-
     // A constant used in elmo (WCSC27). Adjustment required.
     // Since elmo does not internally divide the expression, the value is different.
     // You can set this value with the learn command.
@@ -293,7 +291,6 @@ namespace Learner
             (-m * std::log(m + epsilon) - (1.0 - m) * std::log(1.0 - m + epsilon));
     }
 
-#endif
     // Other objective functions may be considered in the future...
     double calc_grad(Value shallow, const PackedSfenValue& psv) 
     {
@@ -629,14 +626,12 @@ namespace Learner
             stop_flag(false), 
             save_only_once(false)
         {
-#if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
             learn_sum_cross_entropy_eval = 0.0;
             learn_sum_cross_entropy_win = 0.0;
             learn_sum_cross_entropy = 0.0;
             learn_sum_entropy_eval = 0.0;
             learn_sum_entropy_win = 0.0;
             learn_sum_entropy = 0.0;
-#endif
 
             newbob_scale = 1.0;
             newbob_decay = 1.0;
@@ -689,15 +684,13 @@ namespace Learner
 
         // --- loss calculation
 
-#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
-    // For calculation of learning data loss
+        // For calculation of learning data loss
         atomic<double> learn_sum_cross_entropy_eval;
         atomic<double> learn_sum_cross_entropy_win;
         atomic<double> learn_sum_cross_entropy;
         atomic<double> learn_sum_entropy_eval;
         atomic<double> learn_sum_entropy_win;
         atomic<double> learn_sum_entropy;
-#endif
 
         shared_timed_mutex nn_mutex;
         double newbob_scale;
@@ -759,13 +752,6 @@ namespace Learner
         std::cout << ", iteration " << epoch;
         std::cout << ", eta = " << Eval::get_eta() << ", ";
 
-#if !defined(LOSS_FUNCTION_IS_ELMO_METHOD)
-        double sum_error = 0;
-        double sum_error2 = 0;
-        double sum_error3 = 0;
-#endif
-
-#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
         // For calculation of verification data loss
         atomic<double> test_sum_cross_entropy_eval, test_sum_cross_entropy_win, test_sum_cross_entropy;
         atomic<double> test_sum_entropy_eval, test_sum_entropy_win, test_sum_entropy;
@@ -779,7 +765,6 @@ namespace Learner
         // norm for learning
         atomic<double> sum_norm;
         sum_norm = 0;
-#endif
 
         // The number of times the pv first move of deep 
         // search matches the pv first move of search(1).
@@ -841,25 +826,11 @@ namespace Learner
                 // Note) This code does not consider when 
                 //       eval_limit is specified in the learn command.
 
-                // --- error calculation
-
-#if !defined(LOSS_FUNCTION_IS_ELMO_METHOD)
-                auto grad = calc_grad(deep_value, shallow_value, ps);
-
-                // something like rmse
-                sum_error += grad * grad;
-                // Add the absolute value of the gradient
-                sum_error2 += abs(grad);
-                // Add the absolute value of the difference between the evaluation values
-                sum_error3 += abs(shallow_value - deep_value);
-#endif
-
                 // --- calculation of cross entropy
 
                 // For the time being, regarding the win rate and loss terms only in the elmo method
                 // Calculate and display the cross entropy.
 
-#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
                 double test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy;
                 double test_entropy_eval, test_entropy_win, test_entropy;
                 calc_cross_entropy(
@@ -881,7 +852,6 @@ namespace Learner
                 test_sum_entropy_win += test_entropy_win;
                 test_sum_entropy += test_entropy;
                 sum_norm += (double)abs(shallow_value);
-#endif
 
                 // Determine if the teacher's move and the score of the shallow search match
                 {
@@ -905,17 +875,6 @@ namespace Learner
         while (task_count)
             sleep(1);
 
-#if !defined(LOSS_FUNCTION_IS_ELMO_METHOD)
-        // rmse = root mean square error: mean square error
-        // mae = mean absolute error: mean absolute error
-        auto dsig_rmse = std::sqrt(sum_error / (sfen_for_mse.size() + epsilon));
-        auto dsig_mae = sum_error2 / (sfen_for_mse.size() + epsilon);
-        auto eval_mae = sum_error3 / (sfen_for_mse.size() + epsilon);
-        cout << " , dsig rmse = " << dsig_rmse << " , dsig mae = " << dsig_mae
-            << " , eval mae = " << eval_mae;
-#endif
-
-#if defined(LOSS_FUNCTION_IS_ELMO_METHOD)
         latest_loss_sum += test_sum_cross_entropy - test_sum_entropy;
         latest_loss_count += sr.sfen_for_mse.size();
 
@@ -960,9 +919,6 @@ namespace Learner
         learn_sum_entropy_eval = 0.0;
         learn_sum_entropy_win = 0.0;
         learn_sum_entropy = 0.0;
-#else
-        << endl;
-#endif
     }
 
     void LearnerThink::thread_worker(size_t thread_id)
@@ -1144,7 +1100,6 @@ namespace Learner
                     ? Eval::evaluate(pos) 
                     : -Eval::evaluate(pos);
 
-#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
                 // Calculate loss for training data
                 double learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy;
                 double learn_entropy_eval, learn_entropy_win, learn_entropy;
@@ -1165,7 +1120,6 @@ namespace Learner
                 learn_sum_entropy_eval += learn_entropy_eval;
                 learn_sum_entropy_win += learn_entropy_win;
                 learn_sum_entropy += learn_entropy;
-#endif
 
                 const double example_weight =
                     (discount_rate != 0 && ply != (int)pv.size()) ? discount_rate : 1.0;
@@ -1600,12 +1554,10 @@ namespace Learner
         // Turn on if you want to pass a pre-shuffled file.
         bool no_shuffle = false;
 
-#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
         // elmo lambda
         ELMO_LAMBDA = 0.33;
         ELMO_LAMBDA2 = 0.33;
         ELMO_LAMBDA_LIMIT = 32000;
-#endif
 
         // Discount rate. If this is set to a value other than 0, 
         // the slope will be added even at other than the PV termination. 
@@ -1703,13 +1655,11 @@ namespace Learner
             else if (option == "freeze_kkpp")  is >> freeze[3];
 #endif
 
-#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
             // LAMBDA
             else if (option == "lambda")       is >> ELMO_LAMBDA;
             else if (option == "lambda2")      is >> ELMO_LAMBDA2;
             else if (option == "lambda_limit") is >> ELMO_LAMBDA_LIMIT;
 
-#endif
             else if (option == "reduction_gameply") is >> reduction_gameply;
 
             // shuffle related
@@ -1900,11 +1850,9 @@ namespace Learner
         reduction_gameply = max(reduction_gameply, 1);
         cout << "reduction_gameply : " << reduction_gameply << endl;
 
-#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
         cout << "LAMBDA            : " << ELMO_LAMBDA << endl;
         cout << "LAMBDA2           : " << ELMO_LAMBDA2 << endl;
         cout << "LAMBDA_LIMIT      : " << ELMO_LAMBDA_LIMIT << endl;
-#endif
 
         cout << "mirror_percentage : " << mirror_percentage << endl;
         cout << "eval_save_interval  : " << eval_save_interval << " sfens" << endl;

From f52165e1d3b8bebdd702e089eb9fdd7761d45076 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 00:19:14 +0900
Subject: [PATCH 065/398] Removed RESET_TO_ZERO_VECTOR macro.

---
 src/learn/learn.h | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/src/learn/learn.h b/src/learn/learn.h
index 2ee2f8d6..6056e8c6 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -5,18 +5,6 @@
 
 #include <vector>
 
-// ----------------------
-// learning from zero vector
-// ----------------------
-
-// Start learning the evaluation function parameters from the zero vector.
-// Initialize to zero, generate a game, learn from zero vector,
-// Game generation → If you repeat learning, you will get parameters that do not depend on the professional game. (maybe)
-// (very time consuming)
-
-//#define RESET_TO_ZERO_VECTOR
-
-
 // ----------------------
 // Floating point for learning
 // ----------------------

From 5e2570267228653a11bf42c14d77d1baf26b99ac Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 00:19:53 +0900
Subject: [PATCH 066/398] Removed USE_TRIANGLE_WEIGHT_ARRAY macro.

---
 src/learn/learn.h | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/src/learn/learn.h b/src/learn/learn.h
index 6056e8c6..ea622bce 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -23,15 +23,6 @@ typedef float LearnFloatType;
 //#include "half_float.h"
 //typedef HalfFloat::float16 LearnFloatType;
 
-// ----------------------
-// save memory
-// ----------------------
-
-// Use a triangular array for the Weight array (of which is KPP) to save memory.
-// If this is used, the weight array for learning will be about 3 times as large as the evaluation function file.
-
-#define USE_TRIANGLE_WEIGHT_ARRAY
-
 // ----------------------
 // dimension down
 // ----------------------

From eafa5693658a91e97612a04b2c620ec5a545e3a0 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 00:22:02 +0900
Subject: [PATCH 067/398] Removed macros for KPP factorization.

---
 src/learn/learn.h | 31 -------------------------------
 1 file changed, 31 deletions(-)

diff --git a/src/learn/learn.h b/src/learn/learn.h
index ea622bce..0df71c7a 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -23,37 +23,6 @@ typedef float LearnFloatType;
 //#include "half_float.h"
 //typedef HalfFloat::float16 LearnFloatType;
 
-// ----------------------
-// dimension down
-// ----------------------
-
-// Dimension reduction for mirrors (left/right symmetry) and inverse (forward/backward symmetry).
-// All on by default.
-
-// Dimension reduction using mirror and inverse for KK. (Unclear effect)
-// USE_KK_MIRROR_WRITE must be on when USE_KK_INVERSE_WRITE is on.
-#define USE_KK_MIRROR_WRITE
-#define USE_KK_INVERSE_WRITE
-
-// Dimension reduction using Mirror and Inverse for KKP. (Inverse is not so effective)
-// When USE_KKP_INVERSE_WRITE is turned on, USE_KKP_MIRROR_WRITE must also be turned on.
-#define USE_KKP_MIRROR_WRITE
-#define USE_KKP_INVERSE_WRITE
-
-// Perform dimension reduction using a mirror for KPP. (Turning this off requires double the teacher position)
-// KPP has no inverse. (Because there is only K on the front side)
-#define USE_KPP_MIRROR_WRITE
-
-// Perform a dimension reduction using a mirror for KPPP. (Turning this off requires double the teacher position)
-// KPPP has no inverse. (Because there is only K on the front side)
-#define USE_KPPP_MIRROR_WRITE
-
-// Reduce the dimension by KPP for learning the KKPP component.
-// Learning is very slow.
-// Do not use as it is not debugged.
-//#define USE_KKPP_LOWER_DIM
-
-
 // ======================
 // Settings for creating teacher phases
 // ======================

From 8d763fb503fed49e4b7fa2be115e0fa6eb0e74d7 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 00:22:38 +0900
Subject: [PATCH 068/398] Removed LEARN_GENSFEN_USE_DRAW_RESULT macro.

---
 src/learn/learn.h | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/src/learn/learn.h b/src/learn/learn.h
index 0df71c7a..b7ca18e8 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -23,19 +23,6 @@ typedef float LearnFloatType;
 //#include "half_float.h"
 //typedef HalfFloat::float16 LearnFloatType;
 
-// ======================
-// Settings for creating teacher phases
-// ======================
-
-// ----------------------
-// write out the draw
-// ----------------------
-
-// When you reach a draw, write it out as a teacher position
-// It's subtle whether it's better to do this.
-// #define LEARN_GENSFEN_USE_DRAW_RESULT
-
-
 // ======================
 // configure
 // ======================

From cea17c92f9ad91d0dd2d73db272e6ce6712ba048 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 08:53:57 +0900
Subject: [PATCH 069/398] Simplified evaluate_common.h.

---
 src/eval/evaluate_common.h | 59 ++++----------------------------------
 1 file changed, 5 insertions(+), 54 deletions(-)

diff --git a/src/eval/evaluate_common.h b/src/eval/evaluate_common.h
index 927783cd..989169b3 100644
--- a/src/eval/evaluate_common.h
+++ b/src/eval/evaluate_common.h
@@ -1,75 +1,26 @@
 ﻿#ifndef _EVALUATE_COMMON_H_
 #define _EVALUATE_COMMON_H_
 
+#if defined(EVAL_LEARN)
+
 // A common header-like function for modern evaluation functions (EVAL_KPPT and EVAL_KPP_KKPT).
 
-#include <functional>
-
-// KK file name
-#define KK_BIN "KK_synthesized.bin"
-
-// KKP file name
-#define KKP_BIN "KKP_synthesized.bin"
-
-// KPP file name
-#define KPP_BIN "KPP_synthesized.bin"
-
-#include "../position.h"
+#include <string>
 
 namespace Eval
 {
-	// An operator that applies the function f to each parameter of the evaluation function.
-	// Used for parameter analysis etc.
-	// type indicates the survey target.
-	// type = -1 :KK,KKP,KPP all
-	// type = 0: KK only
-	// type = 1: KKP only
-	// type = 2: KPP only
-	void foreach_eval_param(std::function<void(int32_t, int32_t)>f, int type = -1);
-
 	// --------------------------
 	// for learning
 	// --------------------------
 
-#if defined(EVAL_LEARN)
-	// Initialize the gradient array during learning
-	// Pass the learning rate as an argument. If 0.0, the default value is used.
-	// The epoch of update_weights() gradually changes from eta to eta2 until eta_epoch.
-	// After eta2_epoch, gradually change from eta2 to eta3.
-	void init_grad(double eta1, uint64_t eta_epoch, double eta2, uint64_t eta2_epoch, double eta3);
-
-	// Add the gradient difference value to the gradient array for all features that appear in the current phase.
-	// freeze[0]: Flag that kk does not learn
-	// freeze[1]: Flag that kkp does not learn
-	// freeze[2]: Flag that kpp does not learn
-	// freeze[3]: Flag that kppp does not learn
-	void add_grad(Position& pos, Color rootColor, double delt_grad, const std::array<bool, 4>& freeze);
-
-	// Do SGD or AdaGrad or something based on the current gradient.
-	// epoch: Generation counter (starting from 0)
-	// freeze[0]: Flag that kk does not learn
-	// freeze[1]: Flag that kkp does not learn
-	// freeze[2]: Flag that kpp does not learn
-	// freeze[3]: Flag that kppp does not learn
-	void update_weights(uint64_t epoch, const std::array<bool, 4>& freeze);
-
 	// Save the evaluation function parameters to a file.
 	// You can specify the extension added to the end of the file.
 	void save_eval(std::string suffix);
 
 	// Get the current eta.
 	double get_eta();
-
-	// --learning related commands
-
-	// A function that normalizes KK. Note that it is not completely equivalent to the original evaluation function.
-	// By making the values ​​of kkp and kpp as close to zero as possible, the value of the feature factor (which is zero) that did not appear during learning
-	// The idea of ​​ensuring it is valid.
-	void regularize_kk();
-
-#endif
-
-
 }
 
+#endif // defined(EVAL_LEARN)
+
 #endif // _EVALUATE_KPPT_COMMON_H_

From 2583f689729f7644cb5a5ac6d0369c0c726c3141 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 08:58:10 +0900
Subject: [PATCH 070/398] Removed macros for KPP evaluate functions.

---
 src/eval/evaluate_common.h |  2 +-
 src/learn/learner.cpp      | 16 ----------------
 2 files changed, 1 insertion(+), 17 deletions(-)

diff --git a/src/eval/evaluate_common.h b/src/eval/evaluate_common.h
index 989169b3..7799fe79 100644
--- a/src/eval/evaluate_common.h
+++ b/src/eval/evaluate_common.h
@@ -3,7 +3,7 @@
 
 #if defined(EVAL_LEARN)
 
-// A common header-like function for modern evaluation functions (EVAL_KPPT and EVAL_KPP_KKPT).
+// A common header-like function for modern evaluation functions.
 
 #include <string>
 
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 84cade5c..5d9b242f 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -1647,14 +1647,6 @@ namespace Learner
             else if (option == "freeze_kkp")   is >> freeze[1];
             else if (option == "freeze_kpp")   is >> freeze[2];
 
-#if defined(EVAL_KPPT) || defined(EVAL_KPP_KKPT) || defined(EVAL_KPP_KKPT_FV_VAR) || defined(EVAL_NABLA)
-
-#elif defined(EVAL_KPPPT) || defined(EVAL_KPPP_KKPT) || defined(EVAL_HELICES)
-            else if (option == "freeze_kppp")  is >> freeze[3];
-#elif defined(EVAL_KKPP_KKPT) || defined(EVAL_KKPPT)
-            else if (option == "freeze_kkpp")  is >> freeze[3];
-#endif
-
             // LAMBDA
             else if (option == "lambda")       is >> ELMO_LAMBDA;
             else if (option == "lambda2")      is >> ELMO_LAMBDA2;
@@ -1858,14 +1850,6 @@ namespace Learner
         cout << "eval_save_interval  : " << eval_save_interval << " sfens" << endl;
         cout << "loss_output_interval: " << loss_output_interval << " sfens" << endl;
 
-#if defined(EVAL_KPPT) || defined(EVAL_KPP_KKPT) || defined(EVAL_KPP_KKPT_FV_VAR) || defined(EVAL_NABLA)
-        cout << "freeze_kk/kkp/kpp      : " << freeze[0] << " , " << freeze[1] << " , " << freeze[2] << endl;
-#elif defined(EVAL_KPPPT) || defined(EVAL_KPPP_KKPT) || defined(EVAL_HELICES)
-        cout << "freeze_kk/kkp/kpp/kppp : " << freeze[0] << " , " << freeze[1] << " , " << freeze[2] << " , " << freeze[3] << endl;
-#elif defined(EVAL_KKPP_KKPT) || defined(EVAL_KKPPT)
-        cout << "freeze_kk/kkp/kpp/kkpp : " << freeze[0] << " , " << freeze[1] << " , " << freeze[2] << " , " << freeze[3] << endl;
-#endif
-
         // -----------------------------------
         // various initialization
         // -----------------------------------

From 18648458117a35acb2617e9fe04192acca6ba2ae Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 09:26:42 +0900
Subject: [PATCH 071/398] Commented out unused parameters.

---
 src/nnue/features/castling_right.cpp | 6 +++---
 src/nnue/features/enpassant.cpp      | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/nnue/features/castling_right.cpp b/src/nnue/features/castling_right.cpp
index 47fbd986..86fe06fe 100644
--- a/src/nnue/features/castling_right.cpp
+++ b/src/nnue/features/castling_right.cpp
@@ -26,7 +26,7 @@ namespace Eval {
             & ((castling_rights >> 2) & 3);
         }
 
-        for (int i = 0; i <kDimensions; ++i) {
+        for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
           if (relative_castling_rights & (i << 1)) {
             active->push_back(i);
           }
@@ -36,7 +36,7 @@ namespace Eval {
       // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
       void CastlingRight::AppendChangedIndices(
         const Position& pos, Color perspective,
-        IndexList* removed, IndexList* added) {
+        IndexList* removed, IndexList* /* added */) {
 
         int previous_castling_rights = pos.state()->previous->castlingRights;
         int current_castling_rights = pos.state()->castlingRights;
@@ -54,7 +54,7 @@ namespace Eval {
             & ((current_castling_rights >> 2) & 3);
         }
 
-        for (int i = 0; i < kDimensions; ++i) {
+        for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
           if ((relative_previous_castling_rights & (i << 1)) &&
             (relative_current_castling_rights & (i << 1)) == 0) {
             removed->push_back(i);
diff --git a/src/nnue/features/enpassant.cpp b/src/nnue/features/enpassant.cpp
index 77bc936e..386bd907 100644
--- a/src/nnue/features/enpassant.cpp
+++ b/src/nnue/features/enpassant.cpp
@@ -30,8 +30,8 @@ namespace Eval {
 
       // Get a list of indices whose values ??have changed from the previous one in the feature quantity
       void EnPassant::AppendChangedIndices(
-        const Position& pos, Color perspective,
-        IndexList* removed, IndexList* added) {
+        const Position& /* pos */, Color /* perspective */,
+        IndexList* /* removed */, IndexList* /* added */) {
         // Not implemented.
         assert(false);
       }

From 4206a1edd069600da29b8ee5a99a486b7aa1603f Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 09:46:05 +0900
Subject: [PATCH 072/398] Renamed parameters to avoid shadowing other
 parameters.

---
 src/nnue/nnue_test_command.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/nnue/nnue_test_command.cpp b/src/nnue/nnue_test_command.cpp
index c3a53c7d..5f0776ef 100644
--- a/src/nnue/nnue_test_command.cpp
+++ b/src/nnue/nnue_test_command.cpp
@@ -34,12 +34,12 @@ void TestFeatures(Position& pos) {
   std::vector<std::uint64_t> num_resets(kRefreshTriggers.size());
   constexpr IndexType kUnknown = -1;
   std::vector<IndexType> trigger_map(RawFeatures::kDimensions, kUnknown);
-  auto make_index_sets = [&](const Position& pos) {
+  auto make_index_sets = [&](const Position& position) {
     std::vector<std::vector<std::set<IndexType>>> index_sets(
         kRefreshTriggers.size(), std::vector<std::set<IndexType>>(2));
     for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
       Features::IndexList active_indices[2];
-      RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
+      RawFeatures::AppendActiveIndices(position, kRefreshTriggers[i],
                                        active_indices);
       for (const auto perspective : Colors) {
         for (const auto index : active_indices[perspective]) {
@@ -53,11 +53,11 @@ void TestFeatures(Position& pos) {
     }
     return index_sets;
   };
-  auto update_index_sets = [&](const Position& pos, auto* index_sets) {
+  auto update_index_sets = [&](const Position& position, auto* index_sets) {
     for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
       Features::IndexList removed_indices[2], added_indices[2];
       bool reset[2];
-      RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
+      RawFeatures::AppendChangedIndices(position, kRefreshTriggers[i],
                                         removed_indices, added_indices, reset);
       for (const auto perspective : Colors) {
         if (reset[perspective]) {

From 17d42e023ed13665ed200491a299a177c7954c74 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Tue, 8 Sep 2020 15:10:58 +0200
Subject: [PATCH 073/398] add more CI, instrumented runs

---
 .travis.yml                 |   7 ++
 src/learn/learner.cpp       |   6 +-
 tests/instrumented_learn.sh | 126 ++++++++++++++++++++++++++++++++++++
 3 files changed, 136 insertions(+), 3 deletions(-)
 create mode 100755 tests/instrumented_learn.sh

diff --git a/.travis.yml b/.travis.yml
index 5859f97b..eb3ad741 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -107,3 +107,10 @@ script:
   #
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-undefined; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=thread    optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-thread; fi
+
+  #
+  # NNUE testing / TODO should work with debug=yes as well
+  #
+  - export CXXFLAGS="-O1 -fno-inline"
+  - if [ -x "$(command -v valgrind )" ]; then make clean && make -j2 ARCH=x86-64-modern debug=no optimize=no learn > /dev/null && ../tests/instrumented_learn.sh --valgrind; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=no learn > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined; fi
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 5d9b242f..15f0825d 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -1716,9 +1716,9 @@ namespace Learner
             namespace sys = std::filesystem;
             sys::path p(kif_base_dir); // Origin of enumeration
             std::for_each(sys::directory_iterator(p), sys::directory_iterator(),
-                [&](const sys::path& p) {
-                    if (sys::is_regular_file(p))
-                        filenames.push_back(Path::Combine(target_dir, p.filename().generic_string()));
+                [&](const sys::path& path) {
+                    if (sys::is_regular_file(path))
+                        filenames.push_back(Path::Combine(target_dir, path.filename().generic_string()));
                 });
         }
 
diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
new file mode 100755
index 00000000..756569e6
--- /dev/null
+++ b/tests/instrumented_learn.sh
@@ -0,0 +1,126 @@
+#!/bin/bash
+# check for errors under valgrind or sanitizers.
+
+error()
+{
+  echo "instrumented testing failed on line $1"
+  exit 1
+}
+trap 'error ${LINENO}' ERR
+
+# define suitable post and prefixes for testing options
+case $1 in
+  --valgrind)
+    echo "valgrind testing started"
+    prefix=''
+    exeprefix='valgrind --error-exitcode=42'
+    postfix='1>/dev/null'
+    threads="1"
+  ;;
+  --valgrind-thread)
+    echo "valgrind-thread testing started"
+    prefix=''
+    exeprefix='valgrind --error-exitcode=42'
+    postfix='1>/dev/null'
+    threads="2"
+  ;;
+  --sanitizer-undefined)
+    echo "sanitizer-undefined testing started"
+    prefix='!'
+    exeprefix=''
+    postfix='2>&1 | grep -A50 "runtime error:"'
+    threads="1"
+  ;;
+  --sanitizer-thread)
+    echo "sanitizer-thread testing started"
+    prefix='!'
+    exeprefix=''
+    postfix='2>&1 | grep -A50 "WARNING: ThreadSanitizer:"'
+    threads="2"
+
+cat << EOF > tsan.supp
+race:TTEntry::move
+race:TTEntry::depth
+race:TTEntry::bound
+race:TTEntry::save
+race:TTEntry::value
+race:TTEntry::eval
+race:TTEntry::is_pv
+
+race:TranspositionTable::probe
+race:TranspositionTable::hashfull
+
+EOF
+
+    export TSAN_OPTIONS="suppressions=./tsan.supp"
+
+  ;;
+  *)
+    echo "unknown testing started"
+    prefix=''
+    exeprefix=''
+    postfix=''
+    threads="1"
+  ;;
+esac
+
+mkdir -p training_data_01
+mkdir -p training_data_02
+
+# gensfen testing 01
+cat << EOF > gensfen01.exp
+ set timeout 240
+ spawn $exeprefix ./stockfish
+
+ send "uci\n"
+ expect "uciok"
+
+ send "setoption name Threads value $threads\n"
+ send "setoption name Use NNUE value false\n"
+ send "isready\n"
+ send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data_01/training_data.bin use_raw_nnue_eval 0\n"
+ expect "gensfen finished."
+
+ send "quit\n"
+ expect eof
+
+ # return error code of the spawned program, useful for valgrind
+ lassign [wait] pid spawnid os_error_flag value
+ exit \$value
+EOF
+
+# gensfen testing 02
+cat << EOF > gensfen02.exp
+ set timeout 240
+ spawn $exeprefix ./stockfish
+
+ send "uci\n"
+ expect "uciok"
+
+ send "setoption name Threads value $threads\n"
+ send "setoption name Use NNUE value true\n"
+ send "isready\n"
+ send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data_02/training_data.bin use_raw_nnue_eval 0\n"
+ expect "gensfen finished."
+
+ send "quit\n"
+ expect eof
+
+ # return error code of the spawned program, useful for valgrind
+ lassign [wait] pid spawnid os_error_flag value
+ exit \$value
+EOF
+
+for exp in gensfen01.exp gensfen02.exp
+do
+
+  echo "$prefix expect $exp $postfix"
+  eval "$prefix expect $exp $postfix"
+
+  rm $exp
+
+done
+
+rm -f tsan.supp
+
+echo "instrumented learn testing OK"

From 8fcf8b97f1806313fd01d383cb1ffdfd2dcc4e47 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Wed, 9 Sep 2020 09:22:48 +0200
Subject: [PATCH 074/398] Add -lstdc++fs

---
 .travis.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index eb3ad741..438bf4d0 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -112,5 +112,5 @@ script:
   # NNUE testing / TODO should work with debug=yes as well
   #
   - export CXXFLAGS="-O1 -fno-inline"
-  - if [ -x "$(command -v valgrind )" ]; then make clean && make -j2 ARCH=x86-64-modern debug=no optimize=no learn > /dev/null && ../tests/instrumented_learn.sh --valgrind; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=no learn > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined; fi
+  - if [ -x "$(command -v valgrind )" ]; then make clean && LDFLAGS="-lstdc++fs"  make -j2 ARCH=x86-64-modern debug=no optimize=no learn > /dev/null && ../tests/instrumented_learn.sh --valgrind; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && LDFLAGS="-lstdc++fs"  make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=no learn > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined; fi

From 158399da4b368c2118e0d418f09f6dd142608760 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 20:16:09 +0900
Subject: [PATCH 075/398] Remove compile warnings.

---
 .travis.yml                            |  3 +--
 src/learn/gensfen.cpp                  |  5 ++---
 src/nnue/evaluate_nnue_learner.cpp     | 14 ++++++++++++--
 src/nnue/trainer/trainer.h             |  4 ++--
 src/nnue/trainer/trainer_input_slice.h |  2 +-
 5 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 438bf4d0..503d678a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -77,8 +77,7 @@ script:
   - if [[ "$TRAVIS_OS_NAME" != "linux" || "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi
 
   # start some basic learner CI
-  #TODO enable -Werror
-  - export CXXFLAGS=""
+  - export CXXFLAGS="-Werror"
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && LDFLAGS="-lstdc++fs" make -j2 ARCH=x86-64-modern learn; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && LDFLAGS="-lstdc++fs" make -j2 ARCH=x86-64-modern profile-learn; fi
 
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 3d015acf..84feabb0 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -878,8 +878,7 @@ namespace Learner
                     next_move = search_pv[0];
                 }
 
-            RANDOM_MOVE:;
-
+                // Random move.
                 auto random_move = choose_random_move(pos, random_move_flag, ply, actual_random_move_count);
                 if (random_move.has_value())
                 {
@@ -897,7 +896,7 @@ namespace Learner
                     a_psv.clear();
                 }
 
-            DO_MOVE:;
+                // Do move.
                 pos.do_move(next_move, states[ply]);
 
                 // Call node evaluate() for each difference calculation.
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 7be06832..8b0413e5 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -113,8 +113,13 @@ void SetOptions(const std::string& options) {
 void RestoreParameters(const std::string& dir_name) {
   const std::string file_name = Path::Combine(dir_name, NNUE::savedfileName);
   std::ifstream stream(file_name, std::ios::binary);
-  bool result = ReadParameters(stream);
+#ifndef NDEBUG
+  bool result =
+#endif
+  ReadParameters(stream);
+#ifndef NDEBUG
   assert(result);
+#endif
 
   SendMessages({{"reset"}});
 }
@@ -216,8 +221,13 @@ void save_eval(std::string dir_name) {
 
   const std::string file_name = Path::Combine(eval_dir, NNUE::savedfileName);
   std::ofstream stream(file_name, std::ios::binary);
-  const bool result = NNUE::WriteParameters(stream);
+#ifndef NDEBUG
+  const bool result =
+#endif
+  NNUE::WriteParameters(stream);
+#ifndef NDEBUG
   assert(result);
+#endif
 
   std::cout << "save_eval() finished. folder = " << eval_dir << std::endl;
 }
diff --git a/src/nnue/trainer/trainer.h b/src/nnue/trainer/trainer.h
index d526557a..94553c07 100644
--- a/src/nnue/trainer/trainer.h
+++ b/src/nnue/trainer/trainer.h
@@ -70,8 +70,8 @@ struct Example {
 
 // Message used for setting hyperparameters
 struct Message {
-  Message(const std::string& name, const std::string& value = ""):
-      name(name), value(value), num_peekers(0), num_receivers(0) {}
+  Message(const std::string& message_name, const std::string& message_value = ""):
+      name(message_name), value(message_value), num_peekers(0), num_receivers(0) {}
   const std::string name;
   const std::string value;
   std::uint32_t num_peekers;
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index b6d6635b..6b0adc9f 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -206,7 +206,7 @@ class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
       const IndexType input_offset = kInputDimensions * b;
       const IndexType output_offset = kOutputDimensions * b;
       for (IndexType i = 0; i < kInputDimensions; ++i) {
-        if (i < Offset || i >= Offset + kOutputDimensions) {
+        if ((int)i < (int)Offset || i >= Offset + kOutputDimensions) {
           gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
         } else {
           gradients_[input_offset + i] = gradients[output_offset + i - Offset];

From d993bd36d0a984b47b7f2f0e14a91bbcec5f948e Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 21:21:10 +0900
Subject: [PATCH 076/398] Removed compile warnings.

---
 src/learn/learning_tools.h | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/learn/learning_tools.h b/src/learn/learning_tools.h
index 348105b6..1f9bdf96 100644
--- a/src/learn/learning_tools.h
+++ b/src/learn/learning_tools.h
@@ -40,13 +40,14 @@ namespace EvalLearningTools
 		static uint64_t eta2_epoch;
 
 		// Batch initialization of eta. If 0 is passed, the default value will be set.
-		static void init_eta(double eta1, double eta2, double eta3, uint64_t eta1_epoch, uint64_t eta2_epoch)
+		static void init_eta(double new_eta1, double new_eta2, double new_eta3,
+			uint64_t new_eta1_epoch, uint64_t new_eta2_epoch)
 		{
-			Weight::eta1 = (eta1 != 0) ? eta1 : 30.0;
-			Weight::eta2 = (eta2 != 0) ? eta2 : 30.0;
-			Weight::eta3 = (eta3 != 0) ? eta3 : 30.0;
-			Weight::eta1_epoch = (eta1_epoch != 0) ? eta1_epoch : 0;
-			Weight::eta2_epoch = (eta2_epoch != 0) ? eta2_epoch : 0;
+			Weight::eta1 = (new_eta1 != 0) ? new_eta1 : 30.0;
+			Weight::eta2 = (new_eta2 != 0) ? new_eta2 : 30.0;
+			Weight::eta3 = (new_eta3 != 0) ? new_eta3 : 30.0;
+			Weight::eta1_epoch = (new_eta1_epoch != 0) ? new_eta1_epoch : 0;
+			Weight::eta2_epoch = (new_eta2_epoch != 0) ? new_eta2_epoch : 0;
 		}
 
 		// Set eta according to epoch.

From 7bd4688747c37764778853f0d0ff1977bd7e663d Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 20:16:09 +0900
Subject: [PATCH 077/398] Remove compile warnings.

---
 .travis.yml                            |  3 +--
 src/learn/gensfen.cpp                  |  5 ++---
 src/nnue/evaluate_nnue_learner.cpp     | 14 ++++++++++++--
 src/nnue/trainer/trainer.h             |  4 ++--
 src/nnue/trainer/trainer_input_slice.h |  2 +-
 5 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 438bf4d0..503d678a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -77,8 +77,7 @@ script:
   - if [[ "$TRAVIS_OS_NAME" != "linux" || "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi
 
   # start some basic learner CI
-  #TODO enable -Werror
-  - export CXXFLAGS=""
+  - export CXXFLAGS="-Werror"
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && LDFLAGS="-lstdc++fs" make -j2 ARCH=x86-64-modern learn; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && LDFLAGS="-lstdc++fs" make -j2 ARCH=x86-64-modern profile-learn; fi
 
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 3d015acf..84feabb0 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -878,8 +878,7 @@ namespace Learner
                     next_move = search_pv[0];
                 }
 
-            RANDOM_MOVE:;
-
+                // Random move.
                 auto random_move = choose_random_move(pos, random_move_flag, ply, actual_random_move_count);
                 if (random_move.has_value())
                 {
@@ -897,7 +896,7 @@ namespace Learner
                     a_psv.clear();
                 }
 
-            DO_MOVE:;
+                // Do move.
                 pos.do_move(next_move, states[ply]);
 
                 // Call node evaluate() for each difference calculation.
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 7be06832..8b0413e5 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -113,8 +113,13 @@ void SetOptions(const std::string& options) {
 void RestoreParameters(const std::string& dir_name) {
   const std::string file_name = Path::Combine(dir_name, NNUE::savedfileName);
   std::ifstream stream(file_name, std::ios::binary);
-  bool result = ReadParameters(stream);
+#ifndef NDEBUG
+  bool result =
+#endif
+  ReadParameters(stream);
+#ifndef NDEBUG
   assert(result);
+#endif
 
   SendMessages({{"reset"}});
 }
@@ -216,8 +221,13 @@ void save_eval(std::string dir_name) {
 
   const std::string file_name = Path::Combine(eval_dir, NNUE::savedfileName);
   std::ofstream stream(file_name, std::ios::binary);
-  const bool result = NNUE::WriteParameters(stream);
+#ifndef NDEBUG
+  const bool result =
+#endif
+  NNUE::WriteParameters(stream);
+#ifndef NDEBUG
   assert(result);
+#endif
 
   std::cout << "save_eval() finished. folder = " << eval_dir << std::endl;
 }
diff --git a/src/nnue/trainer/trainer.h b/src/nnue/trainer/trainer.h
index d526557a..94553c07 100644
--- a/src/nnue/trainer/trainer.h
+++ b/src/nnue/trainer/trainer.h
@@ -70,8 +70,8 @@ struct Example {
 
 // Message used for setting hyperparameters
 struct Message {
-  Message(const std::string& name, const std::string& value = ""):
-      name(name), value(value), num_peekers(0), num_receivers(0) {}
+  Message(const std::string& message_name, const std::string& message_value = ""):
+      name(message_name), value(message_value), num_peekers(0), num_receivers(0) {}
   const std::string name;
   const std::string value;
   std::uint32_t num_peekers;
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index b6d6635b..6b0adc9f 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -206,7 +206,7 @@ class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
       const IndexType input_offset = kInputDimensions * b;
       const IndexType output_offset = kOutputDimensions * b;
       for (IndexType i = 0; i < kInputDimensions; ++i) {
-        if (i < Offset || i >= Offset + kOutputDimensions) {
+        if ((int)i < (int)Offset || i >= Offset + kOutputDimensions) {
           gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
         } else {
           gradients_[input_offset + i] = gradients[output_offset + i - Offset];

From 9dcadfa642524553dbba9ea7d89516fc87ccb583 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 21:21:10 +0900
Subject: [PATCH 078/398] Removed compile warnings.

---
 src/learn/learning_tools.h | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/learn/learning_tools.h b/src/learn/learning_tools.h
index 348105b6..1f9bdf96 100644
--- a/src/learn/learning_tools.h
+++ b/src/learn/learning_tools.h
@@ -40,13 +40,14 @@ namespace EvalLearningTools
 		static uint64_t eta2_epoch;
 
 		// Batch initialization of eta. If 0 is passed, the default value will be set.
-		static void init_eta(double eta1, double eta2, double eta3, uint64_t eta1_epoch, uint64_t eta2_epoch)
+		static void init_eta(double new_eta1, double new_eta2, double new_eta3,
+			uint64_t new_eta1_epoch, uint64_t new_eta2_epoch)
 		{
-			Weight::eta1 = (eta1 != 0) ? eta1 : 30.0;
-			Weight::eta2 = (eta2 != 0) ? eta2 : 30.0;
-			Weight::eta3 = (eta3 != 0) ? eta3 : 30.0;
-			Weight::eta1_epoch = (eta1_epoch != 0) ? eta1_epoch : 0;
-			Weight::eta2_epoch = (eta2_epoch != 0) ? eta2_epoch : 0;
+			Weight::eta1 = (new_eta1 != 0) ? new_eta1 : 30.0;
+			Weight::eta2 = (new_eta2 != 0) ? new_eta2 : 30.0;
+			Weight::eta3 = (new_eta3 != 0) ? new_eta3 : 30.0;
+			Weight::eta1_epoch = (new_eta1_epoch != 0) ? new_eta1_epoch : 0;
+			Weight::eta2_epoch = (new_eta2_epoch != 0) ? new_eta2_epoch : 0;
 		}
 
 		// Set eta according to epoch.

From 005009f4e531561618d44780025ccf638532912c Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 23:38:00 +0900
Subject: [PATCH 079/398] Changed a option name more descriptive, "Training" ->
 "PruneAtShallowDepthOnPvNode".  The default value was changed but the default
 behavior is not changed. Changed to set a global option
 prune_at_shallow_depth_on_pv_node on a callback function.

---
 src/search.cpp    | 12 +++++++-----
 src/search.h      |  4 ++++
 src/ucioption.cpp |  8 +++++++-
 3 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 67348a2b..6fbfdedf 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -54,6 +54,10 @@ using std::string;
 using Eval::evaluate;
 using namespace Search;
 
+#if defined(EVAL_LEARN)
+bool Search::prune_at_shallow_depth_on_pv_node = false;
+#endif
+
 namespace {
 
   // Different node types, used as a template parameter
@@ -68,8 +72,6 @@ namespace {
     return Value(223 * (d - improving));
   }
 
-  bool training;
-
   // Reductions lookup table, initialized at startup
   int Reductions[MAX_MOVES]; // [depth or moveNumber]
 
@@ -195,8 +197,6 @@ void Search::init() {
 
   for (int i = 1; i < MAX_MOVES; ++i)
       Reductions[i] = int((22.0 + std::log(Threads.size())) * std::log(i));
-
-  training = Options["Training"];
 }
 
 
@@ -1011,7 +1011,9 @@ moves_loop: // When in check, search starts from here
 
       // Step 12. Pruning at shallow depth (~200 Elo)
       if (  !rootNode
-          && !(training && PvNode)
+#ifdef EVAL_LEARN
+          && !(!prune_at_shallow_depth_on_pv_node && PvNode)
+#endif
           && pos.non_pawn_material(us)
           && bestValue > VALUE_TB_LOSS_IN_MAX_PLY)
       {
diff --git a/src/search.h b/src/search.h
index 01d8a4c1..9d5ce279 100644
--- a/src/search.h
+++ b/src/search.h
@@ -33,6 +33,10 @@ namespace Search {
 constexpr int CounterMovePruneThreshold = 0;
 
 
+#if defined(EVAL_LEARN)
+extern bool prune_at_shallow_depth_on_pv_node;
+#endif
+
 /// Stack struct keeps track of the information we need to remember from nodes
 /// shallower and deeper in the tree during the search. Each search thread has
 /// its own array of Stack objects, indexed by the current ply.
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 4f9fab5e..0e561416 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -42,6 +42,11 @@ void on_threads(const Option& o) { Threads.set(size_t(o)); }
 void on_tb_path(const Option& o) { Tablebases::init(o); }
 void on_use_NNUE(const Option& ) { Eval::init_NNUE(); }
 void on_eval_file(const Option& ) { Eval::init_NNUE(); }
+#ifdef EVAL_LEARN
+void on_prune_at_shallow_depth_on_pv_node(const Option& o) {
+  Search::prune_at_shallow_depth_on_pv_node = o;
+}
+#endif
 
 /// Our case insensitive less() function as required by UCI protocol
 bool CaseInsensitiveLess::operator() (const string& s1, const string& s2) const {
@@ -69,7 +74,6 @@ void init(OptionsMap& o) {
   o["Move Overhead"]         << Option(10, 0, 5000);
   o["Slow Mover"]            << Option(100, 10, 1000);
   o["nodestime"]             << Option(0, 0, 10000);
-  o["Training"]              << Option(false);
   o["UCI_Chess960"]          << Option(false);
   o["UCI_AnalyseMode"]       << Option(false);
   o["UCI_LimitStrength"]     << Option(false);
@@ -96,6 +100,8 @@ void init(OptionsMap& o) {
   // Evalsave by default. This folder shall be prepared in advance.
   // Automatically create a folder under this folder like "0/", "1/", ... and save the evaluation function file there.
   o["EvalSaveDir"] << Option("evalsave");
+  // Prune at shallow depth on PV nodes. Setting this value to true gains elo in shallow search.
+  o["PruneAtShallowDepthOnPvNode"] << Option(false, on_prune_at_shallow_depth_on_pv_node);
 #endif
 }
 

From e0a98607085655167cc01aed50db83976dbb3ec5 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Wed, 9 Sep 2020 19:08:56 +0200
Subject: [PATCH 080/398] Upgrade CI distro, remove special cases, fix one more
 warning

---
 .travis.yml                    | 35 ++++++++++++++++------------------
 src/nnue/features/index_list.h |  2 +-
 2 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 503d678a..608d22c1 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,5 +1,5 @@
 language: cpp
-dist: bionic
+dist: focal
 
 matrix:
   include:
@@ -7,9 +7,9 @@ matrix:
       compiler: gcc
       addons:
         apt:
-          packages: ['g++-8', 'g++-8-multilib', 'g++-multilib', 'valgrind', 'expect', 'curl', 'libopenblas-dev']
+          packages: ['g++-multilib', 'valgrind', 'expect', 'curl', 'libopenblas-dev']
       env:
-        - COMPILER=g++-8
+        - COMPILER=g++
         - COMP=gcc
 
 #    - os: linux
@@ -68,18 +68,17 @@ script:
   # TODO avoid _mm_malloc
   # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-64 build && ../tests/signature.sh $benchref; fi
   # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 optimize=no debug=yes build && ../tests/signature.sh $benchref; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-sse41-popcnt build && ../tests/signature.sh $benchref; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-sse2 build && ../tests/signature.sh $benchref; fi
+  - make clean && make -j2 ARCH=x86-32-sse41-popcnt build && ../tests/signature.sh $benchref
+  - make clean && make -j2 ARCH=x86-32-sse2 build && ../tests/signature.sh $benchref
   # TODO avoid _mm_malloc
   # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 build && ../tests/signature.sh $benchref; fi
   # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-32 build && ../tests/signature.sh $benchref; fi
-  # workaround: exclude a custom version of llvm+clang, which doesn't find llvm-profdata on ubuntu
-  - if [[ "$TRAVIS_OS_NAME" != "linux" || "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi
+  - make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref
 
   # start some basic learner CI
-  - export CXXFLAGS="-Werror"
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && LDFLAGS="-lstdc++fs" make -j2 ARCH=x86-64-modern learn; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && LDFLAGS="-lstdc++fs" make -j2 ARCH=x86-64-modern profile-learn; fi
+  - make clean && make -j2 ARCH=x86-64-modern learn
+  - make clean && make -j2 ARCH=x86-64-modern profile-learn
+  - make clean && make -j2 ARCH=x86-64-modern debug=yes optimize=no learn
 
   # compile only for some more advanced architectures (might not run in travis)
   - make clean && make -j2 ARCH=x86-64-avx2 build
@@ -98,18 +97,16 @@ script:
   # Valgrind
   #
   - export CXXFLAGS="-O1 -fno-inline"
-  - if [ -x "$(command -v valgrind )" ]; then make clean && make -j2 ARCH=x86-64-modern debug=yes optimize=no build > /dev/null && ../tests/instrumented.sh --valgrind; fi
-  - if [ -x "$(command -v valgrind )" ]; then ../tests/instrumented.sh --valgrind-thread; fi
+  - make clean && make -j2 ARCH=x86-64-modern debug=yes optimize=no build > /dev/null && ../tests/instrumented.sh --valgrind
+  - ../tests/instrumented.sh --valgrind-thread
 
   #
   # Sanitizer
   #
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-undefined; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=thread    optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-thread; fi
+  - make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-undefined
+  - make clean && make -j2 ARCH=x86-64-modern sanitize=thread    optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-thread
 
-  #
-  # NNUE testing / TODO should work with debug=yes as well
-  #
+  # NNUE testing
   - export CXXFLAGS="-O1 -fno-inline"
-  - if [ -x "$(command -v valgrind )" ]; then make clean && LDFLAGS="-lstdc++fs"  make -j2 ARCH=x86-64-modern debug=no optimize=no learn > /dev/null && ../tests/instrumented_learn.sh --valgrind; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && LDFLAGS="-lstdc++fs"  make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=no learn > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined; fi
+  - make clean && make -j2 ARCH=x86-64-modern debug=no optimize=no learn > /dev/null && ../tests/instrumented_learn.sh --valgrind
+  - make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=no learn > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined
diff --git a/src/nnue/features/index_list.h b/src/nnue/features/index_list.h
index d9ad680a..dd055fb3 100644
--- a/src/nnue/features/index_list.h
+++ b/src/nnue/features/index_list.h
@@ -50,7 +50,7 @@ namespace Eval::NNUE::Features {
     }
 
    private:
-    T values_[MaxSize];
+    T values_[MaxSize] = {};
     std::size_t size_ = 0;
   };
 

From 69563aeed9726af36b6543be3572fbd825698f31 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 20:16:09 +0900
Subject: [PATCH 081/398] Remove compile warnings.


From 073d43738442657c30f1ddedd411b01a782f9d1b Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 21:21:10 +0900
Subject: [PATCH 082/398] Removed compile warnings.


From e63b6088ba8066844fdf47a5843355196e0e2ad1 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 23:38:00 +0900
Subject: [PATCH 083/398] Changed a option name more descriptive, "Training" ->
 "PruneAtShallowDepthOnPvNode".  The default value was changed but the default
 behavior is not changed. Changed to set a global option
 prune_at_shallow_depth_on_pv_node on a callback function.

---
 src/search.cpp    | 12 +++++++-----
 src/search.h      |  4 ++++
 src/ucioption.cpp |  8 +++++++-
 3 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 67348a2b..6fbfdedf 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -54,6 +54,10 @@ using std::string;
 using Eval::evaluate;
 using namespace Search;
 
+#if defined(EVAL_LEARN)
+bool Search::prune_at_shallow_depth_on_pv_node = false;
+#endif
+
 namespace {
 
   // Different node types, used as a template parameter
@@ -68,8 +72,6 @@ namespace {
     return Value(223 * (d - improving));
   }
 
-  bool training;
-
   // Reductions lookup table, initialized at startup
   int Reductions[MAX_MOVES]; // [depth or moveNumber]
 
@@ -195,8 +197,6 @@ void Search::init() {
 
   for (int i = 1; i < MAX_MOVES; ++i)
       Reductions[i] = int((22.0 + std::log(Threads.size())) * std::log(i));
-
-  training = Options["Training"];
 }
 
 
@@ -1011,7 +1011,9 @@ moves_loop: // When in check, search starts from here
 
       // Step 12. Pruning at shallow depth (~200 Elo)
       if (  !rootNode
-          && !(training && PvNode)
+#ifdef EVAL_LEARN
+          && !(!prune_at_shallow_depth_on_pv_node && PvNode)
+#endif
           && pos.non_pawn_material(us)
           && bestValue > VALUE_TB_LOSS_IN_MAX_PLY)
       {
diff --git a/src/search.h b/src/search.h
index 01d8a4c1..9d5ce279 100644
--- a/src/search.h
+++ b/src/search.h
@@ -33,6 +33,10 @@ namespace Search {
 constexpr int CounterMovePruneThreshold = 0;
 
 
+#if defined(EVAL_LEARN)
+extern bool prune_at_shallow_depth_on_pv_node;
+#endif
+
 /// Stack struct keeps track of the information we need to remember from nodes
 /// shallower and deeper in the tree during the search. Each search thread has
 /// its own array of Stack objects, indexed by the current ply.
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 4f9fab5e..0e561416 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -42,6 +42,11 @@ void on_threads(const Option& o) { Threads.set(size_t(o)); }
 void on_tb_path(const Option& o) { Tablebases::init(o); }
 void on_use_NNUE(const Option& ) { Eval::init_NNUE(); }
 void on_eval_file(const Option& ) { Eval::init_NNUE(); }
+#ifdef EVAL_LEARN
+void on_prune_at_shallow_depth_on_pv_node(const Option& o) {
+  Search::prune_at_shallow_depth_on_pv_node = o;
+}
+#endif
 
 /// Our case insensitive less() function as required by UCI protocol
 bool CaseInsensitiveLess::operator() (const string& s1, const string& s2) const {
@@ -69,7 +74,6 @@ void init(OptionsMap& o) {
   o["Move Overhead"]         << Option(10, 0, 5000);
   o["Slow Mover"]            << Option(100, 10, 1000);
   o["nodestime"]             << Option(0, 0, 10000);
-  o["Training"]              << Option(false);
   o["UCI_Chess960"]          << Option(false);
   o["UCI_AnalyseMode"]       << Option(false);
   o["UCI_LimitStrength"]     << Option(false);
@@ -96,6 +100,8 @@ void init(OptionsMap& o) {
   // Evalsave by default. This folder shall be prepared in advance.
   // Automatically create a folder under this folder like "0/", "1/", ... and save the evaluation function file there.
   o["EvalSaveDir"] << Option("evalsave");
+  // Prune at shallow depth on PV nodes. Setting this value to true gains elo in shallow search.
+  o["PruneAtShallowDepthOnPvNode"] << Option(false, on_prune_at_shallow_depth_on_pv_node);
 #endif
 }
 

From 94f3cae760f0ed6ab464cf8febd79ebe9925b53a Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Thu, 10 Sep 2020 08:23:21 +0900
Subject: [PATCH 084/398] Changed a sentence.

---
 src/search.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/search.cpp b/src/search.cpp
index 6fbfdedf..b92ea7c8 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1012,7 +1012,7 @@ moves_loop: // When in check, search starts from here
       // Step 12. Pruning at shallow depth (~200 Elo)
       if (  !rootNode
 #ifdef EVAL_LEARN
-          && !(!prune_at_shallow_depth_on_pv_node && PvNode)
+          && (PvNode ? prune_at_shallow_depth_on_pv_node : true)
 #endif
           && pos.non_pawn_material(us)
           && bestValue > VALUE_TB_LOSS_IN_MAX_PLY)

From 020e66d2e63acdbd5449de5f39e99c7e2bcb2551 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 9 Sep 2020 22:36:40 +0200
Subject: [PATCH 085/398] Add "sfen_format" option in gensfen. Valid values are
 "bin" and "binpack". It determines the output format of the sfens. Binpack is
 a highly compressed formats for consecutive sfens. Extension is now
 determined by the used format, output_file_name should contain just the stem.

---
 src/extra/nnue_data_binpack_format.h | 7469 ++++++++++++++++++++++++++
 src/learn/gensfen.cpp                |  129 +-
 2 files changed, 7587 insertions(+), 11 deletions(-)
 create mode 100644 src/extra/nnue_data_binpack_format.h

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
new file mode 100644
index 00000000..9f810a3b
--- /dev/null
+++ b/src/extra/nnue_data_binpack_format.h
@@ -0,0 +1,7469 @@
+#pragma once
+
+#include <cstdio>
+#include <cassert>
+#include <string>
+#include <string_view>
+#include <vector>
+#include <memory>
+#include <fstream>
+#include <cstring>
+#include <iostream>
+#include <set>
+#include <cstdio>
+#include <cassert>
+#include <array>
+#include <immintrin.h>
+#include <intrin.h>
+#include <nmmintrin.h>
+#include <limits>
+
+
+namespace chess
+{
+    #if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__)
+
+    #define FORCEINLINE __attribute__((always_inline))
+
+    #elif defined(_MSC_VER)
+
+    // NOTE: for some reason it breaks the profiler a little
+    //       keep it on only when not profiling.
+    //#define FORCEINLINE __forceinline
+    #define FORCEINLINE
+
+    #else
+
+    #define FORCEINLINE inline
+
+    #endif
+
+    #if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__)
+
+    #define NOINLINE __attribute__((noinline))
+
+    #elif defined(_MSC_VER)
+
+    #define NOINLINE __declspec(noinline)
+
+    #else
+
+    #define NOINLINE
+
+    #endif
+
+    namespace intrin
+    {
+        [[nodiscard]] constexpr int popcount_constexpr(std::uint64_t value)
+        {
+            int r = 0;
+            while (value)
+            {
+                value &= value - 1;
+                ++r;
+            }
+            return r;
+        }
+
+        [[nodiscard]] constexpr int lsb_constexpr(std::uint64_t value)
+        {
+            int c = 0;
+            value &= ~value + 1; // leave only the lsb
+            if ((value & 0x00000000FFFFFFFFull) == 0) c += 32;
+            if ((value & 0x0000FFFF0000FFFFull) == 0) c += 16;
+            if ((value & 0x00FF00FF00FF00FFull) == 0) c += 8;
+            if ((value & 0x0F0F0F0F0F0F0F0Full) == 0) c += 4;
+            if ((value & 0x3333333333333333ull) == 0) c += 2;
+            if ((value & 0x5555555555555555ull) == 0) c += 1;
+            return c;
+        }
+
+        [[nodiscard]] constexpr int msb_constexpr(std::uint64_t value)
+        {
+            int c = 63;
+            if ((value & 0xFFFFFFFF00000000ull) == 0) { c -= 32; value <<= 32; }
+            if ((value & 0xFFFF000000000000ull) == 0) { c -= 16; value <<= 16; }
+            if ((value & 0xFF00000000000000ull) == 0) { c -= 8; value <<= 8; }
+            if ((value & 0xF000000000000000ull) == 0) { c -= 4; value <<= 4; }
+            if ((value & 0xC000000000000000ull) == 0) { c -= 2; value <<= 2; }
+            if ((value & 0x8000000000000000ull) == 0) { c -= 1; }
+            return c;
+        }
+    }
+
+    namespace intrin
+    {
+        [[nodiscard]] inline int popcount(std::uint64_t b)
+        {
+    #if (defined(_MSC_VER) || defined(__INTEL_COMPILER)) && !defined(__clang__)
+
+            return static_cast<int>(_mm_popcnt_u64(b));
+
+    #else
+
+            return static_cast<int>(__builtin_popcountll(b));
+
+    #endif
+        }
+
+    #if defined(_MSC_VER) && !defined(__clang__)
+
+        [[nodiscard]] inline int lsb(std::uint64_t value)
+        {
+            assert(value != 0);
+
+            unsigned long idx;
+            _BitScanForward64(&idx, value);
+            return static_cast<int>(idx);
+        }
+
+        [[nodiscard]] inline int msb(std::uint64_t value)
+        {
+            assert(value != 0);
+
+            unsigned long idx;
+            _BitScanReverse64(&idx, value);
+            return static_cast<int>(idx);
+        }
+
+    #else
+
+        [[nodiscard]] inline int lsb(std::uint64_t value)
+        {
+            assert(value != 0);
+
+            return __builtin_ctzll(value);
+        }
+
+        [[nodiscard]] inline int msb(std::uint64_t value)
+        {
+            assert(value != 0);
+
+            return 63 ^ __builtin_clzll(value);
+        }
+
+    #endif
+    }
+
+
+    template <typename IntT>
+    [[nodiscard]] constexpr IntT mulSaturate(IntT lhs, IntT rhs)
+    {
+        static_assert(std::is_unsigned_v<IntT>); // currently no support for signed
+
+    #if defined (_MSC_VER)
+
+        if (lhs == 0) return 0;
+
+        const IntT result = lhs * rhs;
+        return result / lhs == rhs ? result : std::numeric_limits<IntT>::max();
+
+    #elif defined (__GNUC__)
+
+        IntT result{};
+        return __builtin_mul_overflow(lhs, rhs, &result) ? std::numeric_limits<IntT>::max() : result;
+
+    #endif
+    }
+
+    template <typename IntT>
+    [[nodiscard]] constexpr IntT addSaturate(IntT lhs, IntT rhs)
+    {
+        static_assert(std::is_unsigned_v<IntT>); // currently no support for signed
+
+    #if defined (_MSC_VER)
+
+        const IntT result = lhs + rhs;
+        return result >= lhs ? result : std::numeric_limits<IntT>::max();
+
+    #elif defined (__GNUC__)
+
+        IntT result{};
+        return __builtin_add_overflow(lhs, rhs, &result) ? std::numeric_limits<IntT>::max() : result;
+
+    #endif
+    }
+
+    template <typename IntT>
+    [[nodiscard]] constexpr bool addOverflows(IntT lhs, IntT rhs)
+    {
+    #if defined (_MSC_VER)
+
+        return static_cast<IntT>(lhs + rhs) < lhs;
+
+    #elif defined (__GNUC__)
+
+        IntT result{};
+        __builtin_add_overflow(lhs, rhs, &result);
+        return result;
+
+    #endif
+    }
+
+    template <typename IntT>
+    [[nodiscard]] constexpr IntT floorLog2(IntT value)
+    {
+        return intrin::msb_constexpr(value);
+    }
+
+    template <typename IntT>
+    constexpr std::size_t maxFibonacciNumberIndexForType()
+    {
+        static_assert(std::is_unsigned_v<IntT>);
+
+        switch (sizeof(IntT))
+        {
+        case 8:
+            return 93;
+        case 4:
+            return 47;
+        case 2:
+            return 24;
+        case 1:
+            return 13;
+        }
+
+        return 0;
+    }
+
+    template <typename IntT>
+    constexpr auto computeMasks()
+    {
+        static_assert(std::is_unsigned_v<IntT>);
+
+        constexpr std::size_t numBits = sizeof(IntT) * CHAR_BIT;
+        std::array<IntT, numBits + 1u> nbitmasks{};
+
+        for (std::size_t i = 0; i < numBits; ++i)
+        {
+            nbitmasks[i] = (static_cast<IntT>(1u) << i) - 1u;
+        }
+        nbitmasks[numBits] = ~static_cast<IntT>(0u);
+
+        return nbitmasks;
+    }
+
+    template <typename IntT>
+    constexpr auto nbitmask = computeMasks<IntT>();
+
+    template <typename IntT>
+    constexpr auto computeFibonacciNumbers()
+    {
+        constexpr std::size_t size = maxFibonacciNumberIndexForType<IntT>() + 1;
+        std::array<IntT, size> numbers{};
+        numbers[0] = 0;
+        numbers[1] = 1;
+
+        for (std::size_t i = 2; i < size; ++i)
+        {
+            numbers[i] = numbers[i - 1] + numbers[i - 2];
+        }
+
+        return numbers;
+    }
+
+    // F(0) = 0, F(1) = 1
+    template <typename IntT>
+    constexpr auto fibonacciNumbers = computeFibonacciNumbers<IntT>();
+
+    template <std::size_t N, typename FromT, typename ToT = std::make_signed_t<FromT>>
+    inline ToT signExtend(FromT value)
+    {
+        static_assert(std::is_signed_v<ToT>);
+        static_assert(std::is_unsigned_v<FromT>);
+        static_assert(sizeof(ToT) == sizeof(FromT));
+
+        constexpr std::size_t totalBits = sizeof(FromT) * CHAR_BIT;
+
+        static_assert(N > 0 && N <= totalBits);
+
+        constexpr std::size_t unusedBits = totalBits - N;
+        if constexpr (ToT(~FromT(0)) >> 1 == ToT(~FromT(0)))
+        {
+            return ToT(value << unusedBits) >> ToT(unusedBits);
+        }
+        else
+        {
+            constexpr FromT mask = (~FromT(0)) >> unusedBits;
+            value &= mask;
+            if (value & (FromT(1) << (N - 1)))
+            {
+                value |= ~mask;
+            }
+            return static_cast<ToT>(value);
+        }
+    }
+
+    namespace lookup
+    {
+        constexpr int nthSetBitIndexNaive(std::uint64_t value, int n)
+        {
+            for (int i = 0; i < n; ++i)
+            {
+                value &= value - 1;
+            }
+            return intrin::lsb_constexpr(value);
+        }
+
+        constexpr std::array<std::array<std::uint8_t, 8>, 256> nthSetBitIndex = []()
+        {
+            std::array<std::array<std::uint8_t, 8>, 256> t{};
+
+            for (int i = 0; i < 256; ++i)
+            {
+                for (int j = 0; j < 8; ++j)
+                {
+                    t[i][j] = nthSetBitIndexNaive(i, j);
+                }
+            }
+
+            return t;
+        }();
+    }
+
+    inline int nthSetBitIndex(std::uint64_t v, std::uint64_t n)
+    {
+        std::uint64_t shift = 0;
+
+        std::uint64_t p = intrin::popcount(v & 0xFFFFFFFFull);
+        std::uint64_t pmask = static_cast<std::uint64_t>(p > n) - 1ull;
+        v >>= 32 & pmask;
+        shift += 32 & pmask;
+        n -= p & pmask;
+
+        p = intrin::popcount(v & 0xFFFFull);
+        pmask = static_cast<std::uint64_t>(p > n) - 1ull;
+        v >>= 16 & pmask;
+        shift += 16 & pmask;
+        n -= p & pmask;
+
+        p = intrin::popcount(v & 0xFFull);
+        pmask = static_cast<std::uint64_t>(p > n) - 1ull;
+        shift += 8 & pmask;
+        v >>= 8 & pmask;
+        n -= p & pmask;
+
+        return static_cast<int>(lookup::nthSetBitIndex[v & 0xFFull][n] + shift);
+    }
+
+    namespace util
+    {
+        inline std::size_t usedBits(std::size_t value)
+        {
+            if (value == 0) return 0;
+            return intrin::msb(value) + 1;
+        }
+    }
+
+    template <typename EnumT>
+    struct EnumTraits;
+
+    template <typename EnumT>
+    [[nodiscard]] constexpr auto hasEnumTraits() -> decltype(EnumTraits<EnumT>::cardinaliy, bool{})
+    {
+        return true;
+    }
+
+    template <typename EnumT>
+    [[nodiscard]] constexpr bool hasEnumTraits(...)
+    {
+        return false;
+    }
+
+    template <typename EnumT>
+    [[nodiscard]] constexpr bool isNaturalIndex() noexcept
+    {
+        return EnumTraits<EnumT>::isNaturalIndex;
+    }
+
+    template <typename EnumT>
+    [[nodiscard]] constexpr int cardinality() noexcept
+    {
+        return EnumTraits<EnumT>::cardinality;
+    }
+
+    template <typename EnumT>
+    [[nodiscard]] constexpr const std::array<EnumT, cardinality<EnumT>()>& values() noexcept
+    {
+        return EnumTraits<EnumT>::values;
+    }
+
+    template <typename EnumT>
+    [[nodiscard]] constexpr EnumT fromOrdinal(int id) noexcept
+    {
+        assert(!EnumTraits<EnumT>::isNaturalIndex || (id >= 0 && id < EnumTraits<EnumT>::cardinality));
+
+        return EnumTraits<EnumT>::fromOrdinal(id);
+    }
+
+    template <typename EnumT>
+    [[nodiscard]] constexpr typename EnumTraits<EnumT>::IdType ordinal(EnumT v) noexcept
+    {
+        return EnumTraits<EnumT>::ordinal(v);
+    }
+
+    template <typename EnumT, typename... ArgsTs, typename SFINAE = std::enable_if_t<hasEnumTraits<EnumT>()>>
+    [[nodiscard]] constexpr decltype(auto) toString(EnumT v, ArgsTs&&... args)
+    {
+        return EnumTraits<EnumT>::toString(v, std::forward<ArgsTs>(args)...);
+    }
+
+    template <typename EnumT>
+    [[nodiscard]] constexpr decltype(auto) toString(EnumT v)
+    {
+        return EnumTraits<EnumT>::toString(v);
+    }
+
+    template <typename EnumT, typename FormatT, typename SFINAE = std::enable_if_t<!hasEnumTraits<FormatT>()>>
+    [[nodiscard]] constexpr decltype(auto) toString(FormatT&& f, EnumT v)
+    {
+        return EnumTraits<EnumT>::toString(std::forward<FormatT>(f), v);
+    }
+
+    template <typename EnumT>
+    [[nodiscard]] constexpr decltype(auto) toChar(EnumT v)
+    {
+        return EnumTraits<EnumT>::toChar(v);
+    }
+
+    template <typename EnumT, typename FormatT>
+    [[nodiscard]] constexpr decltype(auto) toChar(FormatT&& f, EnumT v)
+    {
+        return EnumTraits<EnumT>::toChar(std::forward<FormatT>(f), v);
+    }
+
+    template <typename EnumT, typename... ArgsTs>
+    [[nodiscard]] constexpr decltype(auto) fromString(ArgsTs&& ... args)
+    {
+        return EnumTraits<EnumT>::fromString(std::forward<ArgsTs>(args)...);
+    }
+
+    template <typename EnumT, typename... ArgsTs>
+    [[nodiscard]] constexpr decltype(auto) fromChar(ArgsTs&& ... args)
+    {
+        return EnumTraits<EnumT>::fromChar(std::forward<ArgsTs>(args)...);
+    }
+
+    template <>
+    struct EnumTraits<bool>
+    {
+        using IdType = int;
+        using EnumType = bool;
+
+        static constexpr int cardinality = 2;
+        static constexpr bool isNaturalIndex = true;
+
+        static constexpr std::array<EnumType, cardinality> values{
+            false,
+            true
+        };
+
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return static_cast<IdType>(c);
+        }
+
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            return static_cast<EnumType>(id);
+        }
+    };
+
+    template <typename EnumT, typename ValueT, std::size_t SizeV = cardinality<EnumT>()>
+    struct EnumArray
+    {
+        static_assert(isNaturalIndex<EnumT>(), "Enum must start with 0 and end with cardinality-1.");
+
+        using value_type      = ValueT;
+        using size_type       = std::size_t;
+        using difference_type = std::ptrdiff_t;
+        using pointer         = ValueT *;
+        using const_pointer   = const ValueT*;
+        using reference       = ValueT &;
+        using const_reference = const ValueT &;
+
+        using iterator       = pointer;
+        using const_iterator = const_pointer;
+
+        using reverse_iterator       = std::reverse_iterator<iterator>;
+        using const_reverse_iterator = std::reverse_iterator<const_iterator>;
+
+        using KeyType = EnumT;
+        using ValueType = ValueT;
+
+        constexpr void fill(const ValueType& init)
+        {
+            for (auto& v : elements)
+            {
+                v = init;
+            }
+        }
+
+        [[nodiscard]] constexpr ValueType& operator[](const KeyType& dir)
+        {
+            assert(ordinal(dir) < SizeV);
+
+            return elements[ordinal(dir)];
+        }
+
+        [[nodiscard]] constexpr const ValueType& operator[](const KeyType& dir) const
+        {
+            assert(ordinal(dir) < SizeV);
+
+            return elements[ordinal(dir)];
+        }
+
+        [[nodiscard]] constexpr ValueType& front()
+        {
+            return elements[0];
+        }
+
+        [[nodiscard]] constexpr const ValueType& front() const
+        {
+            return elements[0];
+        }
+
+        [[nodiscard]] constexpr ValueType& back()
+        {
+            return elements[SizeV - 1];
+        }
+
+        [[nodiscard]] constexpr const ValueType& back() const
+        {
+            return elements[SizeV - 1];
+        }
+
+        [[nodiscard]] constexpr pointer data()
+        {
+            return elements;
+        }
+
+        [[nodiscard]] constexpr const_pointer data() const
+        {
+            return elements;
+        }
+
+        [[nodiscard]] constexpr iterator begin() noexcept
+        {
+            return elements;
+        }
+
+        [[nodiscard]] constexpr const_iterator begin() const noexcept
+        {
+            return elements;
+        }
+
+        [[nodiscard]] constexpr iterator end() noexcept
+        {
+            return elements + SizeV;
+        }
+
+        [[nodiscard]] constexpr const_iterator end() const noexcept
+        {
+            return elements + SizeV;
+        }
+
+        [[nodiscard]] constexpr reverse_iterator rbegin() noexcept
+        {
+            return reverse_iterator(end());
+        }
+
+        [[nodiscard]] constexpr const_reverse_iterator rbegin() const noexcept
+        {
+            return const_reverse_iterator(end());
+        }
+
+        [[nodiscard]] constexpr reverse_iterator rend() noexcept
+        {
+            return reverse_iterator(begin());
+        }
+
+        [[nodiscard]] constexpr const_reverse_iterator rend() const noexcept
+        {
+            return const_reverse_iterator(begin());
+        }
+
+        [[nodiscard]] constexpr const_iterator cbegin() const noexcept
+        {
+            return begin();
+        }
+
+        [[nodiscard]] constexpr const_iterator cend() const noexcept
+        {
+            return end();
+        }
+
+        [[nodiscard]] constexpr const_reverse_iterator crbegin() const noexcept
+        {
+            return rbegin();
+        }
+
+        [[nodiscard]] constexpr const_reverse_iterator crend() const noexcept
+        {
+            return rend();
+        }
+
+        [[nodiscard]] constexpr size_type size() const noexcept
+        {
+            return SizeV;
+        }
+
+        ValueT elements[SizeV];
+    };
+
+    template <typename Enum1T, typename Enum2T, typename ValueT, std::size_t Size1V = cardinality<Enum1T>(), std::size_t Size2V = cardinality<Enum2T>()>
+    using EnumArray2 = EnumArray<Enum1T, EnumArray<Enum2T, ValueT, Size2V>, Size1V>;
+
+    enum struct Color : std::uint8_t
+    {
+        White,
+        Black
+    };
+
+    template <>
+    struct EnumTraits<Color>
+    {
+        using IdType = int;
+        using EnumType = Color;
+
+        static constexpr int cardinality = 2;
+        static constexpr bool isNaturalIndex = true;
+
+        static constexpr std::array<EnumType, cardinality> values{
+            Color::White,
+            Color::Black
+        };
+
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return static_cast<IdType>(c);
+        }
+
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            assert(id >= 0 && id < cardinality);
+
+            return static_cast<EnumType>(id);
+        }
+
+        [[nodiscard]] static constexpr std::string_view toString(EnumType c) noexcept
+        {
+            return std::string_view("wb" + ordinal(c), 1);
+        }
+
+        [[nodiscard]] static constexpr char toChar(EnumType c) noexcept
+        {
+            return "wb"[ordinal(c)];
+        }
+
+        [[nodiscard]] static constexpr std::optional<Color> fromChar(char c) noexcept
+        {
+            if (c == 'w') return Color::White;
+            if (c == 'b') return Color::Black;
+
+            return {};
+        }
+
+        [[nodiscard]] static constexpr std::optional<Color> fromString(std::string_view sv) noexcept
+        {
+            if (sv.size() != 1) return {};
+
+            return fromChar(sv[0]);
+        }
+    };
+
+    constexpr Color operator!(Color c)
+    {
+        return fromOrdinal<Color>(ordinal(c) ^ 1);
+    }
+
+    enum struct PieceType : std::uint8_t
+    {
+        Pawn,
+        Knight,
+        Bishop,
+        Rook,
+        Queen,
+        King,
+
+        None
+    };
+
+    template <>
+    struct EnumTraits<PieceType>
+    {
+        using IdType = int;
+        using EnumType = PieceType;
+
+        static constexpr int cardinality = 7;
+        static constexpr bool isNaturalIndex = true;
+
+        static constexpr std::array<EnumType, cardinality> values{
+            PieceType::Pawn,
+            PieceType::Knight,
+            PieceType::Bishop,
+            PieceType::Rook,
+            PieceType::Queen,
+            PieceType::King,
+            PieceType::None
+        };
+
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return static_cast<IdType>(c);
+        }
+
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            assert(id >= 0 && id < cardinality);
+
+            return static_cast<EnumType>(id);
+        }
+
+        [[nodiscard]] static constexpr std::string_view toString(EnumType p, Color c) noexcept
+        {
+            return std::string_view("PpNnBbRrQqKk " + (chess::ordinal(p) * 2 + chess::ordinal(c)), 1);
+        }
+
+        [[nodiscard]] static constexpr char toChar(EnumType p, Color c) noexcept
+        {
+            return "PpNnBbRrQqKk "[chess::ordinal(p) * 2 + chess::ordinal(c)];
+        }
+
+        [[nodiscard]] static constexpr std::optional<PieceType> fromChar(char c) noexcept
+        {
+            auto it = std::string_view("PpNnBbRrQqKk ").find(c);
+            if (it == std::string::npos) return {};
+            else return static_cast<PieceType>(it/2);
+        }
+
+        [[nodiscard]] static constexpr std::optional<PieceType> fromString(std::string_view sv) noexcept
+        {
+            if (sv.size() != 1) return {};
+
+            return fromChar(sv[0]);
+        }
+    };
+
+    struct Piece
+    {
+        [[nodiscard]] static constexpr Piece fromId(int id)
+        {
+            return Piece(id);
+        }
+
+        [[nodiscard]] static constexpr Piece none()
+        {
+            return Piece(PieceType::None, Color::White);
+        }
+
+        constexpr Piece() noexcept :
+            Piece(PieceType::None, Color::White)
+        {
+
+        }
+
+        constexpr Piece(PieceType type, Color color) noexcept :
+            m_id((ordinal(type) << 1) | ordinal(color))
+        {
+            assert(type != PieceType::None || color == Color::White);
+        }
+
+        constexpr Piece& operator=(const Piece& other) = default;
+
+        [[nodiscard]] constexpr friend bool operator==(Piece lhs, Piece rhs) noexcept
+        {
+            return lhs.m_id == rhs.m_id;
+        }
+
+        [[nodiscard]] constexpr friend bool operator!=(Piece lhs, Piece rhs) noexcept
+        {
+            return !(lhs == rhs);
+        }
+
+        [[nodiscard]] constexpr PieceType type() const
+        {
+            return fromOrdinal<PieceType>(m_id >> 1);
+        }
+
+        [[nodiscard]] constexpr Color color() const
+        {
+            return fromOrdinal<Color>(m_id & 1);
+        }
+
+        [[nodiscard]] constexpr std::pair<PieceType, Color> parts() const
+        {
+            return std::make_pair(type(), color());
+        }
+
+        [[nodiscard]] constexpr explicit operator int() const
+        {
+            return static_cast<int>(m_id);
+        }
+
+    private:
+        constexpr Piece(int id) :
+            m_id(id)
+        {
+        }
+
+        std::uint8_t m_id; // lowest bit is a color, 7 highest bits are a piece type
+    };
+
+    [[nodiscard]] constexpr Piece operator|(PieceType type, Color color) noexcept
+    {
+        return Piece(type, color);
+    }
+
+    [[nodiscard]] constexpr Piece operator|(Color color, PieceType type) noexcept
+    {
+        return Piece(type, color);
+    }
+
+    constexpr Piece whitePawn = Piece(PieceType::Pawn, Color::White);
+    constexpr Piece whiteKnight = Piece(PieceType::Knight, Color::White);
+    constexpr Piece whiteBishop = Piece(PieceType::Bishop, Color::White);
+    constexpr Piece whiteRook = Piece(PieceType::Rook, Color::White);
+    constexpr Piece whiteQueen = Piece(PieceType::Queen, Color::White);
+    constexpr Piece whiteKing = Piece(PieceType::King, Color::White);
+
+    constexpr Piece blackPawn = Piece(PieceType::Pawn, Color::Black);
+    constexpr Piece blackKnight = Piece(PieceType::Knight, Color::Black);
+    constexpr Piece blackBishop = Piece(PieceType::Bishop, Color::Black);
+    constexpr Piece blackRook = Piece(PieceType::Rook, Color::Black);
+    constexpr Piece blackQueen = Piece(PieceType::Queen, Color::Black);
+    constexpr Piece blackKing = Piece(PieceType::King, Color::Black);
+
+    static_assert(Piece::none().type() == PieceType::None);
+
+    template <>
+    struct EnumTraits<Piece>
+    {
+        using IdType = int;
+        using EnumType = Piece;
+
+        static constexpr int cardinality = 13;
+        static constexpr bool isNaturalIndex = true;
+
+        static constexpr std::array<EnumType, cardinality> values{
+            whitePawn,
+            blackPawn,
+            whiteKnight,
+            blackKnight,
+            whiteBishop,
+            blackBishop,
+            whiteRook,
+            blackRook,
+            whiteQueen,
+            blackQueen,
+            whiteKing,
+            blackKing,
+            Piece::none()
+        };
+
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return static_cast<IdType>(c);
+        }
+
+        [[nodiscard]] static constexpr EnumType fromOrdinal(int id) noexcept
+        {
+            assert(id >= 0 && id < cardinality);
+
+            return Piece::fromId(id);
+        }
+
+        [[nodiscard]] static constexpr std::string_view toString(EnumType p) noexcept
+        {
+            return std::string_view("PpNnBbRrQqKk " + ordinal(p), 1);
+        }
+
+        [[nodiscard]] static constexpr char toChar(EnumType p) noexcept
+        {
+            return "PpNnBbRrQqKk "[ordinal(p)];
+        }
+
+        [[nodiscard]] static constexpr std::optional<Piece> fromChar(char c) noexcept
+        {
+            auto it = std::string_view("PpNnBbRrQqKk ").find(c);
+            if (it == std::string::npos) return {};
+            else return Piece::fromId(static_cast<int>(it));
+        }
+
+        [[nodiscard]] static constexpr std::optional<Piece> fromString(std::string_view sv) noexcept
+        {
+            if (sv.size() != 1) return {};
+
+            return fromChar(sv[0]);
+        }
+    };
+
+    template <typename TagT>
+    struct Coord
+    {
+        constexpr Coord() noexcept :
+            m_i(0)
+        {
+        }
+
+        constexpr explicit Coord(int i) noexcept :
+            m_i(i)
+        {
+        }
+
+        [[nodiscard]] constexpr explicit operator int() const
+        {
+            return static_cast<int>(m_i);
+        }
+
+        constexpr friend Coord& operator++(Coord& c)
+        {
+            ++c.m_i;
+            return c;
+        }
+
+        constexpr friend Coord& operator--(Coord& c)
+        {
+            --c.m_i;
+            return c;
+        }
+
+        constexpr friend Coord& operator+=(Coord& c, int d)
+        {
+            c.m_i += d;
+            return c;
+        }
+
+        constexpr friend Coord& operator-=(Coord& c, int d)
+        {
+            c.m_i -= d;
+            return c;
+        }
+
+        constexpr friend Coord operator+(const Coord& c, int d)
+        {
+            Coord cpy(c);
+            cpy += d;
+            return cpy;
+        }
+
+        constexpr friend Coord operator-(const Coord& c, int d)
+        {
+            Coord cpy(c);
+            cpy -= d;
+            return cpy;
+        }
+
+        constexpr friend int operator-(const Coord& c1, const Coord& c2)
+        {
+            return c1.m_i - c2.m_i;
+        }
+
+        [[nodiscard]] constexpr friend bool operator==(const Coord& c1, const Coord& c2) noexcept
+        {
+            return c1.m_i == c2.m_i;
+        }
+
+        [[nodiscard]] constexpr friend bool operator!=(const Coord& c1, const Coord& c2) noexcept
+        {
+            return c1.m_i != c2.m_i;
+        }
+
+        [[nodiscard]] constexpr friend bool operator<(const Coord& c1, const Coord& c2) noexcept
+        {
+            return c1.m_i < c2.m_i;
+        }
+
+        [[nodiscard]] constexpr friend bool operator<=(const Coord& c1, const Coord& c2) noexcept
+        {
+            return c1.m_i <= c2.m_i;
+        }
+
+        [[nodiscard]] constexpr friend bool operator>(const Coord& c1, const Coord& c2) noexcept
+        {
+            return c1.m_i > c2.m_i;
+        }
+
+        [[nodiscard]] constexpr friend bool operator>=(const Coord& c1, const Coord& c2) noexcept
+        {
+            return c1.m_i >= c2.m_i;
+        }
+
+    private:
+        std::int8_t m_i;
+    };
+
+    struct FileTag;
+    struct RankTag;
+    using File = Coord<FileTag>;
+    using Rank = Coord<RankTag>;
+
+    constexpr File fileA = File(0);
+    constexpr File fileB = File(1);
+    constexpr File fileC = File(2);
+    constexpr File fileD = File(3);
+    constexpr File fileE = File(4);
+    constexpr File fileF = File(5);
+    constexpr File fileG = File(6);
+    constexpr File fileH = File(7);
+
+    constexpr Rank rank1 = Rank(0);
+    constexpr Rank rank2 = Rank(1);
+    constexpr Rank rank3 = Rank(2);
+    constexpr Rank rank4 = Rank(3);
+    constexpr Rank rank5 = Rank(4);
+    constexpr Rank rank6 = Rank(5);
+    constexpr Rank rank7 = Rank(6);
+    constexpr Rank rank8 = Rank(7);
+
+    template <>
+    struct EnumTraits<File>
+    {
+        using IdType = int;
+        using EnumType = File;
+
+        static constexpr int cardinality = 8;
+        static constexpr bool isNaturalIndex = true;
+
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return static_cast<IdType>(c);
+        }
+
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            assert(id >= 0 && id < cardinality);
+
+            return static_cast<EnumType>(id);
+        }
+
+        [[nodiscard]] static constexpr std::string_view toString(EnumType c) noexcept
+        {
+            assert(ordinal(c) >= 0 && ordinal(c) < 8);
+
+            return std::string_view("abcdefgh" + ordinal(c), 1);
+        }
+
+        [[nodiscard]] static constexpr std::optional<File> fromChar(char c) noexcept
+        {
+            if (c < 'a' || c > 'h') return {};
+            return static_cast<File>(c - 'a');
+        }
+
+        [[nodiscard]] static constexpr std::optional<File> fromString(std::string_view sv) noexcept
+        {
+            if (sv.size() != 1) return {};
+
+            return fromChar(sv[0]);
+        }
+    };
+
+    template <>
+    struct EnumTraits<Rank>
+    {
+        using IdType = int;
+        using EnumType = Rank;
+
+        static constexpr int cardinality = 8;
+        static constexpr bool isNaturalIndex = true;
+
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return static_cast<IdType>(c);
+        }
+
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            assert(id >= 0 && id < cardinality);
+
+            return static_cast<EnumType>(id);
+        }
+
+        [[nodiscard]] static constexpr std::string_view toString(EnumType c) noexcept
+        {
+            assert(ordinal(c) >= 0 && ordinal(c) < 8);
+
+            return std::string_view("12345678" + ordinal(c), 1);
+        }
+
+        [[nodiscard]] static constexpr std::optional<Rank> fromChar(char c) noexcept
+        {
+            if (c < '1' || c > '8') return {};
+            return static_cast<Rank>(c - '1');
+        }
+
+        [[nodiscard]] static constexpr std::optional<Rank> fromString(std::string_view sv) noexcept
+        {
+            if (sv.size() != 1) return {};
+
+            return fromChar(sv[0]);
+        }
+    };
+
+    // files east
+    // ranks north
+    struct FlatSquareOffset
+    {
+        std::int8_t value;
+
+        constexpr FlatSquareOffset() noexcept :
+            value(0)
+        {
+        }
+
+        constexpr FlatSquareOffset(int files, int ranks) noexcept :
+            value(files + ranks * cardinality<File>())
+        {
+            assert(files + ranks * cardinality<File>() >= std::numeric_limits<std::int8_t>::min());
+            assert(files + ranks * cardinality<File>() <= std::numeric_limits<std::int8_t>::max());
+        }
+
+        constexpr FlatSquareOffset operator-() const noexcept
+        {
+            return FlatSquareOffset(-value);
+        }
+
+    private:
+        constexpr FlatSquareOffset(int v) noexcept :
+            value(v)
+        {
+        }
+    };
+
+    struct Offset
+    {
+        std::int8_t files;
+        std::int8_t ranks;
+
+        constexpr Offset() :
+            files(0),
+            ranks(0)
+        {
+        }
+
+        constexpr Offset(int files, int ranks) :
+            files(files),
+            ranks(ranks)
+        {
+        }
+
+        [[nodiscard]] constexpr FlatSquareOffset flat() const
+        {
+            return { files, ranks };
+        }
+
+        [[nodiscard]] constexpr Offset operator-() const
+        {
+            return { -files, -ranks };
+        }
+    };
+
+    struct SquareCoords
+    {
+        File file;
+        Rank rank;
+
+        constexpr SquareCoords() noexcept :
+            file{},
+            rank{}
+        {
+        }
+
+        constexpr SquareCoords(File f, Rank r) noexcept :
+            file(f),
+            rank(r)
+        {
+        }
+
+        constexpr friend SquareCoords& operator+=(SquareCoords& c, Offset offset)
+        {
+            c.file += offset.files;
+            c.rank += offset.ranks;
+            return c;
+        }
+
+        [[nodiscard]] constexpr friend SquareCoords operator+(const SquareCoords& c, Offset offset)
+        {
+            SquareCoords cpy(c);
+            cpy.file += offset.files;
+            cpy.rank += offset.ranks;
+            return cpy;
+        }
+
+        [[nodiscard]] constexpr bool isOk() const
+        {
+            return file >= fileA && file <= fileH && rank >= rank1 && rank <= rank8;
+        }
+    };
+
+    struct Square
+    {
+    private:
+        static constexpr std::int8_t m_noneId = cardinality<Rank>() * cardinality<File>();
+
+        static constexpr std::uint8_t fileMask = 0b111;
+        static constexpr std::uint8_t rankMask = 0b111000;
+        static constexpr std::uint8_t rankShift = 3;
+
+    public:
+        [[nodiscard]] static constexpr Square none()
+        {
+            return Square(m_noneId);
+        }
+
+        constexpr Square() noexcept :
+            m_id(0)
+        {
+        }
+
+        constexpr explicit Square(int idx) noexcept :
+            m_id(idx)
+        {
+            assert(isOk() || m_id == m_noneId);
+        }
+
+        constexpr Square(File file, Rank rank) noexcept :
+            m_id(ordinal(file) + ordinal(rank) * cardinality<File>())
+        {
+            assert(isOk());
+        }
+
+        constexpr explicit Square(SquareCoords coords) noexcept :
+            Square(coords.file, coords.rank)
+        {
+        }
+
+        [[nodiscard]] constexpr friend bool operator<(Square lhs, Square rhs) noexcept
+        {
+            return lhs.m_id < rhs.m_id;
+        }
+
+        [[nodiscard]] constexpr friend bool operator>(Square lhs, Square rhs) noexcept
+        {
+            return lhs.m_id > rhs.m_id;
+        }
+
+        [[nodiscard]] constexpr friend bool operator<=(Square lhs, Square rhs) noexcept
+        {
+            return lhs.m_id <= rhs.m_id;
+        }
+
+        [[nodiscard]] constexpr friend bool operator>=(Square lhs, Square rhs) noexcept
+        {
+            return lhs.m_id >= rhs.m_id;
+        }
+
+        [[nodiscard]] constexpr friend bool operator==(Square lhs, Square rhs) noexcept
+        {
+            return lhs.m_id == rhs.m_id;
+        }
+
+        [[nodiscard]] constexpr friend bool operator!=(Square lhs, Square rhs) noexcept
+        {
+            return !(lhs == rhs);
+        }
+
+        constexpr friend Square& operator++(Square& sq)
+        {
+            ++sq.m_id;
+            return sq;
+        }
+
+        constexpr friend Square& operator--(Square& sq)
+        {
+            --sq.m_id;
+            return sq;
+        }
+
+        [[nodiscard]] constexpr friend Square operator+(Square sq, FlatSquareOffset offset)
+        {
+            Square sqCpy = sq;
+            sqCpy += offset;
+            return sqCpy;
+        }
+
+        constexpr friend Square& operator+=(Square& sq, FlatSquareOffset offset)
+        {
+            assert(sq.m_id + offset.value >= 0 && sq.m_id + offset.value < Square::m_noneId);
+            sq.m_id += offset.value;
+            return sq;
+        }
+
+        [[nodiscard]] constexpr friend Square operator+(Square sq, Offset offset)
+        {
+            assert(sq.file() + offset.files >= fileA);
+            assert(sq.file() + offset.files <= fileH);
+            assert(sq.rank() + offset.ranks >= rank1);
+            assert(sq.rank() + offset.ranks <= rank8);
+            return operator+(sq, offset.flat());
+        }
+
+        constexpr friend Square& operator+=(Square& sq, Offset offset)
+        {
+            return operator+=(sq, offset.flat());
+        }
+
+        [[nodiscard]] constexpr explicit operator int() const
+        {
+            return m_id;
+        }
+
+        [[nodiscard]] constexpr File file() const
+        {
+            assert(isOk());
+            return File(static_cast<unsigned>(m_id) & fileMask);
+        }
+
+        [[nodiscard]] constexpr Rank rank() const
+        {
+            assert(isOk());
+            return Rank(static_cast<unsigned>(m_id) >> rankShift);
+        }
+
+        [[nodiscard]] constexpr SquareCoords coords() const
+        {
+            return { file(), rank() };
+        }
+
+        [[nodiscard]] constexpr Color color() const
+        {
+            assert(isOk());
+            return !fromOrdinal<Color>(ordinal(rank()) + ordinal(file()) & 1);
+        }
+
+        constexpr void flipVertically()
+        {
+            m_id ^= rankMask;
+        }
+
+        constexpr void flipHorizontally()
+        {
+            m_id ^= fileMask;
+        }
+
+        constexpr Square flippedVertically() const
+        {
+            return Square(m_id ^ rankMask);
+        }
+
+        constexpr Square flippedHorizontally() const
+        {
+            return Square(m_id ^ fileMask);
+        }
+
+        [[nodiscard]] constexpr bool isOk() const
+        {
+            return m_id >= 0 && m_id < m_noneId;
+        }
+
+    private:
+        std::int8_t m_id;
+    };
+
+    constexpr Square a1(fileA, rank1);
+    constexpr Square a2(fileA, rank2);
+    constexpr Square a3(fileA, rank3);
+    constexpr Square a4(fileA, rank4);
+    constexpr Square a5(fileA, rank5);
+    constexpr Square a6(fileA, rank6);
+    constexpr Square a7(fileA, rank7);
+    constexpr Square a8(fileA, rank8);
+
+    constexpr Square b1(fileB, rank1);
+    constexpr Square b2(fileB, rank2);
+    constexpr Square b3(fileB, rank3);
+    constexpr Square b4(fileB, rank4);
+    constexpr Square b5(fileB, rank5);
+    constexpr Square b6(fileB, rank6);
+    constexpr Square b7(fileB, rank7);
+    constexpr Square b8(fileB, rank8);
+
+    constexpr Square c1(fileC, rank1);
+    constexpr Square c2(fileC, rank2);
+    constexpr Square c3(fileC, rank3);
+    constexpr Square c4(fileC, rank4);
+    constexpr Square c5(fileC, rank5);
+    constexpr Square c6(fileC, rank6);
+    constexpr Square c7(fileC, rank7);
+    constexpr Square c8(fileC, rank8);
+
+    constexpr Square d1(fileD, rank1);
+    constexpr Square d2(fileD, rank2);
+    constexpr Square d3(fileD, rank3);
+    constexpr Square d4(fileD, rank4);
+    constexpr Square d5(fileD, rank5);
+    constexpr Square d6(fileD, rank6);
+    constexpr Square d7(fileD, rank7);
+    constexpr Square d8(fileD, rank8);
+
+    constexpr Square e1(fileE, rank1);
+    constexpr Square e2(fileE, rank2);
+    constexpr Square e3(fileE, rank3);
+    constexpr Square e4(fileE, rank4);
+    constexpr Square e5(fileE, rank5);
+    constexpr Square e6(fileE, rank6);
+    constexpr Square e7(fileE, rank7);
+    constexpr Square e8(fileE, rank8);
+
+    constexpr Square f1(fileF, rank1);
+    constexpr Square f2(fileF, rank2);
+    constexpr Square f3(fileF, rank3);
+    constexpr Square f4(fileF, rank4);
+    constexpr Square f5(fileF, rank5);
+    constexpr Square f6(fileF, rank6);
+    constexpr Square f7(fileF, rank7);
+    constexpr Square f8(fileF, rank8);
+
+    constexpr Square g1(fileG, rank1);
+    constexpr Square g2(fileG, rank2);
+    constexpr Square g3(fileG, rank3);
+    constexpr Square g4(fileG, rank4);
+    constexpr Square g5(fileG, rank5);
+    constexpr Square g6(fileG, rank6);
+    constexpr Square g7(fileG, rank7);
+    constexpr Square g8(fileG, rank8);
+
+    constexpr Square h1(fileH, rank1);
+    constexpr Square h2(fileH, rank2);
+    constexpr Square h3(fileH, rank3);
+    constexpr Square h4(fileH, rank4);
+    constexpr Square h5(fileH, rank5);
+    constexpr Square h6(fileH, rank6);
+    constexpr Square h7(fileH, rank7);
+    constexpr Square h8(fileH, rank8);
+
+    static_assert(e1.color() == Color::Black);
+    static_assert(e8.color() == Color::White);
+
+    static_assert(e1.file() == fileE);
+    static_assert(e1.rank() == rank1);
+
+    static_assert(e1.flippedHorizontally() == d1);
+    static_assert(e1.flippedVertically() == e8);
+
+    template <>
+    struct EnumTraits<Square>
+    {
+        using IdType = int;
+        using EnumType = Square;
+
+        static constexpr int cardinality = chess::cardinality<Rank>() * chess::cardinality<File>();
+        static constexpr bool isNaturalIndex = true;
+
+        static constexpr std::array<EnumType, cardinality> values{
+            a1, b1, c1, d1, e1, f1, g1, h1,
+            a2, b2, c2, d2, e2, f2, g2, h2,
+            a3, b3, c3, d3, e3, f3, g3, h3,
+            a4, b4, c4, d4, e4, f4, g4, h4,
+            a5, b5, c5, d5, e5, f5, g5, h5,
+            a6, b6, c6, d6, e6, f6, g6, h6,
+            a7, b7, c7, d7, e7, f7, g7, h7,
+            a8, b8, c8, d8, e8, f8, g8, h8
+        };
+
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return static_cast<IdType>(c);
+        }
+
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            assert(id >= 0 && id < cardinality + 1);
+
+            return static_cast<EnumType>(id);
+        }
+
+        [[nodiscard]] static constexpr std::string_view toString(Square sq)
+        {
+            assert(sq.isOk());
+
+            return
+                std::string_view(
+                    "a1b1c1d1e1f1g1h1"
+                    "a2b2c2d2e2f2g2h2"
+                    "a3b3c3d3e3f3g3h3"
+                    "a4b4c4d4e4f4g4h4"
+                    "a5b5c5d5e5f5g5h5"
+                    "a6b6c6d6e6f6g6h6"
+                    "a7b7c7d7e7f7g7h7"
+                    "a8b8c8d8e8f8g8h8"
+                    + (ordinal(sq) * 2),
+                    2
+                );
+        }
+
+        [[nodiscard]] static constexpr std::optional<Square> fromString(std::string_view sv) noexcept
+        {
+            if (sv.size() != 2) return {};
+
+            const char f = sv[0];
+            const char r = sv[1];
+            if (f < 'a' || f > 'h') return {};
+            if (r < '1' || r > '8') return {};
+
+            return Square(static_cast<File>(f - 'a'), static_cast<Rank>(r - '1'));
+        }
+    };
+
+    static_assert(toString(d1) == std::string_view("d1"));
+    static_assert(values<Square>()[29] == f4);
+
+    enum struct MoveType : std::uint8_t
+    {
+        Normal,
+        Promotion,
+        Castle,
+        EnPassant
+    };
+
+    template <>
+    struct EnumTraits<MoveType>
+    {
+        using IdType = int;
+        using EnumType = MoveType;
+
+        static constexpr int cardinality = 4;
+        static constexpr bool isNaturalIndex = true;
+
+        static constexpr std::array<EnumType, cardinality> values{
+            MoveType::Normal,
+            MoveType::Promotion,
+            MoveType::Castle,
+            MoveType::EnPassant
+        };
+
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return static_cast<IdType>(c);
+        }
+
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            assert(id >= 0 && id < cardinality);
+
+            return static_cast<EnumType>(id);
+        }
+    };
+
+    enum struct CastleType : std::uint8_t
+    {
+        Short,
+        Long
+    };
+
+    [[nodiscard]] constexpr CastleType operator!(CastleType ct)
+    {
+        return static_cast<CastleType>(static_cast<std::uint8_t>(ct) ^ 1);
+    }
+
+    template <>
+    struct EnumTraits<CastleType>
+    {
+        using IdType = int;
+        using EnumType = CastleType;
+
+        static constexpr int cardinality = 2;
+        static constexpr bool isNaturalIndex = true;
+
+        static constexpr std::array<EnumType, cardinality> values{
+            CastleType::Short,
+            CastleType::Long
+        };
+
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return static_cast<IdType>(c);
+        }
+
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            assert(id >= 0 && id < cardinality);
+
+            return static_cast<EnumType>(id);
+        }
+    };
+
+    struct CompressedMove;
+
+    // castling is encoded as a king capturing rook
+    // ep is encoded as a normal pawn capture (move.to is empty on the board)
+    struct Move
+    {
+        Square from;
+        Square to;
+        MoveType type = MoveType::Normal;
+        Piece promotedPiece = Piece::none();
+
+        [[nodiscard]] constexpr friend bool operator==(const Move& lhs, const Move& rhs) noexcept
+        {
+            return lhs.from == rhs.from
+                && lhs.to == rhs.to
+                && lhs.type == rhs.type
+                && lhs.promotedPiece == rhs.promotedPiece;
+        }
+
+        [[nodiscard]] constexpr friend bool operator!=(const Move& lhs, const Move& rhs) noexcept
+        {
+            return !(lhs == rhs);
+        }
+
+        [[nodiscard]] constexpr CompressedMove compress() const noexcept;
+
+        [[nodiscard]] constexpr static Move null()
+        {
+            return Move{ Square::none(), Square::none() };
+        }
+
+        [[nodiscard]] constexpr static Move castle(CastleType ct, Color c);
+
+        [[nodiscard]] constexpr static Move normal(Square from, Square to)
+        {
+            return Move{ from, to, MoveType::Normal, Piece::none() };
+        }
+
+        [[nodiscard]] constexpr static Move enPassant(Square from, Square to)
+        {
+            return Move{ from, to, MoveType::EnPassant, Piece::none() };
+        }
+
+        [[nodiscard]] constexpr static Move promotion(Square from, Square to, Piece piece)
+        {
+            return Move{ from, to, MoveType::Promotion, piece };
+        }
+    };
+
+    namespace detail::castle
+    {
+        constexpr EnumArray2<CastleType, Color, Move> moves = { {
+            {{ { e1, h1, MoveType::Castle }, { e8, h8, MoveType::Castle } }},
+            {{ { e1, a1, MoveType::Castle }, { e8, a8, MoveType::Castle } }}
+        } };
+    }
+
+    [[nodiscard]] constexpr Move Move::castle(CastleType ct, Color c)
+    {
+        return detail::castle::moves[ct][c];
+    }
+
+    static_assert(sizeof(Move) == 4);
+
+    struct CompressedMove
+    {
+    private:
+        // from most significant bits
+        // 2 bits for move type
+        // 6 bits for from square
+        // 6 bits for to square
+        // 2 bits for promoted piece type
+        //    0 if not a promotion
+        static constexpr std::uint16_t squareMask = 0b111111u;
+        static constexpr std::uint16_t promotedPieceTypeMask = 0b11u;
+        static constexpr std::uint16_t moveTypeMask = 0b11u;
+
+    public:
+        [[nodiscard]] constexpr static CompressedMove readFromBigEndian(const unsigned char* data)
+        {
+            CompressedMove move{};
+            move.m_packed = (data[0] << 8) | data[1];
+            return move;
+        }
+
+        constexpr CompressedMove() noexcept :
+            m_packed(0)
+        {
+        }
+
+        // move must be either valid or a null move
+        constexpr CompressedMove(Move move) noexcept :
+            m_packed(0)
+        {
+            // else null move
+            if (move.from != move.to)
+            {
+                assert(move.from != Square::none());
+                assert(move.to != Square::none());
+
+                m_packed =
+                    (static_cast<std::uint16_t>(ordinal(move.type)) << (16 - 2))
+                    | (static_cast<std::uint16_t>(ordinal(move.from)) << (16 - 2 - 6))
+                    | (static_cast<std::uint16_t>(ordinal(move.to)) << (16 - 2 - 6 - 6));
+
+                if (move.type == MoveType::Promotion)
+                {
+                    assert(move.promotedPiece != Piece::none());
+
+                    m_packed |= ordinal(move.promotedPiece.type()) - ordinal(PieceType::Knight);
+                }
+                else
+                {
+                    assert(move.promotedPiece == Piece::none());
+                }
+            }
+        }
+
+        void writeToBigEndian(unsigned char* data) const
+        {
+            *data++ = m_packed >> 8;
+            *data++ = m_packed & 0xFF;
+        }
+
+        [[nodiscard]] constexpr std::uint16_t packed() const
+        {
+            return m_packed;
+        }
+
+        [[nodiscard]] constexpr MoveType type() const
+        {
+            return fromOrdinal<MoveType>(m_packed >> (16 - 2));
+        }
+
+        [[nodiscard]] constexpr Square from() const
+        {
+            return fromOrdinal<Square>((m_packed >> (16 - 2 - 6)) & squareMask);
+        }
+
+        [[nodiscard]] constexpr Square to() const
+        {
+            return fromOrdinal<Square>((m_packed >> (16 - 2 - 6 - 6)) & squareMask);
+        }
+
+        [[nodiscard]] constexpr Piece promotedPiece() const
+        {
+            if (type() == MoveType::Promotion)
+            {
+                const Color color =
+                    (to().rank() == rank1)
+                    ? Color::Black
+                    : Color::White;
+
+                const PieceType pt = fromOrdinal<PieceType>((m_packed & promotedPieceTypeMask) + ordinal(PieceType::Knight));
+                return color | pt;
+            }
+            else
+            {
+                return Piece::none();
+            }
+        }
+
+        [[nodiscard]] constexpr Move decompress() const noexcept
+        {
+            if (m_packed == 0)
+            {
+                return Move::null();
+            }
+            else
+            {
+                const MoveType type = fromOrdinal<MoveType>(m_packed >> (16 - 2));
+                const Square from = fromOrdinal<Square>((m_packed >> (16 - 2 - 6)) & squareMask);
+                const Square to = fromOrdinal<Square>((m_packed >> (16 - 2 - 6 - 6)) & squareMask);
+                const Piece promotedPiece = [&]() {
+                    if (type == MoveType::Promotion)
+                    {
+                        const Color color =
+                            (to.rank() == rank1)
+                            ? Color::Black
+                            : Color::White;
+
+                        const PieceType pt = fromOrdinal<PieceType>((m_packed & promotedPieceTypeMask) + ordinal(PieceType::Knight));
+                        return color | pt;
+                    }
+                    else
+                    {
+                        return Piece::none();
+                    }
+                }();
+
+                return Move{ from, to, type, promotedPiece };
+            }
+        }
+
+    private:
+        std::uint16_t m_packed;
+    };
+
+    static_assert(sizeof(CompressedMove) == 2);
+
+    [[nodiscard]] constexpr CompressedMove Move::compress() const noexcept
+    {
+        return CompressedMove(*this);
+    }
+
+    static_assert(a4 + Offset{ 0, 1 } == a5);
+    static_assert(a4 + Offset{ 0, 2 } == a6);
+    static_assert(a4 + Offset{ 0, -2 } == a2);
+    static_assert(a4 + Offset{ 0, -1 } == a3);
+
+    static_assert(e4 + Offset{ 1, 0 } == f4);
+    static_assert(e4 + Offset{ 2, 0 } == g4);
+    static_assert(e4 + Offset{ -1, 0 } == d4);
+    static_assert(e4 + Offset{ -2, 0 } == c4);
+
+    enum struct CastlingRights : std::uint8_t
+    {
+        None = 0x0,
+        WhiteKingSide = 0x1,
+        WhiteQueenSide = 0x2,
+        BlackKingSide = 0x4,
+        BlackQueenSide = 0x8,
+        White = WhiteKingSide | WhiteQueenSide,
+        Black = BlackKingSide | BlackQueenSide,
+        All = WhiteKingSide | WhiteQueenSide | BlackKingSide | BlackQueenSide
+    };
+
+    [[nodiscard]] constexpr CastlingRights operator|(CastlingRights lhs, CastlingRights rhs)
+    {
+        return static_cast<CastlingRights>(static_cast<std::uint8_t>(lhs) | static_cast<std::uint8_t>(rhs));
+    }
+
+    [[nodiscard]] constexpr CastlingRights operator&(CastlingRights lhs, CastlingRights rhs)
+    {
+        return static_cast<CastlingRights>(static_cast<std::uint8_t>(lhs) & static_cast<std::uint8_t>(rhs));
+    }
+
+    [[nodiscard]] constexpr CastlingRights operator~(CastlingRights lhs)
+    {
+        return static_cast<CastlingRights>(~static_cast<std::uint8_t>(lhs) & static_cast<std::uint8_t>(CastlingRights::All));
+    }
+
+    constexpr CastlingRights& operator|=(CastlingRights& lhs, CastlingRights rhs)
+    {
+        lhs = static_cast<CastlingRights>(static_cast<std::uint8_t>(lhs) | static_cast<std::uint8_t>(rhs));
+        return lhs;
+    }
+
+    constexpr CastlingRights& operator&=(CastlingRights& lhs, CastlingRights rhs)
+    {
+        lhs = static_cast<CastlingRights>(static_cast<std::uint8_t>(lhs) & static_cast<std::uint8_t>(rhs));
+        return lhs;
+    }
+    // checks whether lhs contains rhs
+    [[nodiscard]] constexpr bool contains(CastlingRights lhs, CastlingRights rhs)
+    {
+        return (lhs & rhs) == rhs;
+    }
+
+    template <>
+    struct EnumTraits<CastlingRights>
+    {
+        using IdType = int;
+        using EnumType = CastlingRights;
+
+        static constexpr int cardinality = 4;
+        static constexpr bool isNaturalIndex = false;
+
+        static constexpr std::array<EnumType, cardinality> values{
+            CastlingRights::WhiteKingSide,
+            CastlingRights::WhiteQueenSide,
+            CastlingRights::BlackKingSide,
+            CastlingRights::BlackQueenSide
+        };
+
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return static_cast<IdType>(c);
+        }
+
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            return static_cast<EnumType>(id);
+        }
+    };
+
+    struct CompressedReverseMove;
+
+    struct ReverseMove
+    {
+        Move move;
+        Piece capturedPiece;
+        Square oldEpSquare;
+        CastlingRights oldCastlingRights;
+
+        // We need a well defined case for the starting position.
+        constexpr ReverseMove() :
+            move(Move::null()),
+            capturedPiece(Piece::none()),
+            oldEpSquare(Square::none()),
+            oldCastlingRights(CastlingRights::All)
+        {
+        }
+
+        constexpr ReverseMove(const Move& move, Piece capturedPiece, Square oldEpSquare, CastlingRights oldCastlingRights) :
+            move(move),
+            capturedPiece(capturedPiece),
+            oldEpSquare(oldEpSquare),
+            oldCastlingRights(oldCastlingRights)
+        {
+        }
+
+        constexpr bool isNull() const
+        {
+            return move.from == move.to;
+        }
+
+        [[nodiscard]] constexpr CompressedReverseMove compress() const noexcept;
+
+        [[nodiscard]] constexpr friend bool operator==(const ReverseMove& lhs, const ReverseMove& rhs) noexcept
+        {
+            return lhs.move == rhs.move
+                && lhs.capturedPiece == rhs.capturedPiece
+                && lhs.oldEpSquare == rhs.oldEpSquare
+                && lhs.oldCastlingRights == rhs.oldCastlingRights;
+        }
+
+        [[nodiscard]] constexpr friend bool operator!=(const ReverseMove& lhs, const ReverseMove& rhs) noexcept
+        {
+            return !(lhs == rhs);
+        }
+    };
+
+    static_assert(sizeof(ReverseMove) == 7);
+
+    struct CompressedReverseMove
+    {
+    private:
+        // we use 7 bits because it can be Square::none()
+        static constexpr std::uint32_t squareMask = 0b1111111u;
+        static constexpr std::uint32_t pieceMask = 0b1111u;
+        static constexpr std::uint32_t castlingRightsMask = 0b1111;
+    public:
+
+        constexpr CompressedReverseMove() noexcept :
+            m_move{},
+            m_oldState{}
+        {
+        }
+
+        constexpr CompressedReverseMove(const ReverseMove& rm) noexcept :
+            m_move(rm.move.compress()),
+            m_oldState{ static_cast<uint16_t>(
+                ((ordinal(rm.capturedPiece) & pieceMask) << 11)
+                | ((ordinal(rm.oldCastlingRights) & castlingRightsMask) << 7)
+                | (ordinal(rm.oldEpSquare) & squareMask)
+                )
+            }
+        {
+        }
+
+        [[nodiscard]] constexpr Move move() const
+        {
+            return m_move.decompress();
+        }
+
+        [[nodiscard]] const CompressedMove& compressedMove() const
+        {
+            return m_move;
+        }
+
+        [[nodiscard]] constexpr Piece capturedPiece() const
+        {
+            return fromOrdinal<Piece>(m_oldState >> 11);
+        }
+
+        [[nodiscard]] constexpr CastlingRights oldCastlingRights() const
+        {
+            return fromOrdinal<CastlingRights>((m_oldState >> 7) & castlingRightsMask);
+        }
+
+        [[nodiscard]] constexpr Square oldEpSquare() const
+        {
+            return fromOrdinal<Square>(m_oldState & squareMask);
+        }
+
+        [[nodiscard]] constexpr ReverseMove decompress() const noexcept
+        {
+            const Piece capturedPiece = fromOrdinal<Piece>(m_oldState >> 11);
+            const CastlingRights castlingRights = fromOrdinal<CastlingRights>((m_oldState >> 7) & castlingRightsMask);
+            // We could pack the ep square more, but don't have to, because
+            // can't save another byte anyway.
+            const Square epSquare = fromOrdinal<Square>(m_oldState & squareMask);
+
+            return ReverseMove(m_move.decompress(), capturedPiece, epSquare, castlingRights);
+        }
+
+    private:
+        CompressedMove m_move;
+        std::uint16_t m_oldState;
+    };
+
+    static_assert(sizeof(CompressedReverseMove) == 4);
+
+    [[nodiscard]] constexpr CompressedReverseMove ReverseMove::compress() const noexcept
+    {
+        return CompressedReverseMove(*this);
+    }
+
+    // This can be regarded as a perfect hash. Going back is hard.
+    struct PackedReverseMove
+    {
+        static constexpr std::uint32_t mask = 0x7FFFFFFu;
+        static constexpr std::size_t numBits = 27;
+
+    private:
+        static constexpr std::uint32_t squareMask = 0b111111u;
+        static constexpr std::uint32_t pieceMask = 0b1111u;
+        static constexpr std::uint32_t pieceTypeMask = 0b111u;
+        static constexpr std::uint32_t castlingRightsMask = 0b1111;
+        static constexpr std::uint32_t fileMask = 0b111;
+
+    public:
+        constexpr PackedReverseMove(const std::uint32_t packed) :
+            m_packed(packed)
+        {
+
+        }
+
+        constexpr PackedReverseMove(const ReverseMove& reverseMove) :
+            m_packed(
+                0u
+                // The only move when square is none() is null move and
+                // then both squares are none(). No other move is like that
+                // so we don't lose any information by storing only
+                // the 6 bits of each square.
+                | ((ordinal(reverseMove.move.from) & squareMask) << 21)
+                | ((ordinal(reverseMove.move.to) & squareMask) << 15)
+                // Other masks are just for code clarity, they should
+                // never change the values.
+                | ((ordinal(reverseMove.capturedPiece) & pieceMask) << 11)
+                | ((ordinal(reverseMove.oldCastlingRights) & castlingRightsMask) << 7)
+                | ((ordinal(reverseMove.move.promotedPiece.type()) & pieceTypeMask) << 4)
+                | (((reverseMove.oldEpSquare != Square::none()) & 1) << 3)
+                // We probably could omit the squareMask here but for clarity it's left.
+                | (ordinal(Square(ordinal(reverseMove.oldEpSquare) & squareMask).file()) & fileMask)
+            )
+        {
+        }
+
+        constexpr std::uint32_t packed() const
+        {
+            return m_packed;
+        }
+
+        constexpr ReverseMove unpack(Color sideThatMoved) const
+        {
+            ReverseMove rmove{};
+
+            rmove.move.from = fromOrdinal<Square>((m_packed >> 21) & squareMask);
+            rmove.move.to = fromOrdinal<Square>((m_packed >> 15) & squareMask);
+            rmove.capturedPiece = fromOrdinal<Piece>((m_packed >> 11) & pieceMask);
+            rmove.oldCastlingRights = fromOrdinal<CastlingRights>((m_packed >> 7) & castlingRightsMask);
+            const PieceType promotedPieceType = fromOrdinal<PieceType>((m_packed >> 4) & pieceTypeMask);
+            if (promotedPieceType != PieceType::None)
+            {
+                rmove.move.promotedPiece = Piece(promotedPieceType, sideThatMoved);
+                rmove.move.type = MoveType::Promotion;
+            }
+            const bool hasEpSquare = static_cast<bool>((m_packed >> 3) & 1);
+            if (hasEpSquare)
+            {
+                // ep square is always where the opponent moved
+                const Rank rank =
+                    sideThatMoved == Color::White
+                    ? rank6
+                    : rank3;
+                const File file = fromOrdinal<File>(m_packed & fileMask);
+                rmove.oldEpSquare = Square(file, rank);
+                if (rmove.oldEpSquare == rmove.move.to)
+                {
+                    rmove.move.type = MoveType::EnPassant;
+                }
+            }
+            else
+            {
+                rmove.oldEpSquare = Square::none();
+            }
+
+            if (rmove.move.type == MoveType::Normal && rmove.oldCastlingRights != CastlingRights::None)
+            {
+                // If castling was possible then we know it was the king that moved from e1/e8.
+                if (rmove.move.from == e1)
+                {
+                    if (rmove.move.to == h1 || rmove.move.to == a1)
+                    {
+                        rmove.move.type = MoveType::Castle;
+                    }
+                }
+                else if (rmove.move.from == e8)
+                {
+                    if (rmove.move.to == h8 || rmove.move.to == a8)
+                    {
+                        rmove.move.type = MoveType::Castle;
+                    }
+                }
+            }
+
+            return rmove;
+        }
+
+    private:
+        // Uses only 27 lowest bits.
+        // Bit meaning from highest to lowest.
+        // - 6 bits from
+        // - 6 bits to
+        // - 4 bits for the captured piece
+        // - 4 bits for prev castling rights
+        // - 3 bits promoted piece type
+        // - 1 bit  to specify if the ep square was valid (false if none())
+        // - 3 bits for prev ep square file
+        std::uint32_t m_packed;
+    };
+
+    struct MoveCompareLess
+    {
+        [[nodiscard]] bool operator()(const Move& lhs, const Move& rhs) const noexcept
+        {
+            if (ordinal(lhs.from) < ordinal(rhs.from)) return true;
+            if (ordinal(lhs.from) > ordinal(rhs.from)) return false;
+
+            if (ordinal(lhs.to) < ordinal(rhs.to)) return true;
+            if (ordinal(lhs.to) > ordinal(rhs.to)) return false;
+
+            if (ordinal(lhs.type) < ordinal(rhs.type)) return true;
+            if (ordinal(lhs.type) > ordinal(rhs.type)) return false;
+
+            if (ordinal(lhs.promotedPiece) < ordinal(rhs.promotedPiece)) return true;
+
+            return false;
+        }
+    };
+
+    struct ReverseMoveCompareLess
+    {
+        [[nodiscard]] bool operator()(const ReverseMove& lhs, const ReverseMove& rhs) const noexcept
+        {
+            if (MoveCompareLess{}(lhs.move, rhs.move)) return true;
+            if (MoveCompareLess{}(rhs.move, lhs.move)) return false;
+
+            if (ordinal(lhs.capturedPiece) < ordinal(rhs.capturedPiece)) return true;
+            if (ordinal(lhs.capturedPiece) > ordinal(rhs.capturedPiece)) return false;
+
+            if (static_cast<unsigned>(lhs.oldCastlingRights) < static_cast<unsigned>(rhs.oldCastlingRights)) return true;
+            if (static_cast<unsigned>(lhs.oldCastlingRights) > static_cast<unsigned>(rhs.oldCastlingRights)) return false;
+
+            if (ordinal(lhs.oldEpSquare) < ordinal(rhs.oldEpSquare)) return true;
+            if (ordinal(lhs.oldEpSquare) > ordinal(rhs.oldEpSquare)) return false;
+
+            return false;
+        }
+    };
+
+    struct BitboardIterator
+    {
+        using value_type = Square;
+        using difference_type = std::ptrdiff_t;
+        using reference = Square;
+        using iterator_category = std::input_iterator_tag;
+        using pointer = const Square*;
+
+        constexpr BitboardIterator() noexcept :
+            m_squares(0)
+        {
+        }
+
+        constexpr BitboardIterator(std::uint64_t v) noexcept :
+            m_squares(v)
+        {
+        }
+
+        constexpr BitboardIterator(const BitboardIterator&) = default;
+        constexpr BitboardIterator(BitboardIterator&&) = default;
+        constexpr BitboardIterator& operator=(const BitboardIterator&) = default;
+        constexpr BitboardIterator& operator=(BitboardIterator&&) = default;
+
+        [[nodiscard]] constexpr bool friend operator==(BitboardIterator lhs, BitboardIterator rhs) noexcept
+        {
+            return lhs.m_squares == rhs.m_squares;
+        }
+
+        [[nodiscard]] constexpr bool friend operator!=(BitboardIterator lhs, BitboardIterator rhs) noexcept
+        {
+            return lhs.m_squares != rhs.m_squares;
+        }
+
+        [[nodiscard]] inline Square operator*() const
+        {
+            return first();
+        }
+
+        constexpr BitboardIterator& operator++() noexcept
+        {
+            popFirst();
+            return *this;
+        }
+
+    private:
+        std::uint64_t m_squares;
+
+        constexpr void popFirst() noexcept
+        {
+            m_squares &= m_squares - 1;
+        }
+
+        [[nodiscard]] inline Square first() const
+        {
+            assert(m_squares != 0);
+
+            return fromOrdinal<Square>(intrin::lsb(m_squares));
+        }
+    };
+
+    struct Bitboard
+    {
+        // bits counted from the LSB
+        // order is A1 B2 ... G8 H8
+        // just like in Square
+
+    public:
+        constexpr Bitboard() noexcept :
+            m_squares(0)
+        {
+        }
+
+    private:
+        constexpr explicit Bitboard(Square sq) noexcept :
+            m_squares(static_cast<std::uint64_t>(1ULL) << ordinal(sq))
+        {
+            assert(sq.isOk());
+        }
+
+        constexpr explicit Bitboard(Rank r) noexcept :
+            m_squares(static_cast<std::uint64_t>(0xFFULL) << (ordinal(r) * 8))
+        {
+        }
+
+        constexpr explicit Bitboard(File f) noexcept :
+            m_squares(static_cast<std::uint64_t>(0x0101010101010101ULL) << ordinal(f))
+        {
+        }
+
+        constexpr explicit Bitboard(Color c) noexcept :
+            m_squares(c == Color::White ? 0xAA55AA55AA55AA55ULL : ~0xAA55AA55AA55AA55ULL)
+        {
+        }
+
+        constexpr explicit Bitboard(std::uint64_t bb) noexcept :
+            m_squares(bb)
+        {
+        }
+
+        // files A..file inclusive
+        static constexpr EnumArray<File, std::uint64_t> m_filesUpToBB{
+            0x0101010101010101ULL,
+            0x0303030303030303ULL,
+            0x0707070707070707ULL,
+            0x0F0F0F0F0F0F0F0FULL,
+            0x1F1F1F1F1F1F1F1FULL,
+            0x3F3F3F3F3F3F3F3FULL,
+            0x7F7F7F7F7F7F7F7FULL,
+            0xFFFFFFFFFFFFFFFFULL
+        };
+
+    public:
+
+        [[nodiscard]] static constexpr Bitboard none()
+        {
+            return Bitboard{};
+        }
+
+        [[nodiscard]] static constexpr Bitboard all()
+        {
+            return ~none();
+        }
+
+        [[nodiscard]] static constexpr Bitboard square(Square sq)
+        {
+            return Bitboard(sq);
+        }
+
+        [[nodiscard]] static constexpr Bitboard file(File f)
+        {
+            return Bitboard(f);
+        }
+
+        [[nodiscard]] static constexpr Bitboard rank(Rank r)
+        {
+            return Bitboard(r);
+        }
+
+        [[nodiscard]] static constexpr Bitboard color(Color c)
+        {
+            return Bitboard(c);
+        }
+
+        [[nodiscard]] static constexpr Bitboard fromBits(std::uint64_t bits)
+        {
+            return Bitboard(bits);
+        }
+
+        // inclusive
+        [[nodiscard]] static constexpr Bitboard betweenFiles(File left, File right)
+        {
+            assert(left <= right);
+
+            if (left == fileA)
+            {
+                return Bitboard::fromBits(m_filesUpToBB[right]);
+            }
+            else
+            {
+                return Bitboard::fromBits(m_filesUpToBB[right] ^ m_filesUpToBB[left - 1]);
+            }
+        }
+
+        [[nodiscard]] constexpr bool isEmpty() const
+        {
+            return m_squares == 0;
+        }
+
+        [[nodiscard]] constexpr bool isSet(Square sq) const
+        {
+            return !!((m_squares >> ordinal(sq)) & 1ull);
+        }
+
+        constexpr void set(Square sq)
+        {
+            *this |= Bitboard(sq);
+        }
+
+        constexpr void unset(Square sq)
+        {
+            *this &= ~(Bitboard(sq));
+        }
+
+        constexpr void toggle(Square sq)
+        {
+            *this ^= Bitboard(sq);
+        }
+
+        [[nodiscard]] constexpr BitboardIterator begin() const
+        {
+            return BitboardIterator(m_squares);
+        }
+
+        [[nodiscard]] constexpr BitboardIterator end() const
+        {
+            return BitboardIterator{};
+        }
+
+        [[nodiscard]] constexpr BitboardIterator cbegin() const
+        {
+            return BitboardIterator(m_squares);
+        }
+
+        [[nodiscard]] constexpr BitboardIterator cend() const
+        {
+            return BitboardIterator{};
+        }
+
+        [[nodiscard]] constexpr bool friend operator==(Bitboard lhs, Bitboard rhs) noexcept
+        {
+            return lhs.m_squares == rhs.m_squares;
+        }
+
+        [[nodiscard]] constexpr bool friend operator!=(Bitboard lhs, Bitboard rhs) noexcept
+        {
+            return lhs.m_squares != rhs.m_squares;
+        }
+
+        constexpr Bitboard shiftedVertically(int ranks) const
+        {
+            if (ranks >= 0)
+            {
+                return fromBits(m_squares << 8 * ranks);
+            }
+            else
+            {
+                return fromBits(m_squares >> -8 * ranks);
+            }
+        }
+
+        template <int files, int ranks>
+        constexpr void shift()
+        {
+            static_assert(files >= -7);
+            static_assert(ranks >= -7);
+            static_assert(files <= 7);
+            static_assert(ranks <= 7);
+
+            if constexpr (files != 0)
+            {
+                constexpr Bitboard mask =
+                    files > 0
+                    ? Bitboard::betweenFiles(fileA, fileH - files)
+                    : Bitboard::betweenFiles(fileA - files, fileH);
+
+                m_squares &= mask.m_squares;
+            }
+
+            constexpr int shift = files + ranks * 8;
+            if constexpr (shift == 0)
+            {
+                return;
+            }
+
+            if constexpr (shift < 0)
+            {
+                m_squares >>= -shift;
+            }
+            else
+            {
+                m_squares <<= shift;
+            }
+        }
+
+        template <int files, int ranks>
+        constexpr Bitboard shifted() const
+        {
+            Bitboard bbCpy(*this);
+            bbCpy.shift<files, ranks>();
+            return bbCpy;
+        }
+
+        constexpr void shift(Offset offset)
+        {
+            assert(offset.files >= -7);
+            assert(offset.ranks >= -7);
+            assert(offset.files <= 7);
+            assert(offset.ranks <= 7);
+
+            if (offset.files != 0)
+            {
+                const Bitboard mask =
+                    offset.files > 0
+                    ? Bitboard::betweenFiles(fileA, fileH - offset.files)
+                    : Bitboard::betweenFiles(fileA - offset.files, fileH);
+
+                m_squares &= mask.m_squares;
+            }
+
+            const int shift = offset.files + offset.ranks * 8;
+            if (shift < 0)
+            {
+                m_squares >>= -shift;
+            }
+            else
+            {
+                m_squares <<= shift;
+            }
+        }
+
+        [[nodiscard]] constexpr Bitboard shifted(Offset offset) const
+        {
+            Bitboard bbCpy(*this);
+            bbCpy.shift(offset);
+            return bbCpy;
+        }
+
+        [[nodiscard]] constexpr Bitboard operator~() const
+        {
+            Bitboard bb = *this;
+            bb.m_squares = ~m_squares;
+            return bb;
+        }
+
+        constexpr Bitboard& operator^=(Color c)
+        {
+            m_squares ^= Bitboard(c).m_squares;
+            return *this;
+        }
+
+        constexpr Bitboard& operator&=(Color c)
+        {
+            m_squares &= Bitboard(c).m_squares;
+            return *this;
+        }
+
+        constexpr Bitboard& operator|=(Color c)
+        {
+            m_squares |= Bitboard(c).m_squares;
+            return *this;
+        }
+
+        [[nodiscard]] constexpr Bitboard operator^(Color c) const
+        {
+            Bitboard bb = *this;
+            bb ^= c;
+            return bb;
+        }
+
+        [[nodiscard]] constexpr Bitboard operator&(Color c) const
+        {
+            Bitboard bb = *this;
+            bb &= c;
+            return bb;
+        }
+
+        [[nodiscard]] constexpr Bitboard operator|(Color c) const
+        {
+            Bitboard bb = *this;
+            bb |= c;
+            return bb;
+        }
+
+        constexpr Bitboard& operator^=(Square sq)
+        {
+            m_squares ^= Bitboard(sq).m_squares;
+            return *this;
+        }
+
+        constexpr Bitboard& operator&=(Square sq)
+        {
+            m_squares &= Bitboard(sq).m_squares;
+            return *this;
+        }
+
+        constexpr Bitboard& operator|=(Square sq)
+        {
+            m_squares |= Bitboard(sq).m_squares;
+            return *this;
+        }
+
+        [[nodiscard]] constexpr Bitboard operator^(Square sq) const
+        {
+            Bitboard bb = *this;
+            bb ^= sq;
+            return bb;
+        }
+
+        [[nodiscard]] constexpr Bitboard operator&(Square sq) const
+        {
+            Bitboard bb = *this;
+            bb &= sq;
+            return bb;
+        }
+
+        [[nodiscard]] constexpr Bitboard operator|(Square sq) const
+        {
+            Bitboard bb = *this;
+            bb |= sq;
+            return bb;
+        }
+
+        [[nodiscard]] constexpr friend Bitboard operator^(Square sq, Bitboard bb)
+        {
+            return bb ^ sq;
+        }
+
+        [[nodiscard]] constexpr friend Bitboard operator&(Square sq, Bitboard bb)
+        {
+            return bb & sq;
+        }
+
+        [[nodiscard]] constexpr friend Bitboard operator|(Square sq, Bitboard bb)
+        {
+            return bb | sq;
+        }
+
+        constexpr Bitboard& operator^=(Bitboard rhs)
+        {
+            m_squares ^= rhs.m_squares;
+            return *this;
+        }
+
+        constexpr Bitboard& operator&=(Bitboard rhs)
+        {
+            m_squares &= rhs.m_squares;
+            return *this;
+        }
+
+        constexpr Bitboard& operator|=(Bitboard rhs)
+        {
+            m_squares |= rhs.m_squares;
+            return *this;
+        }
+
+        [[nodiscard]] constexpr Bitboard operator^(Bitboard sq) const
+        {
+            Bitboard bb = *this;
+            bb ^= sq;
+            return bb;
+        }
+
+        [[nodiscard]] constexpr Bitboard operator&(Bitboard sq) const
+        {
+            Bitboard bb = *this;
+            bb &= sq;
+            return bb;
+        }
+
+        [[nodiscard]] constexpr Bitboard operator|(Bitboard sq) const
+        {
+            Bitboard bb = *this;
+            bb |= sq;
+            return bb;
+        }
+
+        [[nodiscard]] inline int count() const
+        {
+            return static_cast<int>(intrin::popcount(m_squares));
+        }
+
+        [[nodiscard]] constexpr bool moreThanOne() const
+        {
+            return !!(m_squares & (m_squares - 1));
+        }
+
+        [[nodiscard]] constexpr bool exactlyOne() const
+        {
+            return m_squares != 0 && !moreThanOne();
+        }
+
+        [[nodiscard]] constexpr bool any() const
+        {
+            return !!m_squares;
+        }
+
+        [[nodiscard]] inline Square first() const
+        {
+            assert(m_squares != 0);
+
+            return fromOrdinal<Square>(intrin::lsb(m_squares));
+        }
+
+        [[nodiscard]] inline Square nth(int n) const
+        {
+            assert(count() > n);
+
+            Bitboard cpy = *this;
+            while (n--) cpy.popFirst();
+            return cpy.first();
+        }
+
+        [[nodiscard]] inline Square last() const
+        {
+            assert(m_squares != 0);
+
+            return fromOrdinal<Square>(intrin::msb(m_squares));
+        }
+
+        [[nodiscard]] constexpr std::uint64_t bits() const
+        {
+            return m_squares;
+        }
+
+        constexpr void popFirst()
+        {
+            assert(m_squares != 0);
+
+            m_squares &= m_squares - 1;
+        }
+
+        constexpr Bitboard& operator=(const Bitboard& other) = default;
+
+    private:
+        std::uint64_t m_squares;
+    };
+
+    [[nodiscard]] constexpr Bitboard operator^(Square sq0, Square sq1)
+    {
+        return Bitboard::square(sq0) ^ sq1;
+    }
+
+    [[nodiscard]] constexpr Bitboard operator&(Square sq0, Square sq1)
+    {
+        return Bitboard::square(sq0) & sq1;
+    }
+
+    [[nodiscard]] constexpr Bitboard operator|(Square sq0, Square sq1)
+    {
+        return Bitboard::square(sq0) | sq1;
+    }
+
+    [[nodiscard]] constexpr Bitboard operator""_bb(std::uint64_t bits)
+    {
+        return Bitboard::fromBits(bits);
+    }
+
+    namespace bb
+    {
+        namespace fancy_magics
+        {
+            // Implementation based on https://github.com/syzygy1/Cfish
+
+            alignas(64) constexpr EnumArray<Square, std::uint64_t> g_rookMagics{ {
+                0x0A80004000801220ull,
+                0x8040004010002008ull,
+                0x2080200010008008ull,
+                0x1100100008210004ull,
+                0xC200209084020008ull,
+                0x2100010004000208ull,
+                0x0400081000822421ull,
+                0x0200010422048844ull,
+                0x0800800080400024ull,
+                0x0001402000401000ull,
+                0x3000801000802001ull,
+                0x4400800800100083ull,
+                0x0904802402480080ull,
+                0x4040800400020080ull,
+                0x0018808042000100ull,
+                0x4040800080004100ull,
+                0x0040048001458024ull,
+                0x00A0004000205000ull,
+                0x3100808010002000ull,
+                0x4825010010000820ull,
+                0x5004808008000401ull,
+                0x2024818004000A00ull,
+                0x0005808002000100ull,
+                0x2100060004806104ull,
+                0x0080400880008421ull,
+                0x4062220600410280ull,
+                0x010A004A00108022ull,
+                0x0000100080080080ull,
+                0x0021000500080010ull,
+                0x0044000202001008ull,
+                0x0000100400080102ull,
+                0xC020128200040545ull,
+                0x0080002000400040ull,
+                0x0000804000802004ull,
+                0x0000120022004080ull,
+                0x010A386103001001ull,
+                0x9010080080800400ull,
+                0x8440020080800400ull,
+                0x0004228824001001ull,
+                0x000000490A000084ull,
+                0x0080002000504000ull,
+                0x200020005000C000ull,
+                0x0012088020420010ull,
+                0x0010010080080800ull,
+                0x0085001008010004ull,
+                0x0002000204008080ull,
+                0x0040413002040008ull,
+                0x0000304081020004ull,
+                0x0080204000800080ull,
+                0x3008804000290100ull,
+                0x1010100080200080ull,
+                0x2008100208028080ull,
+                0x5000850800910100ull,
+                0x8402019004680200ull,
+                0x0120911028020400ull,
+                0x0000008044010200ull,
+                0x0020850200244012ull,
+                0x0020850200244012ull,
+                0x0000102001040841ull,
+                0x140900040A100021ull,
+                0x000200282410A102ull,
+                0x000200282410A102ull,
+                0x000200282410A102ull,
+                0x4048240043802106ull
+                    } };
+            alignas(64) extern EnumArray<Square, Bitboard> g_rookMasks;
+            alignas(64) extern EnumArray<Square, std::uint8_t> g_rookShifts;
+            alignas(64) extern EnumArray<Square, const Bitboard*> g_rookAttacks;
+
+            alignas(64) constexpr EnumArray<Square, std::uint64_t> g_bishopMagics{ {
+                0x40106000A1160020ull,
+                0x0020010250810120ull,
+                0x2010010220280081ull,
+                0x002806004050C040ull,
+                0x0002021018000000ull,
+                0x2001112010000400ull,
+                0x0881010120218080ull,
+                0x1030820110010500ull,
+                0x0000120222042400ull,
+                0x2000020404040044ull,
+                0x8000480094208000ull,
+                0x0003422A02000001ull,
+                0x000A220210100040ull,
+                0x8004820202226000ull,
+                0x0018234854100800ull,
+                0x0100004042101040ull,
+                0x0004001004082820ull,
+                0x0010000810010048ull,
+                0x1014004208081300ull,
+                0x2080818802044202ull,
+                0x0040880C00A00100ull,
+                0x0080400200522010ull,
+                0x0001000188180B04ull,
+                0x0080249202020204ull,
+                0x1004400004100410ull,
+                0x00013100A0022206ull,
+                0x2148500001040080ull,
+                0x4241080011004300ull,
+                0x4020848004002000ull,
+                0x10101380D1004100ull,
+                0x0008004422020284ull,
+                0x01010A1041008080ull,
+                0x0808080400082121ull,
+                0x0808080400082121ull,
+                0x0091128200100C00ull,
+                0x0202200802010104ull,
+                0x8C0A020200440085ull,
+                0x01A0008080B10040ull,
+                0x0889520080122800ull,
+                0x100902022202010Aull,
+                0x04081A0816002000ull,
+                0x0000681208005000ull,
+                0x8170840041008802ull,
+                0x0A00004200810805ull,
+                0x0830404408210100ull,
+                0x2602208106006102ull,
+                0x1048300680802628ull,
+                0x2602208106006102ull,
+                0x0602010120110040ull,
+                0x0941010801043000ull,
+                0x000040440A210428ull,
+                0x0008240020880021ull,
+                0x0400002012048200ull,
+                0x00AC102001210220ull,
+                0x0220021002009900ull,
+                0x84440C080A013080ull,
+                0x0001008044200440ull,
+                0x0004C04410841000ull,
+                0x2000500104011130ull,
+                0x1A0C010011C20229ull,
+                0x0044800112202200ull,
+                0x0434804908100424ull,
+                0x0300404822C08200ull,
+                0x48081010008A2A80ull
+            } };
+            alignas(64) extern EnumArray<Square, Bitboard> g_bishopMasks;
+            alignas(64) extern EnumArray<Square, std::uint8_t> g_bishopShifts;
+            alignas(64) extern EnumArray<Square, const Bitboard*> g_bishopAttacks;
+
+            inline Bitboard bishopAttacks(Square s, Bitboard occupied)
+            {
+                const std::size_t idx =
+                    (occupied & fancy_magics::g_bishopMasks[s]).bits()
+                    * fancy_magics::g_bishopMagics[s]
+                    >> fancy_magics::g_bishopShifts[s];
+
+                return fancy_magics::g_bishopAttacks[s][idx];
+            }
+
+            inline Bitboard rookAttacks(Square s, Bitboard occupied)
+            {
+                const std::size_t idx =
+                    (occupied & fancy_magics::g_rookMasks[s]).bits()
+                    * fancy_magics::g_rookMagics[s]
+                    >> fancy_magics::g_rookShifts[s];
+
+                return fancy_magics::g_rookAttacks[s][idx];
+            }
+        }
+
+        [[nodiscard]] constexpr Bitboard square(Square sq)
+        {
+            return Bitboard::square(sq);
+        }
+
+        [[nodiscard]] constexpr Bitboard rank(Rank rank)
+        {
+            return Bitboard::rank(rank);
+        }
+
+        [[nodiscard]] constexpr Bitboard file(File file)
+        {
+            return Bitboard::file(file);
+        }
+
+        [[nodiscard]] constexpr Bitboard color(Color c)
+        {
+            return Bitboard::color(c);
+        }
+
+        [[nodiscard]] constexpr Bitboard before(Square sq)
+        {
+            return Bitboard::fromBits(nbitmask<std::uint64_t>[ordinal(sq)]);
+        }
+
+        constexpr Bitboard lightSquares = bb::color(Color::White);
+        constexpr Bitboard darkSquares = bb::color(Color::Black);
+
+        constexpr Bitboard fileA = bb::file(chess::fileA);
+        constexpr Bitboard fileB = bb::file(chess::fileB);
+        constexpr Bitboard fileC = bb::file(chess::fileC);
+        constexpr Bitboard fileD = bb::file(chess::fileD);
+        constexpr Bitboard fileE = bb::file(chess::fileE);
+        constexpr Bitboard fileF = bb::file(chess::fileF);
+        constexpr Bitboard fileG = bb::file(chess::fileG);
+        constexpr Bitboard fileH = bb::file(chess::fileH);
+
+        constexpr Bitboard rank1 = bb::rank(chess::rank1);
+        constexpr Bitboard rank2 = bb::rank(chess::rank2);
+        constexpr Bitboard rank3 = bb::rank(chess::rank3);
+        constexpr Bitboard rank4 = bb::rank(chess::rank4);
+        constexpr Bitboard rank5 = bb::rank(chess::rank5);
+        constexpr Bitboard rank6 = bb::rank(chess::rank6);
+        constexpr Bitboard rank7 = bb::rank(chess::rank7);
+        constexpr Bitboard rank8 = bb::rank(chess::rank8);
+
+        constexpr Bitboard a1 = bb::square(chess::a1);
+        constexpr Bitboard a2 = bb::square(chess::a2);
+        constexpr Bitboard a3 = bb::square(chess::a3);
+        constexpr Bitboard a4 = bb::square(chess::a4);
+        constexpr Bitboard a5 = bb::square(chess::a5);
+        constexpr Bitboard a6 = bb::square(chess::a6);
+        constexpr Bitboard a7 = bb::square(chess::a7);
+        constexpr Bitboard a8 = bb::square(chess::a8);
+
+        constexpr Bitboard b1 = bb::square(chess::b1);
+        constexpr Bitboard b2 = bb::square(chess::b2);
+        constexpr Bitboard b3 = bb::square(chess::b3);
+        constexpr Bitboard b4 = bb::square(chess::b4);
+        constexpr Bitboard b5 = bb::square(chess::b5);
+        constexpr Bitboard b6 = bb::square(chess::b6);
+        constexpr Bitboard b7 = bb::square(chess::b7);
+        constexpr Bitboard b8 = bb::square(chess::b8);
+
+        constexpr Bitboard c1 = bb::square(chess::c1);
+        constexpr Bitboard c2 = bb::square(chess::c2);
+        constexpr Bitboard c3 = bb::square(chess::c3);
+        constexpr Bitboard c4 = bb::square(chess::c4);
+        constexpr Bitboard c5 = bb::square(chess::c5);
+        constexpr Bitboard c6 = bb::square(chess::c6);
+        constexpr Bitboard c7 = bb::square(chess::c7);
+        constexpr Bitboard c8 = bb::square(chess::c8);
+
+        constexpr Bitboard d1 = bb::square(chess::d1);
+        constexpr Bitboard d2 = bb::square(chess::d2);
+        constexpr Bitboard d3 = bb::square(chess::d3);
+        constexpr Bitboard d4 = bb::square(chess::d4);
+        constexpr Bitboard d5 = bb::square(chess::d5);
+        constexpr Bitboard d6 = bb::square(chess::d6);
+        constexpr Bitboard d7 = bb::square(chess::d7);
+        constexpr Bitboard d8 = bb::square(chess::d8);
+
+        constexpr Bitboard e1 = bb::square(chess::e1);
+        constexpr Bitboard e2 = bb::square(chess::e2);
+        constexpr Bitboard e3 = bb::square(chess::e3);
+        constexpr Bitboard e4 = bb::square(chess::e4);
+        constexpr Bitboard e5 = bb::square(chess::e5);
+        constexpr Bitboard e6 = bb::square(chess::e6);
+        constexpr Bitboard e7 = bb::square(chess::e7);
+        constexpr Bitboard e8 = bb::square(chess::e8);
+
+        constexpr Bitboard f1 = bb::square(chess::f1);
+        constexpr Bitboard f2 = bb::square(chess::f2);
+        constexpr Bitboard f3 = bb::square(chess::f3);
+        constexpr Bitboard f4 = bb::square(chess::f4);
+        constexpr Bitboard f5 = bb::square(chess::f5);
+        constexpr Bitboard f6 = bb::square(chess::f6);
+        constexpr Bitboard f7 = bb::square(chess::f7);
+        constexpr Bitboard f8 = bb::square(chess::f8);
+
+        constexpr Bitboard g1 = bb::square(chess::g1);
+        constexpr Bitboard g2 = bb::square(chess::g2);
+        constexpr Bitboard g3 = bb::square(chess::g3);
+        constexpr Bitboard g4 = bb::square(chess::g4);
+        constexpr Bitboard g5 = bb::square(chess::g5);
+        constexpr Bitboard g6 = bb::square(chess::g6);
+        constexpr Bitboard g7 = bb::square(chess::g7);
+        constexpr Bitboard g8 = bb::square(chess::g8);
+
+        constexpr Bitboard h1 = bb::square(chess::h1);
+        constexpr Bitboard h2 = bb::square(chess::h2);
+        constexpr Bitboard h3 = bb::square(chess::h3);
+        constexpr Bitboard h4 = bb::square(chess::h4);
+        constexpr Bitboard h5 = bb::square(chess::h5);
+        constexpr Bitboard h6 = bb::square(chess::h6);
+        constexpr Bitboard h7 = bb::square(chess::h7);
+        constexpr Bitboard h8 = bb::square(chess::h8);
+
+        [[nodiscard]] Bitboard between(Square s1, Square s2);
+
+        [[nodiscard]] Bitboard line(Square s1, Square s2);
+
+        template <PieceType PieceTypeV>
+        [[nodiscard]] Bitboard pseudoAttacks(Square sq);
+
+        [[nodiscard]] Bitboard pseudoAttacks(PieceType pt, Square sq);
+
+        template <PieceType PieceTypeV>
+        Bitboard attacks(Square sq, Bitboard occupied)
+        {
+            static_assert(PieceTypeV != PieceType::None && PieceTypeV != PieceType::Pawn);
+
+            assert(sq.isOk());
+
+            if constexpr (PieceTypeV == PieceType::Bishop)
+            {
+                return fancy_magics::bishopAttacks(sq, occupied);
+            }
+            else if constexpr (PieceTypeV == PieceType::Rook)
+            {
+                return fancy_magics::rookAttacks(sq, occupied);
+            }
+            else if constexpr (PieceTypeV == PieceType::Queen)
+            {
+                return
+                    fancy_magics::bishopAttacks(sq, occupied)
+                    | fancy_magics::rookAttacks(sq, occupied);
+            }
+            else
+            {
+                return pseudoAttacks<PieceTypeV>(sq);
+            }
+        }
+
+        [[nodiscard]] inline Bitboard attacks(PieceType pt, Square sq, Bitboard occupied)
+        {
+            assert(sq.isOk());
+
+            switch (pt)
+            {
+            case PieceType::Bishop:
+                return attacks<PieceType::Bishop>(sq, occupied);
+            case PieceType::Rook:
+                return attacks<PieceType::Rook>(sq, occupied);
+            case PieceType::Queen:
+                return attacks<PieceType::Queen>(sq, occupied);
+            default:
+                return pseudoAttacks(pt, sq);
+            }
+        }
+
+        [[nodiscard]] inline Bitboard pawnAttacks(Bitboard pawns, Color color);
+
+        [[nodiscard]] inline Bitboard westPawnAttacks(Bitboard pawns, Color color);
+
+        [[nodiscard]] inline Bitboard eastPawnAttacks(Bitboard pawns, Color color);
+
+        [[nodiscard]] inline bool isAttackedBySlider(
+            Square sq,
+            Bitboard bishops,
+            Bitboard rooks,
+            Bitboard queens,
+            Bitboard occupied
+        );
+
+        namespace detail
+        {
+            static constexpr std::array<Offset, 8> knightOffsets{ { {-1, -2}, {-1, 2}, {1, -2}, {1, 2}, {-2, -1}, {-2, 1}, {2, -1}, {2, 1} } };
+            static constexpr std::array<Offset, 8> kingOffsets{ { {-1, -1}, {-1, 0}, {-1, 1}, {0, -1}, {0, 1}, {1, -1}, {1, 0}, {1, 1} } };
+
+            enum Direction
+            {
+                North = 0,
+                NorthEast,
+                East,
+                SouthEast,
+                South,
+                SouthWest,
+                West,
+                NorthWest
+            };
+
+            constexpr std::array<Offset, 8> offsets = { {
+                { 0, 1 },
+                { 1, 1 },
+                { 1, 0 },
+                { 1, -1 },
+                { 0, -1 },
+                { -1, -1 },
+                { -1, 0 },
+                { -1, 1 }
+            } };
+
+            static constexpr std::array<Offset, 4> bishopOffsets{
+                offsets[NorthEast],
+                offsets[SouthEast],
+                offsets[SouthWest],
+                offsets[NorthWest]
+            };
+            static constexpr std::array<Offset, 4> rookOffsets{
+                offsets[North],
+                offsets[East],
+                offsets[South],
+                offsets[West]
+            };
+
+            [[nodiscard]] static EnumArray<Square, Bitboard> generatePseudoAttacks_Pawn()
+            {
+                // pseudo attacks don't make sense for pawns
+                return {};
+            }
+
+            [[nodiscard]] static EnumArray<Square, Bitboard> generatePseudoAttacks_Knight()
+            {
+                EnumArray<Square, Bitboard> bbs{};
+
+                for (Square fromSq = chess::a1; fromSq != Square::none(); ++fromSq)
+                {
+                    Bitboard bb{};
+
+                    for (auto&& offset : knightOffsets)
+                    {
+                        const SquareCoords toSq = fromSq.coords() + offset;
+                        if (toSq.isOk())
+                        {
+                            bb |= Square(toSq);
+                        }
+                    }
+
+                    bbs[fromSq] = bb;
+                }
+
+                return bbs;
+            }
+
+            [[nodiscard]] static Bitboard generateSliderPseudoAttacks(const std::array<Offset, 4> & offsets, Square fromSq)
+            {
+                assert(fromSq.isOk());
+
+                Bitboard bb{};
+
+                for (auto&& offset : offsets)
+                {
+                    SquareCoords fromSqC = fromSq.coords();
+
+                    for (;;)
+                    {
+                        fromSqC += offset;
+
+                        if (!fromSqC.isOk())
+                        {
+                            break;
+                        }
+
+                        bb |= Square(fromSqC);
+                    }
+                }
+
+                return bb;
+            }
+
+            [[nodiscard]] static EnumArray<Square, Bitboard> generatePseudoAttacks_Bishop()
+            {
+                EnumArray<Square, Bitboard> bbs{};
+
+                for (Square fromSq = chess::a1; fromSq != Square::none(); ++fromSq)
+                {
+                    bbs[fromSq] = generateSliderPseudoAttacks(bishopOffsets, fromSq);
+                }
+
+                return bbs;
+            }
+
+            [[nodiscard]] static EnumArray<Square, Bitboard> generatePseudoAttacks_Rook()
+            {
+                EnumArray<Square, Bitboard> bbs{};
+
+                for (Square fromSq = chess::a1; fromSq != Square::none(); ++fromSq)
+                {
+                    bbs[fromSq] = generateSliderPseudoAttacks(rookOffsets, fromSq);
+                }
+
+                return bbs;
+            }
+
+            [[nodiscard]] static EnumArray<Square, Bitboard> generatePseudoAttacks_Queen()
+            {
+                EnumArray<Square, Bitboard> bbs{};
+
+                for (Square fromSq = chess::a1; fromSq != Square::none(); ++fromSq)
+                {
+                    bbs[fromSq] =
+                        generateSliderPseudoAttacks(bishopOffsets, fromSq)
+                        | generateSliderPseudoAttacks(rookOffsets, fromSq);
+                }
+
+                return bbs;
+            }
+
+            [[nodiscard]] static EnumArray<Square, Bitboard> generatePseudoAttacks_King()
+            {
+                EnumArray<Square, Bitboard> bbs{};
+
+                for (Square fromSq = chess::a1; fromSq != Square::none(); ++fromSq)
+                {
+                    Bitboard bb{};
+
+                    for (auto&& offset : kingOffsets)
+                    {
+                        const SquareCoords toSq = fromSq.coords() + offset;
+                        if (toSq.isOk())
+                        {
+                            bb |= Square(toSq);
+                        }
+                    }
+
+                    bbs[fromSq] = bb;
+                }
+
+                return bbs;
+            }
+
+            [[nodiscard]] static EnumArray2<PieceType, Square, Bitboard> generatePseudoAttacks()
+            {
+                return EnumArray2<PieceType, Square, Bitboard>{
+                    generatePseudoAttacks_Pawn(),
+                        generatePseudoAttacks_Knight(),
+                        generatePseudoAttacks_Bishop(),
+                        generatePseudoAttacks_Rook(),
+                        generatePseudoAttacks_Queen(),
+                        generatePseudoAttacks_King()
+                };
+            }
+
+            static const EnumArray2<PieceType, Square, Bitboard> pseudoAttacks = generatePseudoAttacks();
+
+            [[nodiscard]] static Bitboard generatePositiveRayAttacks(Direction dir, Square fromSq)
+            {
+                assert(fromSq.isOk());
+
+                Bitboard bb{};
+
+                const auto offset = offsets[dir];
+                SquareCoords fromSqC = fromSq.coords();
+                for (;;)
+                {
+                    fromSqC += offset;
+
+                    if (!fromSqC.isOk())
+                    {
+                        break;
+                    }
+
+                    bb |= Square(fromSqC);
+                }
+
+                return bb;
+            }
+
+            // classical slider move generation approach https://www.chessprogramming.org/Classical_Approach
+
+            [[nodiscard]] static EnumArray<Square, Bitboard> generatePositiveRayAttacks(Direction dir)
+            {
+                EnumArray<Square, Bitboard> bbs{};
+
+                for (Square fromSq = chess::a1; fromSq != Square::none(); ++fromSq)
+                {
+                    bbs[fromSq] = generatePositiveRayAttacks(dir, fromSq);
+                }
+
+                return bbs;
+            }
+
+            [[nodiscard]] static std::array<EnumArray<Square, Bitboard>, 8> generatePositiveRayAttacks()
+            {
+                std::array<EnumArray<Square, Bitboard>, 8> bbs{};
+
+                bbs[North] = generatePositiveRayAttacks(North);
+                bbs[NorthEast] = generatePositiveRayAttacks(NorthEast);
+                bbs[East] = generatePositiveRayAttacks(East);
+                bbs[SouthEast] = generatePositiveRayAttacks(SouthEast);
+                bbs[South] = generatePositiveRayAttacks(South);
+                bbs[SouthWest] = generatePositiveRayAttacks(SouthWest);
+                bbs[West] = generatePositiveRayAttacks(West);
+                bbs[NorthWest] = generatePositiveRayAttacks(NorthWest);
+
+                return bbs;
+            }
+
+            static const std::array<EnumArray<Square, Bitboard>, 8> positiveRayAttacks = generatePositiveRayAttacks();
+
+            template <Direction DirV>
+            [[nodiscard]] static Bitboard slidingAttacks(Square sq, Bitboard occupied)
+            {
+                assert(sq.isOk());
+
+                Bitboard attacks = positiveRayAttacks[DirV][sq];
+
+                if constexpr (DirV == NorthWest || DirV == North || DirV == NorthEast || DirV == East)
+                {
+                    Bitboard blocker = (attacks & occupied) | h8; // set highest bit (H8) so msb never fails
+                    return attacks ^ positiveRayAttacks[DirV][blocker.first()];
+                }
+                else
+                {
+                    Bitboard blocker = (attacks & occupied) | a1;
+                    return attacks ^ positiveRayAttacks[DirV][blocker.last()];
+                }
+            }
+
+            template Bitboard slidingAttacks<Direction::North>(Square, Bitboard);
+            template Bitboard slidingAttacks<Direction::NorthEast>(Square, Bitboard);
+            template Bitboard slidingAttacks<Direction::East>(Square, Bitboard);
+            template Bitboard slidingAttacks<Direction::SouthEast>(Square, Bitboard);
+            template Bitboard slidingAttacks<Direction::South>(Square, Bitboard);
+            template Bitboard slidingAttacks<Direction::SouthWest>(Square, Bitboard);
+            template Bitboard slidingAttacks<Direction::West>(Square, Bitboard);
+            template Bitboard slidingAttacks<Direction::NorthWest>(Square, Bitboard);
+
+            template <PieceType PieceTypeV>
+            [[nodiscard]] inline Bitboard pieceSlidingAttacks(Square sq, Bitboard occupied)
+            {
+                static_assert(
+                    PieceTypeV == PieceType::Rook
+                    || PieceTypeV == PieceType::Bishop
+                    || PieceTypeV == PieceType::Queen);
+
+                assert(sq.isOk());
+
+                if constexpr (PieceTypeV == PieceType::Bishop)
+                {
+                    return
+                        detail::slidingAttacks<detail::NorthEast>(sq, occupied)
+                        | detail::slidingAttacks<detail::SouthEast>(sq, occupied)
+                        | detail::slidingAttacks<detail::SouthWest>(sq, occupied)
+                        | detail::slidingAttacks<detail::NorthWest>(sq, occupied);
+                }
+                else if constexpr (PieceTypeV == PieceType::Rook)
+                {
+                    return
+                        detail::slidingAttacks<detail::North>(sq, occupied)
+                        | detail::slidingAttacks<detail::East>(sq, occupied)
+                        | detail::slidingAttacks<detail::South>(sq, occupied)
+                        | detail::slidingAttacks<detail::West>(sq, occupied);
+                }
+                else // if constexpr (PieceTypeV == PieceType::Queen)
+                {
+                    return
+                        detail::slidingAttacks<detail::North>(sq, occupied)
+                        | detail::slidingAttacks<detail::NorthEast>(sq, occupied)
+                        | detail::slidingAttacks<detail::East>(sq, occupied)
+                        | detail::slidingAttacks<detail::SouthEast>(sq, occupied)
+                        | detail::slidingAttacks<detail::South>(sq, occupied)
+                        | detail::slidingAttacks<detail::SouthWest>(sq, occupied)
+                        | detail::slidingAttacks<detail::West>(sq, occupied)
+                        | detail::slidingAttacks<detail::NorthWest>(sq, occupied);
+                }
+            }
+
+            static Bitboard generateBetween(Square s1, Square s2)
+            {
+                Bitboard bb = Bitboard::none();
+
+                if (s1 == s2)
+                {
+                    return bb;
+                }
+
+                const int fd = s2.file() - s1.file();
+                const int rd = s2.rank() - s1.rank();
+
+                if (fd == 0 || rd == 0 || fd == rd || fd == -rd)
+                {
+                    // s1 and s2 lie on a line.
+                    const int fileStep = (fd > 0) - (fd < 0);
+                    const int rankStep = (rd > 0) - (rd < 0);
+                    const auto step = FlatSquareOffset(fileStep, rankStep);
+                    s1 += step; // omit s1
+                    while(s1 != s2) // omit s2
+                    {
+                        bb |= s1;
+                        s1 += step;
+                    }
+                }
+
+                return bb;
+            }
+
+            static Bitboard generateLine(Square s1, Square s2)
+            {
+                for (PieceType pt : { PieceType::Bishop, PieceType::Rook })
+                {
+                    const Bitboard s1Attacks = pseudoAttacks[pt][s1];
+                    if (s1Attacks.isSet(s2))
+                    {
+                        const Bitboard s2Attacks = pseudoAttacks[pt][s2];
+                        return (s1Attacks & s2Attacks) | s1 | s2;
+                    }
+                }
+
+                return Bitboard::none();
+            }
+
+            static const EnumArray2<Square, Square, Bitboard> between = []()
+            {
+                EnumArray2<Square, Square, Bitboard> between;
+
+                for (Square s1 : values<Square>())
+                {
+                    for (Square s2 : values<Square>())
+                    {
+                        between[s1][s2] = generateBetween(s1, s2);
+                    }
+                }
+
+                return between;
+            }();
+
+            static const EnumArray2<Square, Square, Bitboard> line = []()
+            {
+                EnumArray2<Square, Square, Bitboard> line;
+
+                for (Square s1 : values<Square>())
+                {
+                    for (Square s2 : values<Square>())
+                    {
+                        line[s1][s2] = generateLine(s1, s2);
+                    }
+                }
+
+                return line;
+            }();
+        }
+
+        namespace fancy_magics
+        {
+            enum struct MagicsType
+            {
+                Rook,
+                Bishop
+            };
+
+            alignas(64) EnumArray<Square, Bitboard> g_rookMasks;
+            alignas(64) EnumArray<Square, std::uint8_t> g_rookShifts;
+            alignas(64) EnumArray<Square, const Bitboard*> g_rookAttacks;
+
+            alignas(64) EnumArray<Square, Bitboard> g_bishopMasks;
+            alignas(64) EnumArray<Square, std::uint8_t> g_bishopShifts;
+            alignas(64) EnumArray<Square, const Bitboard*> g_bishopAttacks;
+
+            alignas(64) static std::array<Bitboard, 102400> g_allRookAttacks;
+            alignas(64) static std::array<Bitboard, 5248> g_allBishopAttacks;
+
+            template <MagicsType TypeV>
+            [[nodiscard]] inline Bitboard slidingAttacks(Square sq, Bitboard occupied)
+            {
+                if (TypeV == MagicsType::Rook)
+                {
+                    return chess::bb::detail::pieceSlidingAttacks<PieceType::Rook>(sq, occupied);
+                }
+
+                if (TypeV == MagicsType::Bishop)
+                {
+                    return chess::bb::detail::pieceSlidingAttacks<PieceType::Bishop>(sq, occupied);
+                }
+
+                return Bitboard::none();
+            }
+
+            template <MagicsType TypeV, std::size_t SizeV>
+            [[nodiscard]] inline bool initMagics(
+                const EnumArray<Square, std::uint64_t>& magics,
+                std::array<Bitboard, SizeV>& table,
+                EnumArray<Square, Bitboard>& masks,
+                EnumArray<Square, std::uint8_t>& shifts,
+                EnumArray<Square, const Bitboard*>& attacks
+            )
+            {
+                std::size_t size = 0;
+                for (Square sq : values<Square>())
+                {
+                    const Bitboard edges =
+                        ((bb::rank1 | bb::rank8) & ~Bitboard::rank(sq.rank()))
+                        | ((bb::fileA | bb::fileH) & ~Bitboard::file(sq.file()));
+
+                    Bitboard* currentAttacks = table.data() + size;
+
+                    attacks[sq] = currentAttacks;
+                    masks[sq] = slidingAttacks<TypeV>(sq, Bitboard::none()) & ~edges;
+                    shifts[sq] = 64 - masks[sq].count();
+
+                    Bitboard occupied = Bitboard::none();
+                    do
+                    {
+                        const std::size_t idx =
+                            (occupied & masks[sq]).bits()
+                            * magics[sq]
+                            >> shifts[sq];
+
+                        currentAttacks[idx] = slidingAttacks<TypeV>(sq, occupied);
+
+                        ++size;
+                        occupied = Bitboard::fromBits(occupied.bits() - masks[sq].bits()) & masks[sq];
+                    } while (occupied.any());
+                }
+
+                return true;
+            }
+
+            static bool g_isRookMagicsInitialized =
+                initMagics<MagicsType::Rook>(g_rookMagics, g_allRookAttacks, g_rookMasks, g_rookShifts, g_rookAttacks);
+
+            static bool g_isBishopMagicsInitialized =
+                initMagics<MagicsType::Bishop>(g_bishopMagics, g_allBishopAttacks, g_bishopMasks, g_bishopShifts, g_bishopAttacks);
+        }
+
+        [[nodiscard]] inline Bitboard between(Square s1, Square s2)
+        {
+            return detail::between[s1][s2];
+        }
+
+        [[nodiscard]] inline Bitboard line(Square s1, Square s2)
+        {
+            return detail::line[s1][s2];
+        }
+
+        template <PieceType PieceTypeV>
+        [[nodiscard]] inline Bitboard pseudoAttacks(Square sq)
+        {
+            static_assert(PieceTypeV != PieceType::None && PieceTypeV != PieceType::Pawn);
+
+            assert(sq.isOk());
+
+            return detail::pseudoAttacks[PieceTypeV][sq];
+        }
+
+        [[nodiscard]] inline Bitboard pseudoAttacks(PieceType pt, Square sq)
+        {
+            assert(sq.isOk());
+
+            return detail::pseudoAttacks[pt][sq];
+        }
+
+        [[nodiscard]] inline Bitboard pawnAttacks(Bitboard pawns, Color color)
+        {
+            if (color == Color::White)
+            {
+                return pawns.shifted<1, 1>() | pawns.shifted<-1, 1>();
+            }
+            else
+            {
+                return pawns.shifted<1, -1>() | pawns.shifted<-1, -1>();
+            }
+        }
+
+        [[nodiscard]] inline Bitboard westPawnAttacks(Bitboard pawns, Color color)
+        {
+            if (color == Color::White)
+            {
+                return pawns.shifted<-1, 1>();
+            }
+            else
+            {
+                return pawns.shifted<-1, -1>();
+            }
+        }
+
+        [[nodiscard]] inline Bitboard eastPawnAttacks(Bitboard pawns, Color color)
+        {
+            if (color == Color::White)
+            {
+                return pawns.shifted<1, 1>();
+            }
+            else
+            {
+                return pawns.shifted<1, -1>();
+            }
+        }
+
+        [[nodiscard]] inline bool isAttackedBySlider(
+            Square sq,
+            Bitboard bishops,
+            Bitboard rooks,
+            Bitboard queens,
+            Bitboard occupied
+        )
+        {
+            const Bitboard opponentBishopLikePieces = (bishops | queens);
+            const Bitboard bishopAttacks = bb::attacks<PieceType::Bishop>(sq, occupied);
+            if ((bishopAttacks & opponentBishopLikePieces).any())
+            {
+                return true;
+            }
+
+            const Bitboard opponentRookLikePieces = (rooks | queens);
+            const Bitboard rookAttacks = bb::attacks<PieceType::Rook>(sq, occupied);
+            return (rookAttacks & opponentRookLikePieces).any();
+        }
+    }
+
+    struct CastlingTraits
+    {
+        static constexpr EnumArray2<Color, CastleType, Square> rookDestination = { { {{ f1, d1 }}, {{ f8, d8 }} } };
+        static constexpr EnumArray2<Color, CastleType, Square> kingDestination = { { {{ g1, c1 }}, {{ g8, c8 }} } };
+
+        static constexpr EnumArray2<Color, CastleType, Square> rookStart = { { {{ h1, a1 }}, {{ h8, a8 }} } };
+
+        static constexpr EnumArray<Color, Square> kingStart = { { e1, e8 } };
+
+        static constexpr EnumArray2<Color, CastleType, Bitboard> castlingPath = {
+            {
+                {{ Bitboard::square(f1) | g1, Bitboard::square(b1) | c1 | d1 }},
+                {{ Bitboard::square(f8) | g8, Bitboard::square(b8) | c8 | d8 }}
+            }
+        };
+
+        static constexpr EnumArray2<Color, CastleType, Square> squarePassedByKing = {
+            {
+                {{ f1, d1 }},
+                {{ f8, d8 }}
+            }
+        };
+
+        static constexpr EnumArray2<Color, CastleType, CastlingRights> castlingRights = {
+            {
+                {{ CastlingRights::WhiteKingSide, CastlingRights::WhiteQueenSide }},
+                {{ CastlingRights::BlackKingSide, CastlingRights::BlackQueenSide }}
+            }
+        };
+
+        // Move has to be a legal castling move.
+        static constexpr CastleType moveCastlingType(const Move& move)
+        {
+            return (move.to.file() == fileH) ? CastleType::Short : CastleType::Long;
+        }
+
+        // Move must be a legal castling move.
+        static constexpr CastlingRights moveCastlingRight(Move move)
+        {
+            if (move.to == h1) return CastlingRights::WhiteKingSide;
+            if (move.to == a1) return CastlingRights::WhiteQueenSide;
+            if (move.to == h8) return CastlingRights::WhiteKingSide;
+            if (move.to == a8) return CastlingRights::WhiteQueenSide;
+            return CastlingRights::None;
+        }
+    };
+
+    namespace parser_bits
+    {
+        [[nodiscard]] constexpr bool isFile(char c)
+        {
+            return c >= 'a' && c <= 'h';
+        }
+
+        [[nodiscard]] constexpr bool isRank(char c)
+        {
+            return c >= '1' && c <= '8';
+        }
+
+        [[nodiscard]] constexpr Rank parseRank(char c)
+        {
+            assert(isRank(c));
+
+            return fromOrdinal<Rank>(c - '1');
+        }
+
+        [[nodiscard]] constexpr File parseFile(char c)
+        {
+            assert(isFile(c));
+
+            return fromOrdinal<File>(c - 'a');
+        }
+
+        [[nodiscard]] constexpr bool isSquare(const char* s)
+        {
+            return isFile(s[0]) && isRank(s[1]);
+        }
+
+        [[nodiscard]] constexpr Square parseSquare(const char* s)
+        {
+            const File file = parseFile(s[0]);
+            const Rank rank = parseRank(s[1]);
+            return Square(file, rank);
+        }
+
+        [[nodiscard]] constexpr std::optional<Square> tryParseSquare(std::string_view s)
+        {
+            if (s.size() != 2) return {};
+            if (!isSquare(s.data())) return {};
+            return parseSquare(s.data());
+        }
+
+        [[nodiscard]] constexpr std::optional<Square> tryParseEpSquare(std::string_view s)
+        {
+            if (s == std::string_view("-")) return Square::none();
+            return tryParseSquare(s);
+        }
+
+        [[nodiscard]] constexpr std::optional<CastlingRights> tryParseCastlingRights(std::string_view s)
+        {
+            if (s == std::string_view("-")) return CastlingRights::None;
+
+            CastlingRights rights = CastlingRights::None;
+
+            for (auto& c : s)
+            {
+                CastlingRights toAdd = CastlingRights::None;
+                switch (c)
+                {
+                case 'K':
+                    toAdd = CastlingRights::WhiteKingSide;
+                    break;
+                case 'Q':
+                    toAdd = CastlingRights::WhiteQueenSide;
+                    break;
+                case 'k':
+                    toAdd = CastlingRights::BlackKingSide;
+                    break;
+                case 'q':
+                    toAdd = CastlingRights::BlackQueenSide;
+                    break;
+                }
+
+                // If there are duplicated castling rights specification we bail.
+                // If there is an invalid character we bail.
+                // (It always contains None)
+                if (contains(rights, toAdd)) return {};
+                else rights |= toAdd;
+            }
+
+            return rights;
+        }
+
+        [[nodiscard]] constexpr CastlingRights readCastlingRights(const char*& s)
+        {
+            CastlingRights rights = CastlingRights::None;
+
+            while (*s != ' ')
+            {
+                switch (*s)
+                {
+                case 'K':
+                    rights |= CastlingRights::WhiteKingSide;
+                    break;
+                case 'Q':
+                    rights |= CastlingRights::WhiteQueenSide;
+                    break;
+                case 'k':
+                    rights |= CastlingRights::BlackKingSide;
+                    break;
+                case 'q':
+                    rights |= CastlingRights::BlackQueenSide;
+                    break;
+                }
+
+                ++s;
+            }
+
+            return rights;
+        }
+
+        FORCEINLINE inline void appendCastlingRightsToString(CastlingRights rights, std::string& str)
+        {
+            if (rights == CastlingRights::None)
+            {
+                str += '-';
+            }
+            else
+            {
+                if (contains(rights, CastlingRights::WhiteKingSide)) str += 'K';
+                if (contains(rights, CastlingRights::WhiteQueenSide)) str += 'Q';
+                if (contains(rights, CastlingRights::BlackKingSide)) str += 'k';
+                if (contains(rights, CastlingRights::BlackQueenSide)) str += 'q';
+            }
+        }
+
+        FORCEINLINE inline void appendSquareToString(Square sq, std::string& str)
+        {
+            str += static_cast<char>('a' + ordinal(sq.file()));
+            str += static_cast<char>('1' + ordinal(sq.rank()));
+        }
+
+        FORCEINLINE inline void appendEpSquareToString(Square sq, std::string& str)
+        {
+            if (sq == Square::none())
+            {
+                str += '-';
+            }
+            else
+            {
+                appendSquareToString(sq, str);
+            }
+        }
+
+        FORCEINLINE inline void appendRankToString(Rank r, std::string& str)
+        {
+            str += static_cast<char>('1' + ordinal(r));
+        }
+
+        FORCEINLINE inline void appendFileToString(File f, std::string& str)
+        {
+            str += static_cast<char>('a' + ordinal(f));
+        }
+
+        [[nodiscard]] FORCEINLINE inline bool isDigit(char c)
+        {
+            return c >= '0' && c <= '9';
+        }
+
+        [[nodiscard]] inline std::uint16_t parseUInt16(std::string_view sv)
+        {
+            assert(sv.size() > 0);
+            assert(sv.size() <= 5);
+
+            std::uint16_t v = 0;
+
+            std::size_t idx = 0;
+            switch (sv.size())
+            {
+            case 5:
+                v += (sv[idx++] - '0') * 10000;
+            case 4:
+                v += (sv[idx++] - '0') * 1000;
+            case 3:
+                v += (sv[idx++] - '0') * 100;
+            case 2:
+                v += (sv[idx++] - '0') * 10;
+            case 1:
+                v += sv[idx] - '0';
+                break;
+
+            default:
+                assert(false);
+            }
+
+            return v;
+        }
+
+        [[nodiscard]] inline std::optional<std::uint16_t> tryParseUInt16(std::string_view sv)
+        {
+            if (sv.size() == 0 || sv.size() > 5) return std::nullopt;
+
+            std::uint32_t v = 0;
+
+            std::size_t idx = 0;
+            switch (sv.size())
+            {
+            case 5:
+                v += (sv[idx++] - '0') * 10000;
+            case 4:
+                v += (sv[idx++] - '0') * 1000;
+            case 3:
+                v += (sv[idx++] - '0') * 100;
+            case 2:
+                v += (sv[idx++] - '0') * 10;
+            case 1:
+                v += sv[idx] - '0';
+                break;
+
+            default:
+                assert(false);
+            }
+
+            if (v > std::numeric_limits<std::uint16_t>::max())
+            {
+                return std::nullopt;
+            }
+
+            return static_cast<std::uint16_t>(v);
+        }
+    }
+
+
+    struct Board
+    {
+        constexpr Board() noexcept :
+            m_pieces{},
+            m_pieceBB{},
+            m_piecesByColorBB{},
+            m_pieceCount{}
+        {
+            m_pieces.fill(Piece::none());
+            m_pieceBB.fill(Bitboard::none());
+            m_pieceBB[Piece::none()] = Bitboard::all();
+            m_piecesByColorBB.fill(Bitboard::none());
+            m_pieceCount.fill(0);
+            m_pieceCount[Piece::none()] = 64;
+        }
+
+        [[nodiscard]] inline bool isValid() const
+        {
+            if (piecesBB(whiteKing).count() != 1) return false;
+            if (piecesBB(blackKing).count() != 1) return false;
+            if (((piecesBB(whitePawn) | piecesBB(blackPawn)) & (bb::rank(rank1) | bb::rank(rank8))).any()) return false;
+            return true;
+        }
+
+        [[nodiscard]] std::string fen() const;
+
+        [[nodiscard]] inline bool trySet(std::string_view boardState)
+        {
+            File f = fileA;
+            Rank r = rank8;
+            bool lastWasSkip = false;
+            for (auto c : boardState)
+            {
+                Piece piece = Piece::none();
+                switch (c)
+                {
+                case 'r':
+                    piece = Piece(PieceType::Rook, Color::Black);
+                    break;
+                case 'n':
+                    piece = Piece(PieceType::Knight, Color::Black);
+                    break;
+                case 'b':
+                    piece = Piece(PieceType::Bishop, Color::Black);
+                    break;
+                case 'q':
+                    piece = Piece(PieceType::Queen, Color::Black);
+                    break;
+                case 'k':
+                    piece = Piece(PieceType::King, Color::Black);
+                    break;
+                case 'p':
+                    piece = Piece(PieceType::Pawn, Color::Black);
+                    break;
+
+                case 'R':
+                    piece = Piece(PieceType::Rook, Color::White);
+                    break;
+                case 'N':
+                    piece = Piece(PieceType::Knight, Color::White);
+                    break;
+                case 'B':
+                    piece = Piece(PieceType::Bishop, Color::White);
+                    break;
+                case 'Q':
+                    piece = Piece(PieceType::Queen, Color::White);
+                    break;
+                case 'K':
+                    piece = Piece(PieceType::King, Color::White);
+                    break;
+                case 'P':
+                    piece = Piece(PieceType::Pawn, Color::White);
+                    break;
+
+                case '1':
+                case '2':
+                case '3':
+                case '4':
+                case '5':
+                case '6':
+                case '7':
+                case '8':
+                {
+                    if (lastWasSkip) return false;
+                    lastWasSkip = true;
+
+                    const int skip = c - '0';
+                    f += skip;
+                    if (f > fileH + 1) return false;
+                    break;
+                }
+
+                case '/':
+                    lastWasSkip = false;
+                    if (f != fileH + 1) return false;
+                    f = fileA;
+                    --r;
+                    break;
+
+                default:
+                    return false;
+                }
+
+                if (piece != Piece::none())
+                {
+                    lastWasSkip = false;
+
+                    const Square sq(f, r);
+                    if (!sq.isOk()) return false;
+
+                    place(piece, sq);
+                    ++f;
+                }
+            }
+
+            if (f != fileH + 1) return false;
+            if (r != rank1) return false;
+
+            return isValid();
+        }
+
+        // returns side to move
+        [[nodiscard]] constexpr const char* set(const char* fen)
+        {
+            assert(fen != nullptr);
+
+            File f = fileA;
+            Rank r = rank8;
+            auto current = fen;
+            bool done = false;
+            while (*current != '\0')
+            {
+                Piece piece = Piece::none();
+                switch (*current)
+                {
+                case 'r':
+                    piece = Piece(PieceType::Rook, Color::Black);
+                    break;
+                case 'n':
+                    piece = Piece(PieceType::Knight, Color::Black);
+                    break;
+                case 'b':
+                    piece = Piece(PieceType::Bishop, Color::Black);
+                    break;
+                case 'q':
+                    piece = Piece(PieceType::Queen, Color::Black);
+                    break;
+                case 'k':
+                    piece = Piece(PieceType::King, Color::Black);
+                    break;
+                case 'p':
+                    piece = Piece(PieceType::Pawn, Color::Black);
+                    break;
+
+                case 'R':
+                    piece = Piece(PieceType::Rook, Color::White);
+                    break;
+                case 'N':
+                    piece = Piece(PieceType::Knight, Color::White);
+                    break;
+                case 'B':
+                    piece = Piece(PieceType::Bishop, Color::White);
+                    break;
+                case 'Q':
+                    piece = Piece(PieceType::Queen, Color::White);
+                    break;
+                case 'K':
+                    piece = Piece(PieceType::King, Color::White);
+                    break;
+                case 'P':
+                    piece = Piece(PieceType::Pawn, Color::White);
+                    break;
+
+                case ' ':
+                    done = true;
+                    break;
+
+                case '1':
+                case '2':
+                case '3':
+                case '4':
+                case '5':
+                case '6':
+                case '7':
+                case '8':
+                {
+                    const int skip = (*current) - '0';
+                    f += skip;
+                    break;
+                }
+
+                case '/':
+                    f = fileA;
+                    --r;
+                    break;
+
+                default:
+                    break;
+                }
+
+                if (done)
+                {
+                    break;
+                }
+
+                if (piece != Piece::none())
+                {
+                    place(piece, Square(f, r));
+                    ++f;
+                }
+
+                ++current;
+            }
+
+            return current;
+        }
+
+        static constexpr Board fromFen(const char* fen)
+        {
+            Board board;
+            (void)board.set(fen);
+            return board;
+        }
+
+        [[nodiscard]] constexpr friend bool operator==(const Board& lhs, const Board& rhs) noexcept
+        {
+            bool equal = true;
+            for (Square sq = a1; sq <= h8; ++sq)
+            {
+                if (lhs.m_pieces[sq] != rhs.m_pieces[sq])
+                {
+                    equal = false;
+                    break;
+                }
+            }
+
+            assert(bbsEqual(lhs, rhs) == equal);
+
+            return equal;
+        }
+
+        constexpr void place(Piece piece, Square sq)
+        {
+            assert(sq.isOk());
+
+            auto oldPiece = m_pieces[sq];
+            m_pieceBB[oldPiece] ^= sq;
+            if (oldPiece != Piece::none())
+            {
+                m_piecesByColorBB[oldPiece.color()] ^= sq;
+            }
+            m_pieces[sq] = piece;
+            m_pieceBB[piece] |= sq;
+            m_piecesByColorBB[piece.color()] |= sq;
+            --m_pieceCount[oldPiece];
+            ++m_pieceCount[piece];
+        }
+
+        // returns captured piece
+        // doesn't check validity
+        FORCEINLINE constexpr Piece doMove(Move move)
+        {
+            if (move.type == MoveType::Normal)
+            {
+                const Piece capturedPiece = m_pieces[move.to];
+                const Piece piece = m_pieces[move.from];
+
+                const Bitboard frombb = Bitboard::square(move.from);
+                const Bitboard tobb = Bitboard::square(move.to);
+                const Bitboard xormove = frombb ^ tobb;
+
+                m_pieces[move.to] = piece;
+                m_pieces[move.from] = Piece::none();
+
+                m_pieceBB[piece] ^= xormove;
+
+                m_piecesByColorBB[piece.color()] ^= xormove;
+
+                if (capturedPiece == Piece::none())
+                {
+                    m_pieceBB[Piece::none()] ^= xormove;
+                }
+                else
+                {
+                    m_pieceBB[capturedPiece] ^= tobb;
+                    m_pieceBB[Piece::none()] ^= frombb;
+
+                    m_piecesByColorBB[capturedPiece.color()] ^= tobb;
+
+                    --m_pieceCount[capturedPiece];
+                    ++m_pieceCount[Piece::none()];
+                }
+
+                return capturedPiece;
+            }
+
+            return doMoveColdPath(move);
+        }
+
+        NOINLINE constexpr Piece doMoveColdPath(Move move)
+        {
+            if (move.type == MoveType::Promotion)
+            {
+                // We split it even though it's similar just because
+                // the normal case is much more common.
+                const Piece capturedPiece = m_pieces[move.to];
+                const Piece fromPiece = m_pieces[move.from];
+                const Piece toPiece = move.promotedPiece;
+
+                m_pieces[move.to] = toPiece;
+                m_pieces[move.from] = Piece::none();
+
+                m_pieceBB[fromPiece] ^= move.from;
+                m_pieceBB[toPiece] ^= move.to;
+
+                m_pieceBB[capturedPiece] ^= move.to;
+                m_pieceBB[Piece::none()] ^= move.from;
+
+                m_piecesByColorBB[fromPiece.color()] ^= move.to;
+                m_piecesByColorBB[fromPiece.color()] ^= move.from;
+                if (capturedPiece != Piece::none())
+                {
+                    m_piecesByColorBB[capturedPiece.color()] ^= move.to;
+                    --m_pieceCount[capturedPiece];
+                    ++m_pieceCount[Piece::none()];
+                }
+
+                --m_pieceCount[fromPiece];
+                ++m_pieceCount[toPiece];
+
+                return capturedPiece;
+            }
+            else if (move.type == MoveType::EnPassant)
+            {
+                const Piece movedPiece = m_pieces[move.from];
+                const Piece capturedPiece(PieceType::Pawn, !movedPiece.color());
+                const Square capturedPieceSq(move.to.file(), move.from.rank());
+
+                // on ep move there are 3 squares involved
+                m_pieces[move.to] = movedPiece;
+                m_pieces[move.from] = Piece::none();
+                m_pieces[capturedPieceSq] = Piece::none();
+
+                m_pieceBB[movedPiece] ^= move.from;
+                m_pieceBB[movedPiece] ^= move.to;
+
+                m_pieceBB[Piece::none()] ^= move.from;
+                m_pieceBB[Piece::none()] ^= move.to;
+
+                m_pieceBB[capturedPiece] ^= capturedPieceSq;
+                m_pieceBB[Piece::none()] ^= capturedPieceSq;
+
+                m_piecesByColorBB[movedPiece.color()] ^= move.to;
+                m_piecesByColorBB[movedPiece.color()] ^= move.from;
+                m_piecesByColorBB[capturedPiece.color()] ^= capturedPieceSq;
+
+                --m_pieceCount[capturedPiece];
+                ++m_pieceCount[Piece::none()];
+
+                return capturedPiece;
+            }
+            else // if (move.type == MoveType::Castle)
+            {
+                const Square rookFromSq = move.to;
+                const Square kingFromSq = move.from;
+
+                const Piece rook = m_pieces[rookFromSq];
+                const Piece king = m_pieces[kingFromSq];
+                const Color color = king.color();
+
+                const CastleType castleType = CastlingTraits::moveCastlingType(move);
+                const Square rookToSq = CastlingTraits::rookDestination[color][castleType];
+                const Square kingToSq = CastlingTraits::kingDestination[color][castleType];
+
+                // 4 squares are involved
+                m_pieces[rookFromSq] = Piece::none();
+                m_pieces[kingFromSq] = Piece::none();
+                m_pieces[rookToSq] = rook;
+                m_pieces[kingToSq] = king;
+
+                m_pieceBB[rook] ^= rookFromSq;
+                m_pieceBB[rook] ^= rookToSq;
+
+                m_pieceBB[king] ^= kingFromSq;
+                m_pieceBB[king] ^= kingToSq;
+
+                m_pieceBB[Piece::none()] ^= rookFromSq;
+                m_pieceBB[Piece::none()] ^= rookToSq;
+
+                m_pieceBB[Piece::none()] ^= kingFromSq;
+                m_pieceBB[Piece::none()] ^= kingToSq;
+
+                m_piecesByColorBB[color] ^= rookFromSq;
+                m_piecesByColorBB[color] ^= rookToSq;
+                m_piecesByColorBB[color] ^= kingFromSq;
+                m_piecesByColorBB[color] ^= kingToSq;
+
+                return Piece::none();
+            }
+        }
+
+        constexpr void undoMove(Move move, Piece capturedPiece)
+        {
+            if (move.type == MoveType::Normal || move.type == MoveType::Promotion)
+            {
+                const Piece toPiece = m_pieces[move.to];
+                const Piece fromPiece = move.promotedPiece == Piece::none() ? toPiece : Piece(PieceType::Pawn, toPiece.color());
+
+                m_pieces[move.from] = fromPiece;
+                m_pieces[move.to] = capturedPiece;
+
+                m_pieceBB[fromPiece] ^= move.from;
+                m_pieceBB[toPiece] ^= move.to;
+
+                m_pieceBB[capturedPiece] ^= move.to;
+                m_pieceBB[Piece::none()] ^= move.from;
+
+                m_piecesByColorBB[fromPiece.color()] ^= move.to;
+                m_piecesByColorBB[fromPiece.color()] ^= move.from;
+                if (capturedPiece != Piece::none())
+                {
+                    m_piecesByColorBB[capturedPiece.color()] ^= move.to;
+                    ++m_pieceCount[capturedPiece];
+                    --m_pieceCount[Piece::none()];
+                }
+
+                if (move.type == MoveType::Promotion)
+                {
+                    --m_pieceCount[toPiece];
+                    ++m_pieceCount[fromPiece];
+                }
+            }
+            else if (move.type == MoveType::EnPassant)
+            {
+                const Piece movedPiece = m_pieces[move.to];
+                const Piece capturedPiece(PieceType::Pawn, !movedPiece.color());
+                const Square capturedPieceSq(move.to.file(), move.from.rank());
+
+                m_pieces[move.to] = Piece::none();
+                m_pieces[move.from] = movedPiece;
+                m_pieces[capturedPieceSq] = capturedPiece;
+
+                m_pieceBB[movedPiece] ^= move.from;
+                m_pieceBB[movedPiece] ^= move.to;
+
+                m_pieceBB[Piece::none()] ^= move.from;
+                m_pieceBB[Piece::none()] ^= move.to;
+
+                // on ep move there are 3 squares involved
+                m_pieceBB[capturedPiece] ^= capturedPieceSq;
+                m_pieceBB[Piece::none()] ^= capturedPieceSq;
+
+                m_piecesByColorBB[movedPiece.color()] ^= move.to;
+                m_piecesByColorBB[movedPiece.color()] ^= move.from;
+                m_piecesByColorBB[capturedPiece.color()] ^= capturedPieceSq;
+
+                ++m_pieceCount[capturedPiece];
+                --m_pieceCount[Piece::none()];
+            }
+            else // if (move.type == MoveType::Castle)
+            {
+                const Square rookFromSq = move.to;
+                const Square kingFromSq = move.from;
+
+                const Color color = move.to.rank() == rank1 ? Color::White : Color::Black;
+
+                const CastleType castleType = CastlingTraits::moveCastlingType(move);
+                const Square rookToSq = CastlingTraits::rookDestination[color][castleType];
+                const Square kingToSq = CastlingTraits::kingDestination[color][castleType];
+
+                const Piece rook = m_pieces[rookToSq];
+                const Piece king = m_pieces[kingToSq];
+
+                // 4 squares are involved
+                m_pieces[rookFromSq] = rook;
+                m_pieces[kingFromSq] = king;
+                m_pieces[rookToSq] = Piece::none();
+                m_pieces[kingToSq] = Piece::none();
+
+                m_pieceBB[rook] ^= rookFromSq;
+                m_pieceBB[rook] ^= rookToSq;
+
+                m_pieceBB[king] ^= kingFromSq;
+                m_pieceBB[king] ^= kingToSq;
+
+                m_pieceBB[Piece::none()] ^= rookFromSq;
+                m_pieceBB[Piece::none()] ^= rookToSq;
+
+                m_pieceBB[Piece::none()] ^= kingFromSq;
+                m_pieceBB[Piece::none()] ^= kingToSq;
+
+                m_piecesByColorBB[color] ^= rookFromSq;
+                m_piecesByColorBB[color] ^= rookToSq;
+                m_piecesByColorBB[color] ^= kingFromSq;
+                m_piecesByColorBB[color] ^= kingToSq;
+            }
+        }
+
+        // Returns whether a given square is attacked by any piece
+        // of `attackerColor` side.
+        [[nodiscard]] bool isSquareAttacked(Square sq, Color attackerColor) const;
+
+        // Returns whether a given square is attacked by any piece
+        // of `attackerColor` side after `move` is made.
+        // Move must be pseudo legal.
+        [[nodiscard]] bool isSquareAttackedAfterMove(Move move, Square sq, Color attackerColor) const;
+
+        // Move must be pseudo legal.
+        // Must not be a king move.
+        [[nodiscard]] bool createsDiscoveredAttackOnOwnKing(Move move) const;
+
+        // Returns whether a piece on a given square is attacked
+        // by any enemy piece. False if square is empty.
+        [[nodiscard]] bool isPieceAttacked(Square sq) const;
+
+        // Returns whether a piece on a given square is attacked
+        // by any enemy piece after `move` is made. False if square is empty.
+        // Move must be pseudo legal.
+        [[nodiscard]] bool isPieceAttackedAfterMove(Move move, Square sq) const;
+
+        // Returns whether the king of the moving side is attacked
+        // by any enemy piece after a move is made.
+        // Move must be pseudo legal.
+        [[nodiscard]] bool isOwnKingAttackedAfterMove(Move move) const;
+
+        // Return a bitboard with all (pseudo legal) attacks by the piece on
+        // the given square. Empty if no piece on the square.
+        [[nodiscard]] Bitboard attacks(Square sq) const;
+
+        // Returns a bitboard with all squared that have pieces
+        // that attack a given square (pseudo legally)
+        [[nodiscard]] Bitboard attackers(Square sq, Color attackerColor) const;
+
+        [[nodiscard]] constexpr Piece pieceAt(Square sq) const
+        {
+            assert(sq.isOk());
+
+            return m_pieces[sq];
+        }
+
+        [[nodiscard]] constexpr Bitboard piecesBB(Color c) const
+        {
+            return m_piecesByColorBB[c];
+        }
+
+        [[nodiscard]] inline Square kingSquare(Color c) const
+        {
+            return piecesBB(Piece(PieceType::King, c)).first();
+        }
+
+        [[nodiscard]] constexpr Bitboard piecesBB(Piece pc) const
+        {
+            return m_pieceBB[pc];
+        }
+
+        [[nodiscard]] constexpr Bitboard piecesBB() const
+        {
+            Bitboard bb{};
+
+            // don't collect from null piece
+            return piecesBB(Color::White) | piecesBB(Color::Black);
+
+            return bb;
+        }
+
+        [[nodiscard]] constexpr std::uint8_t pieceCount(Piece pt) const
+        {
+            return m_pieceCount[pt];
+        }
+
+        [[nodiscard]] constexpr bool isPromotion(Square from, Square to) const
+        {
+            assert(from.isOk() && to.isOk());
+
+            return m_pieces[from].type() == PieceType::Pawn && (to.rank() == rank1 || to.rank() == rank8);
+        }
+
+        const Piece* piecesRaw() const;
+
+    private:
+        EnumArray<Square, Piece> m_pieces;
+        EnumArray<Piece, Bitboard> m_pieceBB;
+        EnumArray<Color, Bitboard> m_piecesByColorBB;
+        EnumArray<Piece, uint8_t> m_pieceCount;
+
+        // NOTE: currently we don't track it because it's not
+        // required to perform ep if we don't need to check validity
+        // Square m_epSquare = Square::none();
+
+        [[nodiscard]] static constexpr bool bbsEqual(const Board& lhs, const Board& rhs) noexcept
+        {
+            for (Piece pc : values<Piece>())
+            {
+                if (lhs.m_pieceBB[pc] != rhs.m_pieceBB[pc])
+                {
+                    return false;
+                }
+            }
+
+            return true;
+        }
+    };
+
+    struct Position;
+
+    struct MoveLegalityChecker
+    {
+        MoveLegalityChecker(const Position& position);
+
+        [[nodiscard]] bool isPseudoLegalMoveLegal(const Move& move) const;
+
+    private:
+        const Position* m_position;
+        Bitboard m_checkers;
+        Bitboard m_ourBlockersForKing;
+        Bitboard m_potentialCheckRemovals;
+        Square m_ksq;
+    };
+
+    struct CompressedPosition;
+
+    struct PositionHash128
+    {
+        std::uint64_t high;
+        std::uint64_t low;
+    };
+
+    struct Position : public Board
+    {
+        using BaseType = Board;
+
+        constexpr Position() noexcept :
+            Board(),
+            m_sideToMove(Color::White),
+            m_epSquare(Square::none()),
+            m_castlingRights(CastlingRights::All),
+            m_rule50Counter(0),
+            m_ply(0)
+        {
+        }
+
+        constexpr Position(const Board& board, Color sideToMove, Square epSquare, CastlingRights castlingRights) :
+            Board(board),
+            m_sideToMove(sideToMove),
+            m_epSquare(epSquare),
+            m_castlingRights(castlingRights),
+            m_rule50Counter(0),
+            m_ply(0)
+        {
+        }
+
+        void set(std::string_view fen);
+
+        // Returns false if the fen was not valid
+        // If the returned value was false the position
+        // is in unspecified state.
+        [[nodiscard]] bool trySet(std::string_view fen);
+
+        [[nodiscard]] static Position fromFen(std::string_view fen);
+
+        [[nodiscard]] static std::optional<Position> tryFromFen(std::string_view fen);
+
+        [[nodiscard]] static Position startPosition();
+
+        [[nodiscard]] std::string fen() const;
+
+        constexpr void setEpSquareUnchecked(Square sq)
+        {
+            m_epSquare = sq;
+        }
+
+        void setEpSquare(Square sq)
+        {
+            m_epSquare = sq;
+            nullifyEpSquareIfNotPossible();
+        }
+
+        constexpr void setSideToMove(Color color)
+        {
+            m_sideToMove = color;
+        }
+
+        constexpr void addCastlingRights(CastlingRights rights)
+        {
+            m_castlingRights |= rights;
+        }
+
+        constexpr void setCastlingRights(CastlingRights rights)
+        {
+            m_castlingRights = rights;
+        }
+
+        constexpr void setRule50Counter(std::uint8_t v)
+        {
+            m_rule50Counter = v;
+        }
+
+        constexpr void setPly(std::uint16_t ply)
+        {
+            m_ply = ply;
+        }
+
+        ReverseMove doMove(const Move& move);
+
+        constexpr void undoMove(const ReverseMove& reverseMove)
+        {
+            const Move& move = reverseMove.move;
+            BaseType::undoMove(move, reverseMove.capturedPiece);
+
+            m_epSquare = reverseMove.oldEpSquare;
+            m_castlingRights = reverseMove.oldCastlingRights;
+
+            m_sideToMove = !m_sideToMove;
+
+            --m_ply;
+            if (m_rule50Counter > 0)
+            {
+                m_rule50Counter -= 1;
+            }
+        }
+
+        [[nodiscard]] constexpr Color sideToMove() const
+        {
+            return m_sideToMove;
+        }
+
+        [[nodiscard]] std::uint8_t rule50Counter() const
+        {
+            return m_rule50Counter;
+        }
+
+        [[nodiscard]] std::uint16_t ply() const
+        {
+            return m_ply;
+        }
+
+        [[nodiscard]] std::uint16_t halfMove() const
+        {
+            return (m_ply + 1) / 2;
+        }
+
+        void setHalfMove(std::uint16_t hm)
+        {
+            m_ply = 2 * hm - 1 + (m_sideToMove == Color::Black);
+        }
+
+        [[nodiscard]] bool isCheck() const;
+
+        [[nodiscard]] Bitboard checkers() const;
+
+        [[nodiscard]] bool isCheckAfterMove(Move move) const;
+
+        // Checks whether ANY `move` is legal.
+        [[nodiscard]] bool isMoveLegal(Move move) const;
+
+        [[nodiscard]] bool isPseudoLegalMoveLegal(Move move) const;
+
+        [[nodiscard]] bool isMovePseudoLegal(Move move) const;
+
+        // Returns all pieces that block a slider
+        // from attacking our king. When two or more
+        // pieces block a single slider then none
+        // of these pieces are included.
+        [[nodiscard]] Bitboard blockersForKing(Color color) const;
+
+        [[nodiscard]] MoveLegalityChecker moveLegalityChecker() const
+        {
+            return { *this };
+        }
+
+        [[nodiscard]] constexpr Square epSquare() const
+        {
+            return m_epSquare;
+        }
+
+        [[nodiscard]] constexpr CastlingRights castlingRights() const
+        {
+            return m_castlingRights;
+        }
+
+        [[nodiscard]] constexpr bool friend operator==(const Position& lhs, const Position& rhs) noexcept
+        {
+            return
+                lhs.m_sideToMove == rhs.m_sideToMove
+                && lhs.m_epSquare == rhs.m_epSquare
+                && lhs.m_castlingRights == rhs.m_castlingRights
+                && static_cast<const Board&>(lhs) == static_cast<const Board&>(rhs);
+        }
+
+        [[nodiscard]] constexpr bool friend operator!=(const Position& lhs, const Position& rhs) noexcept
+        {
+            return !(lhs == rhs);
+        }
+
+        // these are supposed to be used only for testing
+        // that's why there's this assert in afterMove
+
+        [[nodiscard]] constexpr Position beforeMove(const ReverseMove& reverseMove) const
+        {
+            Position cpy(*this);
+            cpy.undoMove(reverseMove);
+            return cpy;
+        }
+
+        [[nodiscard]] Position afterMove(Move move) const;
+
+        [[nodiscard]] constexpr bool isEpPossible() const
+        {
+            return m_epSquare != Square::none();
+        }
+
+        [[nodiscard]] inline CompressedPosition compress() const;
+
+    protected:
+        Color m_sideToMove;
+        Square m_epSquare;
+        CastlingRights m_castlingRights;
+        std::uint8_t m_rule50Counter;
+        std::uint16_t m_ply;
+
+        static_assert(sizeof(Color) + sizeof(Square) + sizeof(CastlingRights) + sizeof(std::uint8_t) == 4);
+
+        [[nodiscard]] FORCEINLINE bool isEpPossible(Square epSquare, Color sideToMove) const;
+
+        [[nodiscard]] NOINLINE bool isEpPossibleColdPath(Square epSquare, Bitboard pawnsAttackingEpSquare, Color sideToMove) const;
+
+        void nullifyEpSquareIfNotPossible();
+    };
+
+    struct CompressedPosition
+    {
+        friend struct Position;
+
+        // Occupied bitboard has bits set for
+        // each square with a piece on it.
+        // Each packedState byte holds 2 values (nibbles).
+        // First one at low bits, second one at high bits.
+        // Values correspond to consecutive squares
+        // in bitboard iteration order.
+        // Nibble values:
+        // these are the same as for Piece
+        // knights, bishops, queens can just be copied
+        //  0 : white pawn
+        //  1 : black pawn
+        //  2 : white knight
+        //  3 : black knight
+        //  4 : white bishop
+        //  5 : black bishop
+        //  6 : white rook
+        //  7 : black rook
+        //  8 : white queen
+        //  9 : black queen
+        // 10 : white king
+        // 11 : black king
+        //
+        // these are special
+        // 12 : pawn with ep square behind (white or black, depending on rank)
+        // 13 : white rook with coresponding castling rights
+        // 14 : black rook with coresponding castling rights
+        // 15 : black king and black is side to move
+        //
+        // Let N be the number of bits set in occupied bitboard.
+        // Only N nibbles are present. (N+1)/2 bytes are initialized.
+
+        static CompressedPosition readFromBigEndian(const unsigned char* data)
+        {
+            CompressedPosition pos{};
+            pos.m_occupied = Bitboard::fromBits(
+                (std::uint64_t)data[0] << 56
+                | (std::uint64_t)data[1] << 48
+                | (std::uint64_t)data[2] << 40
+                | (std::uint64_t)data[3] << 32
+                | (std::uint64_t)data[4] << 24
+                | (std::uint64_t)data[5] << 16
+                | (std::uint64_t)data[6] << 8
+                | (std::uint64_t)data[7]
+                );
+            std::memcpy(pos.m_packedState, data + 8, 16);
+            return pos;
+        }
+
+        constexpr CompressedPosition() :
+            m_occupied{},
+            m_packedState{}
+        {
+        }
+
+        [[nodiscard]] friend bool operator<(const CompressedPosition& lhs, const CompressedPosition& rhs)
+        {
+            if (lhs.m_occupied.bits() < rhs.m_occupied.bits()) return true;
+            if (lhs.m_occupied.bits() > rhs.m_occupied.bits()) return false;
+
+            return std::strcmp(reinterpret_cast<const char*>(lhs.m_packedState), reinterpret_cast<const char*>(rhs.m_packedState)) < 0;
+        }
+
+        [[nodiscard]] friend bool operator==(const CompressedPosition& lhs, const CompressedPosition& rhs)
+        {
+            return lhs.m_occupied == rhs.m_occupied
+                && std::strcmp(reinterpret_cast<const char*>(lhs.m_packedState), reinterpret_cast<const char*>(rhs.m_packedState)) == 0;
+        }
+
+        [[nodiscard]] inline Position decompress() const;
+
+        [[nodiscard]] constexpr Bitboard pieceBB() const
+        {
+            return m_occupied;
+        }
+
+        void writeToBigEndian(unsigned char* data)
+        {
+            const auto occupied = m_occupied.bits();
+            *data++ = occupied >> 56;
+            *data++ = (occupied >> 48) & 0xFF;
+            *data++ = (occupied >> 40) & 0xFF;
+            *data++ = (occupied >> 32) & 0xFF;
+            *data++ = (occupied >> 24) & 0xFF;
+            *data++ = (occupied >> 16) & 0xFF;
+            *data++ = (occupied >> 8) & 0xFF;
+            *data++ = occupied & 0xFF;
+            std::memcpy(data, m_packedState, 16);
+        }
+
+    private:
+        Bitboard m_occupied;
+        std::uint8_t m_packedState[16];
+    };
+
+    static_assert(sizeof(CompressedPosition) == 24);
+    static_assert(std::is_trivially_copyable_v<CompressedPosition>);
+
+    namespace detail
+    {
+        [[nodiscard]] FORCEINLINE constexpr std::uint8_t compressOrdinaryPiece(const Position&, Square, Piece piece)
+        {
+            return static_cast<std::uint8_t>(ordinal(piece));
+        }
+
+        [[nodiscard]] FORCEINLINE constexpr std::uint8_t compressPawn(const Position& position, Square sq, Piece piece)
+        {
+            const Square epSquare = position.epSquare();
+            if (epSquare == Square::none())
+            {
+                return static_cast<std::uint8_t>(ordinal(piece));
+            }
+            else
+            {
+                const Color sideToMove = position.sideToMove();
+                const Rank rank = sq.rank();
+                const File file = sq.file();
+                // use bitwise operators, there is a lot of unpredictable branches but in
+                // total the result is quite predictable
+                if (
+                    (file == epSquare.file())
+                    && (
+                    ((rank == rank4) & (sideToMove == Color::Black))
+                        | ((rank == rank5) & (sideToMove == Color::White))
+                        )
+                    )
+                {
+                    return 12;
+                }
+                else
+                {
+                    return static_cast<std::uint8_t>(ordinal(piece));
+                }
+            }
+        }
+
+        [[nodiscard]] FORCEINLINE constexpr std::uint8_t compressRook(const Position& position, Square sq, Piece piece)
+        {
+            const CastlingRights castlingRights = position.castlingRights();
+            const Color color = piece.color();
+
+            if (color == Color::White
+                && (
+                (sq == a1 && contains(castlingRights, CastlingRights::WhiteQueenSide))
+                    || (sq == h1 && contains(castlingRights, CastlingRights::WhiteKingSide))
+                    )
+                )
+            {
+                return 13;
+            }
+            else if (
+                color == Color::Black
+                && (
+                (sq == a8 && contains(castlingRights, CastlingRights::BlackQueenSide))
+                    || (sq == h8 && contains(castlingRights, CastlingRights::BlackKingSide))
+                    )
+                )
+            {
+                return 14;
+            }
+            else
+            {
+                return static_cast<std::uint8_t>(ordinal(piece));
+            }
+        }
+
+        [[nodiscard]] FORCEINLINE constexpr std::uint8_t compressKing(const Position& position, Square sq, Piece piece)
+        {
+            const Color color = piece.color();
+            const Color sideToMove = position.sideToMove();
+
+            if (color == Color::White)
+            {
+                return 10;
+            }
+            else if (sideToMove == Color::White)
+            {
+                return 11;
+            }
+            else
+            {
+                return 15;
+            }
+        }
+    }
+
+    namespace detail::lookup
+    {
+        static constexpr EnumArray<PieceType, std::uint8_t(*)(const Position&, Square, Piece)> pieceCompressorFunc = []() {
+            EnumArray<PieceType, std::uint8_t(*)(const Position&, Square, Piece)> pieceCompressorFunc{};
+
+            pieceCompressorFunc[PieceType::Knight] = detail::compressOrdinaryPiece;
+            pieceCompressorFunc[PieceType::Bishop] = detail::compressOrdinaryPiece;
+            pieceCompressorFunc[PieceType::Queen] = detail::compressOrdinaryPiece;
+
+            pieceCompressorFunc[PieceType::Pawn] = detail::compressPawn;
+            pieceCompressorFunc[PieceType::Rook] = detail::compressRook;
+            pieceCompressorFunc[PieceType::King] = detail::compressKing;
+
+            pieceCompressorFunc[PieceType::None] = [](const Position&, Square, Piece) -> std::uint8_t { /* should never happen */ return 0; };
+
+            return pieceCompressorFunc;
+        }();
+    }
+
+    [[nodiscard]] inline CompressedPosition Position::compress() const
+    {
+        auto compressPiece = [this](Square sq, Piece piece) -> std::uint8_t {
+            if (piece.type() == PieceType::Pawn) // it's likely to be a pawn
+            {
+                return detail::compressPawn(*this, sq, piece);
+            }
+            else
+            {
+                return detail::lookup::pieceCompressorFunc[piece.type()](*this, sq, piece);
+            }
+        };
+
+        const Bitboard occ = piecesBB();
+
+        CompressedPosition compressed;
+        compressed.m_occupied = occ;
+
+        auto it = occ.begin();
+        auto end = occ.end();
+        for (int i = 0;; ++i)
+        {
+            if (it == end) break;
+            compressed.m_packedState[i] = compressPiece(*it, pieceAt(*it));
+            ++it;
+
+            if (it == end) break;
+            compressed.m_packedState[i] |= compressPiece(*it, pieceAt(*it)) << 4;
+            ++it;
+        }
+
+        return compressed;
+    }
+
+    [[nodiscard]] inline Position CompressedPosition::decompress() const
+    {
+        Position pos;
+        pos.setCastlingRights(CastlingRights::None);
+
+        auto decompressPiece = [&pos](Square sq, std::uint8_t nibble) {
+            switch (nibble)
+            {
+            case 0:
+            case 1:
+            case 2:
+            case 3:
+            case 4:
+            case 5:
+            case 6:
+            case 7:
+            case 8:
+            case 9:
+            case 10:
+            case 11:
+            {
+                pos.place(fromOrdinal<Piece>(nibble), sq);
+                return;
+            }
+
+            case 12:
+            {
+                const Rank rank = sq.rank();
+                if (rank == rank4)
+                {
+                    pos.place(whitePawn, sq);
+                    pos.setEpSquareUnchecked(sq + Offset{ 0, -1 });
+                }
+                else // (rank == rank5)
+                {
+                    pos.place(blackPawn, sq);
+                    pos.setEpSquareUnchecked(sq + Offset{ 0, 1 });
+                }
+                return;
+            }
+
+            case 13:
+            {
+                pos.place(whiteRook, sq);
+                if (sq == a1)
+                {
+                    pos.addCastlingRights(CastlingRights::WhiteQueenSide);
+                }
+                else // (sq == H1)
+                {
+                    pos.addCastlingRights(CastlingRights::WhiteKingSide);
+                }
+                return;
+            }
+
+            case 14:
+            {
+                pos.place(blackRook, sq);
+                if (sq == a8)
+                {
+                    pos.addCastlingRights(CastlingRights::BlackQueenSide);
+                }
+                else // (sq == H8)
+                {
+                    pos.addCastlingRights(CastlingRights::BlackKingSide);
+                }
+                return;
+            }
+
+            case 15:
+            {
+                pos.place(blackKing, sq);
+                pos.setSideToMove(Color::Black);
+                return;
+            }
+
+            }
+
+            return;
+        };
+
+        const Bitboard occ = m_occupied;
+
+        auto it = occ.begin();
+        auto end = occ.end();
+        for (int i = 0;; ++i)
+        {
+            if (it == end) break;
+            decompressPiece(*it, m_packedState[i] & 0xF);
+            ++it;
+
+            if (it == end) break;
+            decompressPiece(*it, m_packedState[i] >> 4);
+            ++it;
+        }
+
+        return pos;
+    }
+
+
+    [[nodiscard]] bool Board::isSquareAttacked(Square sq, Color attackerColor) const
+    {
+        assert(sq.isOk());
+
+        const Bitboard occupied = piecesBB();
+        const Bitboard bishops = piecesBB(Piece(PieceType::Bishop, attackerColor));
+        const Bitboard rooks = piecesBB(Piece(PieceType::Rook, attackerColor));
+        const Bitboard queens = piecesBB(Piece(PieceType::Queen, attackerColor));
+
+        const Bitboard allSliders = (bishops | rooks | queens);
+        if ((bb::pseudoAttacks<PieceType::Queen>(sq) & allSliders).any())
+        {
+            if (bb::isAttackedBySlider(
+                sq,
+                bishops,
+                rooks,
+                queens,
+                occupied
+            ))
+            {
+                return true;
+            }
+        }
+
+        const Bitboard king = piecesBB(Piece(PieceType::King, attackerColor));
+        if ((bb::pseudoAttacks<PieceType::King>(sq) & king).any())
+        {
+            return true;
+        }
+
+        const Bitboard knights = piecesBB(Piece(PieceType::Knight, attackerColor));
+        if ((bb::pseudoAttacks<PieceType::Knight>(sq) & knights).any())
+        {
+            return true;
+        }
+
+        const Bitboard pawns = piecesBB(Piece(PieceType::Pawn, attackerColor));
+        const Bitboard pawnAttacks = bb::pawnAttacks(pawns, attackerColor);
+
+        return pawnAttacks.isSet(sq);
+    }
+
+    [[nodiscard]] bool Board::isSquareAttackedAfterMove(Move move, Square sq, Color attackerColor) const
+    {
+        const Bitboard occupiedChange = Bitboard::square(move.from) | move.to;
+
+        Bitboard occupied = (piecesBB() ^ move.from) | move.to;
+
+        Bitboard bishops = piecesBB(Piece(PieceType::Bishop, attackerColor));
+        Bitboard rooks = piecesBB(Piece(PieceType::Rook, attackerColor));
+        Bitboard queens = piecesBB(Piece(PieceType::Queen, attackerColor));
+        Bitboard king = piecesBB(Piece(PieceType::King, attackerColor));
+        Bitboard knights = piecesBB(Piece(PieceType::Knight, attackerColor));
+        Bitboard pawns = piecesBB(Piece(PieceType::Pawn, attackerColor));
+
+        if (move.type == MoveType::EnPassant)
+        {
+            const Square capturedPawnSq(move.to.file(), move.from.rank());
+            occupied ^= capturedPawnSq;
+            pawns ^= capturedPawnSq;
+        }
+        else if (pieceAt(move.to) != Piece::none())
+        {
+            const Bitboard notCaptured = ~Bitboard::square(move.to);
+            bishops &= notCaptured;
+            rooks &= notCaptured;
+            queens &= notCaptured;
+            knights &= notCaptured;
+            pawns &= notCaptured;
+        }
+
+        // Potential attackers may have moved.
+        const Piece movedPiece = pieceAt(move.from);
+        if (movedPiece.color() == attackerColor)
+        {
+            switch (movedPiece.type())
+            {
+            case PieceType::Pawn:
+                pawns ^= occupiedChange;
+                break;
+            case PieceType::Knight:
+                knights ^= occupiedChange;
+                break;
+            case PieceType::Bishop:
+                bishops ^= occupiedChange;
+                break;
+            case PieceType::Rook:
+                rooks ^= occupiedChange;
+                break;
+            case PieceType::Queen:
+                queens ^= occupiedChange;
+                break;
+            case PieceType::King:
+            {
+                if (move.type == MoveType::Castle)
+                {
+                    const CastleType castleType = CastlingTraits::moveCastlingType(move);
+
+                    king ^= move.from;
+                    king ^= CastlingTraits::kingDestination[attackerColor][castleType];
+                    rooks ^= move.to;
+                    rooks ^= CastlingTraits::rookDestination[attackerColor][castleType];
+
+                    break;
+                }
+                else
+                {
+                    king ^= occupiedChange;
+                }
+            }
+            }
+        }
+
+        // If it's a castling move then the change in square occupation
+        // cannot have an effect because otherwise there would be
+        // a slider attacker attacking the castling king.
+        // (It could have an effect in chess960 if the slider
+        // attacker was behind the rook involved in castling,
+        // but we don't care about chess960.)
+
+        const Bitboard allSliders = (bishops | rooks | queens);
+        if ((bb::pseudoAttacks<PieceType::Queen>(sq) & allSliders).any())
+        {
+            if (bb::isAttackedBySlider(
+                sq,
+                bishops,
+                rooks,
+                queens,
+                occupied
+            ))
+            {
+                return true;
+            }
+        }
+
+        if ((bb::pseudoAttacks<PieceType::King>(sq) & king).any())
+        {
+            return true;
+        }
+
+        if ((bb::pseudoAttacks<PieceType::Knight>(sq) & knights).any())
+        {
+            return true;
+        }
+
+        const Bitboard pawnAttacks = bb::pawnAttacks(pawns, attackerColor);
+
+        return pawnAttacks.isSet(sq);
+    }
+
+    [[nodiscard]] bool Board::createsDiscoveredAttackOnOwnKing(Move move) const
+    {
+        Bitboard occupied = (piecesBB() ^ move.from) | move.to;
+
+        const Piece movedPiece = pieceAt(move.from);
+        const Color kingColor = movedPiece.color();
+        const Color attackerColor = !kingColor;
+        const Square ksq = kingSquare(kingColor);
+
+        Bitboard bishops = piecesBB(Piece(PieceType::Bishop, attackerColor));
+        Bitboard rooks = piecesBB(Piece(PieceType::Rook, attackerColor));
+        Bitboard queens = piecesBB(Piece(PieceType::Queen, attackerColor));
+
+        if (move.type == MoveType::EnPassant)
+        {
+            const Square capturedPawnSq(move.to.file(), move.from.rank());
+            occupied ^= capturedPawnSq;
+        }
+        else if (pieceAt(move.to) != Piece::none())
+        {
+            const Bitboard notCaptured = ~Bitboard::square(move.to);
+            bishops &= notCaptured;
+            rooks &= notCaptured;
+            queens &= notCaptured;
+        }
+
+        const Bitboard allSliders = (bishops | rooks | queens);
+        if ((bb::pseudoAttacks<PieceType::Queen>(ksq) & allSliders).any())
+        {
+            if (bb::isAttackedBySlider(
+                ksq,
+                bishops,
+                rooks,
+                queens,
+                occupied
+            ))
+            {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    [[nodiscard]] bool Board::isPieceAttacked(Square sq) const
+    {
+        const Piece piece = pieceAt(sq);
+
+        if (piece == Piece::none())
+        {
+            return false;
+        }
+
+        return isSquareAttacked(sq, !piece.color());
+    }
+
+    [[nodiscard]] bool Board::isPieceAttackedAfterMove(Move move, Square sq) const
+    {
+        const Piece piece = pieceAt(sq);
+
+        if (piece == Piece::none())
+        {
+            return false;
+        }
+
+        if (sq == move.from)
+        {
+            // We moved the piece we're interested in.
+            // For every move the piece ends up on the move.to except
+            // for the case of castling moves.
+            // But we know pseudo legal castling moves
+            // are already legal, so the king cannot be in check after.
+            if (move.type == MoveType::Castle)
+            {
+                return false;
+            }
+
+            // So update the square we're interested in.
+            sq = move.to;
+        }
+
+        return isSquareAttackedAfterMove(move, sq, !piece.color());
+    }
+
+    [[nodiscard]] bool Board::isOwnKingAttackedAfterMove(Move move) const
+    {
+        if (move.type == MoveType::Castle)
+        {
+            // Pseudo legal castling moves are already legal.
+            // This is ensured by the move generator.
+            return false;
+        }
+
+        const Piece movedPiece = pieceAt(move.from);
+
+        return isPieceAttackedAfterMove(move, kingSquare(movedPiece.color()));
+    }
+
+    [[nodiscard]] Bitboard Board::attacks(Square sq) const
+    {
+        const Piece piece = pieceAt(sq);
+        if (piece == Piece::none())
+        {
+            return Bitboard::none();
+        }
+
+        if (piece.type() == PieceType::Pawn)
+        {
+            return bb::pawnAttacks(Bitboard::square(sq), piece.color());
+        }
+        else
+        {
+            return bb::attacks(piece.type(), sq, piecesBB());
+        }
+    }
+
+    [[nodiscard]] Bitboard Board::attackers(Square sq, Color attackerColor) const
+    {
+        // En-passant square is not included.
+
+        Bitboard allAttackers = Bitboard::none();
+
+        const Bitboard occupied = piecesBB();
+
+        const Bitboard bishops = piecesBB(Piece(PieceType::Bishop, attackerColor));
+        const Bitboard rooks = piecesBB(Piece(PieceType::Rook, attackerColor));
+        const Bitboard queens = piecesBB(Piece(PieceType::Queen, attackerColor));
+
+        const Bitboard bishopLikePieces = (bishops | queens);
+        const Bitboard bishopAttacks = bb::attacks<PieceType::Bishop>(sq, occupied);
+        allAttackers |= bishopAttacks & bishopLikePieces;
+
+        const Bitboard rookLikePieces = (rooks | queens);
+        const Bitboard rookAttacks = bb::attacks<PieceType::Rook>(sq, occupied);
+        allAttackers |= rookAttacks & rookLikePieces;
+
+        const Bitboard king = piecesBB(Piece(PieceType::King, attackerColor));
+        allAttackers |= bb::pseudoAttacks<PieceType::King>(sq) & king;
+
+        const Bitboard knights = piecesBB(Piece(PieceType::Knight, attackerColor));
+        allAttackers |= bb::pseudoAttacks<PieceType::Knight>(sq) & knights;
+
+        const Bitboard pawns = piecesBB(Piece(PieceType::Pawn, attackerColor));
+        allAttackers |= bb::pawnAttacks(Bitboard::square(sq), !attackerColor) & pawns;
+
+        return allAttackers;
+    }
+
+    const Piece* Board::piecesRaw() const
+    {
+        return m_pieces.data();
+    }
+
+    namespace detail::lookup
+    {
+        static constexpr EnumArray<Piece, char> fenPiece = []() {
+            EnumArray<Piece, char> fenPiece{};
+
+            fenPiece[whitePawn] = 'P';
+            fenPiece[blackPawn] = 'p';
+            fenPiece[whiteKnight] = 'N';
+            fenPiece[blackKnight] = 'n';
+            fenPiece[whiteBishop] = 'B';
+            fenPiece[blackBishop] = 'b';
+            fenPiece[whiteRook] = 'R';
+            fenPiece[blackRook] = 'r';
+            fenPiece[whiteQueen] = 'Q';
+            fenPiece[blackQueen] = 'q';
+            fenPiece[whiteKing] = 'K';
+            fenPiece[blackKing] = 'k';
+            fenPiece[Piece::none()] = 'X';
+
+            return fenPiece;
+        }();
+    }
+
+    [[nodiscard]] std::string Board::fen() const
+    {
+        std::string fen;
+        fen.reserve(96); // longest fen is probably in range of around 88
+
+        Rank rank = rank8;
+        File file = fileA;
+        std::uint8_t emptyCounter = 0;
+
+        for (;;)
+        {
+            const Square sq(file, rank);
+            const Piece piece = m_pieces[sq];
+
+            if (piece == Piece::none())
+            {
+                ++emptyCounter;
+            }
+            else
+            {
+                if (emptyCounter != 0)
+                {
+                    fen.push_back(static_cast<char>(emptyCounter) + '0');
+                    emptyCounter = 0;
+                }
+
+                fen.push_back(detail::lookup::fenPiece[piece]);
+            }
+
+            ++file;
+            if (file > fileH)
+            {
+                file = fileA;
+                --rank;
+
+                if (emptyCounter != 0)
+                {
+                    fen.push_back(static_cast<char>(emptyCounter) + '0');
+                    emptyCounter = 0;
+                }
+
+                if (rank < rank1)
+                {
+                    break;
+                }
+                fen.push_back('/');
+            }
+        }
+
+        return fen;
+    }
+
+    MoveLegalityChecker::MoveLegalityChecker(const Position& position) :
+        m_position(&position),
+        m_checkers(position.checkers()),
+        m_ourBlockersForKing(
+            position.blockersForKing(position.sideToMove())
+            & position.piecesBB(position.sideToMove())
+        ),
+        m_ksq(position.kingSquare(position.sideToMove()))
+    {
+        if (m_checkers.exactlyOne())
+        {
+            const Bitboard knightCheckers = m_checkers & bb::pseudoAttacks<PieceType::Knight>(m_ksq);
+            if (knightCheckers.any())
+            {
+                // We're checked by a knight, we have to remove it or move the king.
+                m_potentialCheckRemovals = knightCheckers;
+            }
+            else
+            {
+                // If we're not checked by a knight we can block it.
+                m_potentialCheckRemovals = bb::between(m_ksq, m_checkers.first()) | m_checkers;
+            }
+        }
+        else
+        {
+            // Double check, king has to move.
+            m_potentialCheckRemovals = Bitboard::none();
+        }
+    }
+
+    [[nodiscard]] bool MoveLegalityChecker::isPseudoLegalMoveLegal(const Move& move) const
+    {
+        const Piece movedPiece = m_position->pieceAt(move.from);
+
+        if (m_checkers.any())
+        {
+            if (move.from == m_ksq || move.type == MoveType::EnPassant)
+            {
+                return m_position->isPseudoLegalMoveLegal(move);
+            }
+            else
+            {
+                // This means there's only one check and we either
+                // blocked it or removed the piece that attacked
+                // our king. So the only threat is if it's a discovered check.
+                return
+                    m_potentialCheckRemovals.isSet(move.to)
+                    && !m_ourBlockersForKing.isSet(move.from);
+            }
+        }
+        else
+        {
+            if (move.from == m_ksq)
+            {
+                return m_position->isPseudoLegalMoveLegal(move);
+            }
+            else if (move.type == MoveType::EnPassant)
+            {
+                return !m_position->createsDiscoveredAttackOnOwnKing(move);
+            }
+            else if (m_ourBlockersForKing.isSet(move.from))
+            {
+                // If it was a blocker it may have only moved in line with our king.
+                // Otherwise it's a discovered check.
+                return bb::line(m_ksq, move.from).isSet(move.to);
+            }
+            else
+            {
+                return true;
+            }
+        }
+    }
+
+    void Position::set(std::string_view fen)
+    {
+        (void)trySet(fen);
+    }
+
+    // Returns false if the fen was not valid
+    // If the returned value was false the position
+    // is in unspecified state.
+    [[nodiscard]] bool Position::trySet(std::string_view fen)
+    {
+        // Lazily splits by ' '. Returns empty string views if at the end.
+        auto nextPart = [fen, start = std::size_t{ 0 }]() mutable {
+            std::size_t end = fen.find(' ', start);
+            if (end == std::string::npos)
+            {
+                std::string_view substr = fen.substr(start);
+                start = fen.size();
+                return substr;
+            }
+            else
+            {
+                std::string_view substr = fen.substr(start, end - start);
+                start = end + 1; // to skip whitespace
+                return substr;
+            }
+        };
+
+        if (!BaseType::trySet(nextPart())) return false;
+
+        {
+            const auto side = nextPart();
+            if (side == std::string_view("w")) m_sideToMove = Color::White;
+            else if (side == std::string_view("b")) m_sideToMove = Color::Black;
+            else return false;
+
+            if (isSquareAttacked(kingSquare(!m_sideToMove), m_sideToMove)) return false;
+        }
+
+        {
+            const auto castlingRights = nextPart();
+            auto castlingRightsOpt = parser_bits::tryParseCastlingRights(castlingRights);
+            if (!castlingRightsOpt.has_value())
+            {
+                return false;
+            }
+            else
+            {
+                m_castlingRights = *castlingRightsOpt;
+            }
+        }
+
+        {
+            const auto epSquare = nextPart();
+            auto epSquareOpt = parser_bits::tryParseEpSquare(epSquare);
+            if (!epSquareOpt.has_value())
+            {
+                return false;
+            }
+            else
+            {
+                m_epSquare = *epSquareOpt;
+            }
+        }
+
+        {
+            const auto rule50 = nextPart();
+            if (!rule50.empty())
+            {
+                m_rule50Counter = std::stoi(rule50.data());
+            }
+            else
+            {
+                m_rule50Counter = 0;
+            }
+        }
+
+        {
+            const auto halfMove = nextPart();
+            if (!halfMove.empty())
+            {
+                m_ply = std::stoi(halfMove.data()) * 2 - (m_sideToMove == Color::White);
+            }
+            else
+            {
+                m_ply = 0;
+            }
+        }
+
+        nullifyEpSquareIfNotPossible();
+
+        return true;
+    }
+
+    [[nodiscard]] Position Position::fromFen(std::string_view fen)
+    {
+        Position pos{};
+        pos.set(fen);
+        return pos;
+    }
+
+    [[nodiscard]] std::optional<Position> Position::tryFromFen(std::string_view fen)
+    {
+        Position pos{};
+        if (pos.trySet(fen)) return pos;
+        else return {};
+    }
+
+    [[nodiscard]] Position Position::startPosition()
+    {
+        static const Position pos = fromFen("rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1");
+        return pos;
+    }
+
+    [[nodiscard]] std::string Position::fen() const
+    {
+        std::string fen = Board::fen();
+
+        fen += ' ';
+        fen += m_sideToMove == Color::White ? 'w' : 'b';
+
+        fen += ' ';
+        parser_bits::appendCastlingRightsToString(m_castlingRights, fen);
+
+        fen += ' ';
+        parser_bits::appendEpSquareToString(m_epSquare, fen);
+
+        fen += ' ';
+        fen += std::to_string(m_rule50Counter);
+
+        fen += ' ';
+        fen += std::to_string(halfMove());
+
+        return fen;
+    }
+
+    namespace detail::lookup
+    {
+        static constexpr EnumArray<Square, CastlingRights> preservedCastlingRights = []() {
+            EnumArray<Square, CastlingRights> preservedCastlingRights{};
+            for (CastlingRights& rights : preservedCastlingRights)
+            {
+                rights = ~CastlingRights::None;
+            }
+
+            preservedCastlingRights[e1] = ~CastlingRights::White;
+            preservedCastlingRights[e8] = ~CastlingRights::Black;
+
+            preservedCastlingRights[h1] = ~CastlingRights::WhiteKingSide;
+            preservedCastlingRights[a1] = ~CastlingRights::WhiteQueenSide;
+            preservedCastlingRights[h8] = ~CastlingRights::BlackKingSide;
+            preservedCastlingRights[a8] = ~CastlingRights::BlackQueenSide;
+
+            return preservedCastlingRights;
+        }();
+    }
+
+    ReverseMove Position::doMove(const Move& move)
+    {
+        assert(move.from.isOk() && move.to.isOk());
+
+        const PieceType movedPiece = pieceAt(move.from).type();
+
+        m_ply += 1;
+        m_rule50Counter += 1;
+
+        if (move.type != MoveType::Castle && (movedPiece == PieceType::Pawn || pieceAt(move.to) != Piece::none()))
+        {
+            m_rule50Counter = 0;
+        }
+
+        const Square oldEpSquare = m_epSquare;
+        const CastlingRights oldCastlingRights = m_castlingRights;
+        m_castlingRights &= detail::lookup::preservedCastlingRights[move.from];
+        m_castlingRights &= detail::lookup::preservedCastlingRights[move.to];
+
+        m_epSquare = Square::none();
+        // for double pushes move index differs by 16 or -16;
+        if((movedPiece == PieceType::Pawn) & ((ordinal(move.to) ^ ordinal(move.from)) == 16))
+        {
+            const Square potentialEpSquare = fromOrdinal<Square>((ordinal(move.to) + ordinal(move.from)) >> 1);
+            // Even though the move has not yet been made we can safely call
+            // this function and get the right result because the position of the
+            // pawn to be captured is not really relevant.
+            if (isEpPossible(potentialEpSquare, !m_sideToMove))
+            {
+                m_epSquare = potentialEpSquare;
+            }
+        }
+
+        const Piece captured = BaseType::doMove(move);
+        m_sideToMove = !m_sideToMove;
+        return { move, captured, oldEpSquare, oldCastlingRights };
+    }
+
+    [[nodiscard]] bool Position::isCheck() const
+    {
+        return BaseType::isSquareAttacked(kingSquare(m_sideToMove), !m_sideToMove);
+    }
+
+    [[nodiscard]] Bitboard Position::checkers() const
+    {
+        return BaseType::attackers(kingSquare(m_sideToMove), !m_sideToMove);
+    }
+
+    [[nodiscard]] bool Position::isCheckAfterMove(Move move) const
+    {
+        return BaseType::isSquareAttackedAfterMove(move, kingSquare(!m_sideToMove), m_sideToMove);
+    }
+
+    [[nodiscard]] Bitboard Position::blockersForKing(Color color) const
+    {
+        const Color attackerColor = !color;
+
+        const Bitboard occupied = piecesBB();
+
+        const Bitboard bishops = piecesBB(Piece(PieceType::Bishop, attackerColor));
+        const Bitboard rooks = piecesBB(Piece(PieceType::Rook, attackerColor));
+        const Bitboard queens = piecesBB(Piece(PieceType::Queen, attackerColor));
+
+        const Square ksq = kingSquare(color);
+
+        const Bitboard opponentBishopLikePieces = (bishops | queens);
+        const Bitboard bishopPseudoAttacks = bb::pseudoAttacks<PieceType::Bishop>(ksq);
+
+        const Bitboard opponentRookLikePieces = (rooks | queens);
+        const Bitboard rookPseudoAttacks = bb::pseudoAttacks<PieceType::Rook>(ksq);
+
+        const Bitboard xrayers =
+            (bishopPseudoAttacks & opponentBishopLikePieces)
+            | (rookPseudoAttacks & opponentRookLikePieces);
+
+        Bitboard allBlockers = Bitboard::none();
+
+        for (Square xrayer : xrayers)
+        {
+            const Bitboard blockers = bb::between(xrayer, ksq) & occupied;
+            if (blockers.exactlyOne())
+            {
+                allBlockers |= blockers;
+            }
+        }
+
+        return allBlockers;
+    }
+
+    [[nodiscard]] Position Position::afterMove(Move move) const
+    {
+        Position cpy(*this);
+        auto pc = cpy.doMove(move);
+
+        (void)pc;
+        //assert(cpy.beforeMove(move, pc) == *this); // this assert would result in infinite recursion
+
+        return cpy;
+    }
+
+    [[nodiscard]] FORCEINLINE bool Position::isEpPossible(Square epSquare, Color sideToMove) const
+    {
+        const Bitboard pawnsAttackingEpSquare =
+            bb::pawnAttacks(Bitboard::square(epSquare), !sideToMove)
+            & piecesBB(Piece(PieceType::Pawn, sideToMove));
+
+        if (!pawnsAttackingEpSquare.any())
+        {
+            return false;
+        }
+
+        return isEpPossibleColdPath(epSquare, pawnsAttackingEpSquare, sideToMove);
+    }
+
+    [[nodiscard]] NOINLINE bool Position::isEpPossibleColdPath(Square epSquare, Bitboard pawnsAttackingEpSquare, Color sideToMove) const
+    {
+        // only set m_epSquare when it matters, ie. when
+        // the opposite side can actually capture
+        for (Square sq : pawnsAttackingEpSquare)
+        {
+            // If we're here the previous move by other side
+            // was a double pawn move so our king is either not in check
+            // or is attacked only by the moved pawn - in which
+            // case it can be captured by our pawn if it doesn't
+            // create a discovered check on our king.
+            // So overall we only have to check whether our king
+            // ends up being uncovered to a slider attack.
+
+            const Square ksq = kingSquare(sideToMove);
+
+            const Bitboard bishops = piecesBB(Piece(PieceType::Bishop, !sideToMove));
+            const Bitboard rooks = piecesBB(Piece(PieceType::Rook, !sideToMove));
+            const Bitboard queens = piecesBB(Piece(PieceType::Queen, !sideToMove));
+
+            const Bitboard relevantAttackers = bishops | rooks | queens;
+            const Bitboard pseudoSliderAttacksFromKing = bb::pseudoAttacks<PieceType::Queen>(ksq);
+            if ((relevantAttackers & pseudoSliderAttacksFromKing).isEmpty())
+            {
+                // It's enough that one pawn can capture.
+                return true;
+            }
+
+            const Square capturedPawnSq(epSquare.file(), sq.rank());
+            const Bitboard occupied = ((piecesBB() ^ sq) | epSquare) ^ capturedPawnSq;
+
+            if (!bb::isAttackedBySlider(
+                ksq,
+                bishops,
+                rooks,
+                queens,
+                occupied
+            ))
+            {
+                // It's enough that one pawn can capture.
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    void Position::nullifyEpSquareIfNotPossible()
+    {
+        if (m_epSquare != Square::none() && !isEpPossible(m_epSquare, m_sideToMove))
+        {
+            m_epSquare = Square::none();
+        }
+    }
+
+    namespace uci
+    {
+        [[nodiscard]] std::string moveToUci(const Position& pos, const Move& move);
+        [[nodiscard]] Move uciToMove(const Position& pos, std::string_view sv);
+
+        [[nodiscard]] std::optional<Move> tryUciToMove(const Position& pos, std::string_view sv);
+
+        [[nodiscard]] std::string moveToUci(const Position& pos, const Move& move)
+        {
+            std::string s;
+
+            parser_bits::appendSquareToString(move.from, s);
+
+            if (move.type == MoveType::Castle)
+            {
+                const CastleType castleType = CastlingTraits::moveCastlingType(move);
+
+                const Square kingDestination = CastlingTraits::kingDestination[pos.sideToMove()][castleType];
+                parser_bits::appendSquareToString(kingDestination, s);
+            }
+            else
+            {
+                parser_bits::appendSquareToString(move.to, s);
+
+                if (move.type == MoveType::Promotion)
+                {
+                    // lowercase piece symbol
+                    s += EnumTraits<PieceType>::toChar(move.promotedPiece.type(), Color::Black);
+                }
+            }
+
+            return s;
+        }
+
+        [[nodiscard]] Move uciToMove(const Position& pos, std::string_view sv)
+        {
+            const Square from = parser_bits::parseSquare(sv.data());
+            const Square to = parser_bits::parseSquare(sv.data() + 2);
+
+            if (sv.size() == 5)
+            {
+                const PieceType promotedPieceType = *fromChar<PieceType>(sv[4]);
+                return Move::promotion(from, to, Piece(promotedPieceType, pos.sideToMove()));
+            }
+            else
+            {
+                if (
+                    pos.pieceAt(from).type() == PieceType::King
+                    && std::abs(from.file() - to.file()) > 1
+                    )
+                {
+                    // uci king destinations are on files C or G.
+                    const CastleType castleType =
+                        (to.file() == fileG)
+                        ? CastleType::Short
+                        : CastleType::Long;
+
+                    return Move::castle(castleType, pos.sideToMove());
+                }
+                else if (pos.epSquare() == to)
+                {
+                    return Move::enPassant(from, to);
+                }
+                else
+                {
+                    return Move::normal(from, to);
+                }
+            }
+        }
+
+        [[nodiscard]] std::optional<Move> tryUciToMove(const Position& pos, std::string_view sv)
+        {
+            if (sv.size() < 4 || sv.size() > 5)
+            {
+                return std::nullopt;
+            }
+
+            const auto from = parser_bits::tryParseSquare(sv.substr(0, 2));
+            const auto to = parser_bits::tryParseSquare(sv.substr(2, 2));
+
+            Move move{};
+
+            if (!from.has_value() || !to.has_value())
+            {
+                return std::nullopt;
+            }
+
+            if (sv.size() == 5)
+            {
+                const auto promotedPieceType = fromChar<PieceType>(sv[4]);
+                if (!promotedPieceType.has_value())
+                {
+                    return std::nullopt;
+                }
+
+                if (
+                    *promotedPieceType != PieceType::Knight
+                    && *promotedPieceType != PieceType::Bishop
+                    && *promotedPieceType != PieceType::Rook
+                    && *promotedPieceType != PieceType::Queen
+                    )
+                {
+                    return std::nullopt;
+                }
+
+                move = Move::promotion(*from, *to, Piece(*promotedPieceType, pos.sideToMove()));
+            }
+            else // sv.size() == 4
+            {
+
+                if (
+                    pos.pieceAt(*from).type() == PieceType::King
+                    && std::abs(from->file() - to->file()) > 1
+                    )
+                {
+                    // uci king destinations are on files C or G.
+
+                    if (pos.sideToMove() == Color::White)
+                    {
+                        if (*from != e1)
+                        {
+                            return std::nullopt;
+                        }
+
+                        if (*to != c1 && *to != g1)
+                        {
+                            return std::nullopt;
+                        }
+                    }
+                    else
+                    {
+                        if (*from != e8)
+                        {
+                            return std::nullopt;
+                        }
+
+                        if (*to != c8 && *to != g8)
+                        {
+                            return std::nullopt;
+                        }
+                    }
+
+                    const CastleType castleType =
+                        (to->file() == fileG)
+                        ? CastleType::Short
+                        : CastleType::Long;
+
+                    move = Move::castle(castleType, pos.sideToMove());
+                }
+                else if (to == pos.epSquare())
+                {
+                    move = Move::enPassant(*from, *to);
+                }
+                else
+                {
+                    move = Move::normal(*from, *to);
+                }
+            }
+
+            if (!pos.isMoveLegal(move))
+            {
+                return std::nullopt;
+            }
+
+            return move;
+        }
+    }
+}
+
+namespace binpack
+{
+    constexpr std::size_t KiB = 1024;
+    constexpr std::size_t MiB = (1024*KiB);
+    constexpr std::size_t GiB = (1024*MiB);
+
+    constexpr std::size_t suggestedChunkSize = MiB;
+    constexpr std::size_t maxMovelistSize = 10*KiB; // a safe upper bound
+    constexpr std::size_t maxChunkSize = 100*MiB; // to prevent malformed files from causing huge allocations
+
+    using namespace std::literals;
+
+    namespace nodchip
+    {
+        // This namespace contains modified code from https://github.com/nodchip/Stockfish
+        // which is released under GPL v3 license https://www.gnu.org/licenses/gpl-3.0.html
+
+        using namespace std;
+
+        struct StockfishMove
+        {
+            [[nodiscard]] static StockfishMove fromMove(chess::Move move)
+            {
+                StockfishMove sfm;
+
+                sfm.m_raw = 0;
+
+                unsigned moveFlag = 0;
+                if (move.type == chess::MoveType::Promotion) moveFlag = 1;
+                else if (move.type == chess::MoveType::EnPassant) moveFlag = 2;
+                else if (move.type == chess::MoveType::Castle) moveFlag = 3;
+
+                unsigned promotionIndex = 0;
+                if (move.type == chess::MoveType::Promotion)
+                {
+                    promotionIndex = static_cast<int>(move.promotedPiece.type()) - static_cast<int>(chess::PieceType::Knight);
+                }
+
+                sfm.m_raw |= static_cast<std::uint16_t>(moveFlag);
+                sfm.m_raw <<= 2;
+                sfm.m_raw |= static_cast<std::uint16_t>(promotionIndex);
+                sfm.m_raw <<= 6;
+                sfm.m_raw |= static_cast<int>(move.from);
+                sfm.m_raw <<= 6;
+                sfm.m_raw |= static_cast<int>(move.to);
+
+                return sfm;
+            }
+
+            [[nodiscard]] chess::Move toMove() const
+            {
+                const chess::Square to = static_cast<chess::Square>((m_raw & (0b111111 << 0) >> 0));
+                const chess::Square from = static_cast<chess::Square>((m_raw & (0b111111 << 6)) >> 6);
+
+                const unsigned promotionIndex = (m_raw & (0b11 << 12)) >> 12;
+                const chess::PieceType promotionType = static_cast<chess::PieceType>(static_cast<int>(chess::PieceType::Knight) + promotionIndex);
+
+                const unsigned moveFlag = (m_raw & (0b11 << 14)) >> 14;
+                chess::MoveType type = chess::MoveType::Normal;
+                if (moveFlag == 1) type = chess::MoveType::Promotion;
+                else if (moveFlag == 2) type = chess::MoveType::EnPassant;
+                else if (moveFlag == 3) type = chess::MoveType::Castle;
+
+                if (type == chess::MoveType::Promotion)
+                {
+                    const chess::Color stm = to.rank() == chess::rank8 ? chess::Color::White : chess::Color::Black;
+                    return chess::Move{from, to, type, chess::Piece(promotionType, stm)};
+                }
+
+                return chess::Move{from, to, type};
+            }
+
+        private:
+            std::uint16_t m_raw;
+        };
+        static_assert(sizeof(StockfishMove) == sizeof(std::uint16_t));
+
+        struct PackedSfen
+        {
+            uint8_t data[32];
+        };
+
+        struct PackedSfenValue
+        {
+            // phase
+            PackedSfen sfen;
+
+            // Evaluation value returned from Learner::search()
+            int16_t score;
+
+            // PV first move
+            // Used when finding the match rate with the teacher
+            StockfishMove move;
+
+            // Trouble of the phase from the initial phase.
+            uint16_t gamePly;
+
+            // 1 if the player on this side ultimately wins the game. -1 if you are losing.
+            // 0 if a draw is reached.
+            // The draw is in the teacher position generation command gensfen,
+            // Only write if LEARN_GENSFEN_DRAW_RESULT is enabled.
+            int8_t game_result;
+
+            // When exchanging the file that wrote the teacher aspect with other people
+            //Because this structure size is not fixed, pad it so that it is 40 bytes in any environment.
+            uint8_t padding;
+
+            // 32 + 2 + 2 + 2 + 1 + 1 = 40bytes
+        };
+        static_assert(sizeof(PackedSfenValue) == 40);
+        // Class that handles bitstream
+
+        // useful when doing aspect encoding
+        struct BitStream
+        {
+            // Set the memory to store the data in advance.
+            // Assume that memory is cleared to 0.
+            void  set_data(uint8_t* data_) { data = data_; reset(); }
+
+            // Get the pointer passed in set_data().
+            uint8_t* get_data() const { return data; }
+
+            // Get the cursor.
+            int get_cursor() const { return bit_cursor; }
+
+            // reset the cursor
+            void reset() { bit_cursor = 0; }
+
+            // Write 1bit to the stream.
+            // If b is non-zero, write out 1. If 0, write 0.
+            void write_one_bit(int b)
+            {
+                if (b)
+                    data[bit_cursor / 8] |= 1 << (bit_cursor & 7);
+
+                ++bit_cursor;
+            }
+
+            // Get 1 bit from the stream.
+            int read_one_bit()
+            {
+                int b = (data[bit_cursor / 8] >> (bit_cursor & 7)) & 1;
+                ++bit_cursor;
+
+                return b;
+            }
+
+            // write n bits of data
+            // Data shall be written out from the lower order of d.
+            void write_n_bit(int d, int n)
+            {
+                for (int i = 0; i <n; ++i)
+                    write_one_bit(d & (1 << i));
+            }
+
+            // read n bits of data
+            // Reverse conversion of write_n_bit().
+            int read_n_bit(int n)
+            {
+                int result = 0;
+                for (int i = 0; i < n; ++i)
+                    result |= read_one_bit() ? (1 << i) : 0;
+
+                return result;
+            }
+
+        private:
+            // Next bit position to read/write.
+            int bit_cursor;
+
+            // data entity
+            uint8_t* data;
+        };
+
+
+        // Huffman coding
+        // * is simplified from mini encoding to make conversion easier.
+        //
+        // 1 box on the board (other than NO_PIECE) = 2 to 6 bits (+ 1-bit flag + 1-bit forward and backward)
+        // 1 piece of hand piece = 1-5bit (+ 1-bit flag + 1bit ahead and behind)
+        //
+        // empty xxxxx0 + 0 (none)
+        // step xxxx01 + 2 xxxx0 + 2
+        // incense xx0011 + 2 xx001 + 2
+        // Katsura xx1011 + 2 xx101 + 2
+        // silver xx0111 + 2 xx011 + 2
+        // Gold x01111 + 1 x0111 + 1 // Gold is valid and has no flags.
+        // corner 011111 + 2 01111 + 2
+        // Fly 111111 + 2 11111 + 2
+        //
+        // Assuming all pieces are on the board,
+        // Sky 81-40 pieces = 41 boxes = 41bit
+        // Walk 4bit*18 pieces = 72bit
+        // Incense 6bit*4 pieces = 24bit
+        // Katsura 6bit*4 pieces = 24bit
+        // Silver 6bit*4 pieces = 24bit
+        // Gold 6bit* 4 pieces = 24bit
+        // corner 8bit* 2 pieces = 16bit
+        // Fly 8bit* 2 pieces = 16bit
+        // -------
+        // 241bit + 1bit (turn) + 7bit × 2 (King's position after) = 256bit
+        //
+        // When the piece on the board moves to the hand piece, the piece on the board becomes empty, so the box on the board can be expressed with 1 bit,
+        // Since the hand piece can be expressed by 1 bit less than the piece on the board, the total number of bits does not change in the end.
+        // Therefore, in this expression, any aspect can be expressed by this bit number.
+        // It is a hand piece and no flag is required, but if you include this, the bit number of the piece on the board will be -1
+        // Since the total number of bits can be fixed, we will include this as well.
+
+        // Huffman Encoding
+        //
+        // Empty  xxxxxxx0
+        // Pawn   xxxxx001 + 1 bit (Side to move)
+        // Knight xxxxx011 + 1 bit (Side to move)
+        // Bishop xxxxx101 + 1 bit (Side to move)
+        // Rook   xxxxx111 + 1 bit (Side to move)
+
+        struct HuffmanedPiece
+        {
+            int code; // how it will be coded
+            int bits; // How many bits do you have
+        };
+
+        // NOTE: Order adjusted for this library because originally NO_PIECE had index 0
+        constexpr HuffmanedPiece huffman_table[] =
+        {
+            {0b0001,4}, // PAWN     1
+            {0b0011,4}, // KNIGHT   3
+            {0b0101,4}, // BISHOP   5
+            {0b0111,4}, // ROOK     7
+            {0b1001,4}, // QUEEN    9
+            {-1,-1},    // KING - unused
+            {0b0000,1}, // NO_PIECE 0
+        };
+
+        // Class for compressing/decompressing sfen
+        // sfen can be packed to 256bit (32bytes) by Huffman coding.
+        // This is proven by mini. The above is Huffman coding.
+        //
+        // Internal format = 1-bit turn + 7-bit king position *2 + piece on board (Huffman coding) + hand piece (Huffman coding)
+        // Side to move (White = 0, Black = 1) (1bit)
+        // White King Position (6 bits)
+        // Black King Position (6 bits)
+        // Huffman Encoding of the board
+        // Castling availability (1 bit x 4)
+        // En passant square (1 or 1 + 6 bits)
+        // Rule 50 (6 bits)
+        // Game play (8 bits)
+        //
+        // TODO(someone): Rename SFEN to FEN.
+        //
+        struct SfenPacker
+        {
+            // Pack sfen and store in data[32].
+            void pack(const chess::Position& pos)
+            {
+                memset(data, 0, 32 /* 256bit */);
+                stream.set_data(data);
+
+                // turn
+                // Side to move.
+                stream.write_one_bit((int)(pos.sideToMove()));
+
+                // 7-bit positions for leading and trailing balls
+                // White king and black king, 6 bits for each.
+                stream.write_n_bit(static_cast<int>(pos.kingSquare(chess::Color::White)), 6);
+                stream.write_n_bit(static_cast<int>(pos.kingSquare(chess::Color::Black)), 6);
+
+                // Write the pieces on the board other than the kings.
+                for (chess::Rank r = chess::rank8; r >= chess::rank1; --r)
+                {
+                    for (chess::File f = chess::fileA; f <= chess::fileH; ++f)
+                    {
+                        chess::Piece pc = pos.pieceAt(chess::Square(f, r));
+                        if (pc.type() == chess::PieceType::King)
+                            continue;
+                        write_board_piece_to_stream(pc);
+                    }
+                }
+
+                // TODO(someone): Support chess960.
+                auto cr = pos.castlingRights();
+                stream.write_one_bit(contains(cr, chess::CastlingRights::WhiteKingSide));
+                stream.write_one_bit(contains(cr, chess::CastlingRights::WhiteQueenSide));
+                stream.write_one_bit(contains(cr, chess::CastlingRights::BlackKingSide));
+                stream.write_one_bit(contains(cr, chess::CastlingRights::BlackQueenSide));
+
+                if (pos.epSquare() == chess::Square::none()) {
+                    stream.write_one_bit(0);
+                }
+                else {
+                    stream.write_one_bit(1);
+                    stream.write_n_bit(static_cast<int>(pos.epSquare()), 6);
+                }
+
+                stream.write_n_bit(pos.rule50Counter(), 6);
+
+                stream.write_n_bit(pos.halfMove(), 8);
+
+                assert(stream.get_cursor() <= 256);
+            }
+
+            // sfen packed by pack() (256bit = 32bytes)
+            // Or sfen to decode with unpack()
+            uint8_t *data; // uint8_t[32];
+
+            BitStream stream;
+
+            // Output the board pieces to stream.
+            void write_board_piece_to_stream(chess::Piece pc)
+            {
+                // piece type
+                chess::PieceType pr = pc.type();
+                auto c = huffman_table[static_cast<int>(pr)];
+                stream.write_n_bit(c.code, c.bits);
+
+                if (pc == chess::Piece::none())
+                    return;
+
+                // first and second flag
+                stream.write_one_bit(static_cast<int>(pc.color()));
+            }
+
+            // Read one board piece from stream
+            [[nodiscard]] chess::Piece read_board_piece_from_stream()
+            {
+                int pr = static_cast<int>(chess::PieceType::None);
+                int code = 0, bits = 0;
+                while (true)
+                {
+                    code |= stream.read_one_bit() << bits;
+                    ++bits;
+
+                    assert(bits <= 6);
+
+                    for (pr = static_cast<int>(chess::PieceType::Pawn); pr <= static_cast<int>(chess::PieceType::None); ++pr)
+                        if (huffman_table[pr].code == code
+                            && huffman_table[pr].bits == bits)
+                            goto Found;
+                }
+            Found:;
+                if (pr == static_cast<int>(chess::PieceType::None))
+                    return chess::Piece::none();
+
+                // first and second flag
+                chess::Color c = (chess::Color)stream.read_one_bit();
+
+                return chess::Piece(static_cast<chess::PieceType>(pr), c);
+            }
+        };
+
+
+        [[nodiscard]] chess::Position pos_from_packed_sfen(const PackedSfen& sfen)
+        {
+            SfenPacker packer;
+            auto& stream = packer.stream;
+            stream.set_data((uint8_t*)&sfen);
+
+            chess::Position pos{};
+
+            // Active color
+            pos.setSideToMove((chess::Color)stream.read_one_bit());
+
+            // First the position of the ball
+            pos.place(chess::Piece(chess::PieceType::King, chess::Color::White), static_cast<chess::Square>(stream.read_n_bit(6)));
+            pos.place(chess::Piece(chess::PieceType::King, chess::Color::Black), static_cast<chess::Square>(stream.read_n_bit(6)));
+
+            // Piece placement
+            for (chess::Rank r = chess::rank8; r >= chess::rank1; --r)
+            {
+                for (chess::File f = chess::fileA; f <= chess::fileH; ++f)
+                {
+                    auto sq = chess::Square(f, r);
+
+                    // it seems there are already balls
+                    chess::Piece pc;
+                    if (pos.pieceAt(sq).type() != chess::PieceType::King)
+                    {
+                        assert(pos.pieceAt(sq) == chess::Piece::none());
+                        pc = packer.read_board_piece_from_stream();
+                    }
+                    else
+                    {
+                        pc = pos.pieceAt(sq);
+                    }
+
+                    // There may be no pieces, so skip in that case.
+                    if (pc == chess::Piece::none())
+                        continue;
+
+                    if (pc.type() != chess::PieceType::King)
+                    {
+                        pos.place(pc, sq);
+                    }
+
+                    assert(stream.get_cursor() <= 256);
+                }
+            }
+
+            // Castling availability.
+            chess::CastlingRights cr = chess::CastlingRights::None;
+            if (stream.read_one_bit()) {
+                cr |= chess::CastlingRights::WhiteKingSide;
+            }
+            if (stream.read_one_bit()) {
+                cr |= chess::CastlingRights::WhiteQueenSide;
+            }
+            if (stream.read_one_bit()) {
+                cr |= chess::CastlingRights::BlackKingSide;
+            }
+            if (stream.read_one_bit()) {
+                cr |= chess::CastlingRights::BlackQueenSide;
+            }
+            pos.setCastlingRights(cr);
+
+            // En passant square. Ignore if no pawn capture is possible
+            if (stream.read_one_bit()) {
+                chess::Square ep_square = static_cast<chess::Square>(stream.read_n_bit(6));
+                pos.setEpSquare(ep_square);
+            }
+
+            // Halfmove clock
+            pos.setRule50Counter(stream.read_n_bit(6));
+
+            // Fullmove number
+            pos.setHalfMove(stream.read_n_bit(8));
+
+            assert(stream.get_cursor() <= 256);
+
+            return pos;
+        }
+    }
+
+    struct CompressedTrainingDataFile
+    {
+        struct Header
+        {
+            std::uint32_t chunkSize;
+        };
+
+        CompressedTrainingDataFile(std::string path, std::ios_base::openmode om = std::ios_base::app) :
+            m_path(std::move(path)),
+            m_file(m_path, std::ios_base::binary | std::ios_base::in | std::ios_base::out | om)
+        {
+        }
+
+        void append(const char* data, std::uint32_t size)
+        {
+            writeChunkHeader({size});
+            m_file.write(data, size);
+        }
+
+        [[nodiscard]] bool hasNextChunk()
+        {
+            m_file.peek();
+            return !m_file.eof();
+        }
+
+        [[nodiscard]] std::vector<unsigned char> readNextChunk()
+        {
+            auto size = readChunkHeader().chunkSize;
+            std::vector<unsigned char> data(size);
+            m_file.read(reinterpret_cast<char*>(data.data()), size);
+            return data;
+        }
+
+    private:
+        std::string m_path;
+        std::fstream m_file;
+
+        void writeChunkHeader(Header h)
+        {
+            unsigned char header[8];
+            header[0] = 'B';
+            header[1] = 'I';
+            header[2] = 'N';
+            header[3] = 'P';
+            header[4] = h.chunkSize;
+            header[5] = h.chunkSize >> 8;
+            header[6] = h.chunkSize >> 16;
+            header[7] = h.chunkSize >> 24;
+            m_file.write(reinterpret_cast<const char*>(header), 8);
+        }
+
+        [[nodiscard]] Header readChunkHeader()
+        {
+            unsigned char header[8];
+            m_file.read(reinterpret_cast<char*>(header), 8);
+            if (header[0] != 'B' || header[1] != 'I' || header[2] != 'N' || header[3] != 'P')
+            {
+                assert(false);
+                // throw std::runtime_error("Invalid binpack file or chunk.");
+            }
+
+            const std::uint32_t size =
+                header[4]
+                | (header[5] << 8)
+                | (header[6] << 16)
+                | (header[7] << 24);
+
+            if (size > maxChunkSize)
+            {
+                assert(false);
+                // throw std::runtime_error("Chunks size larger than supported. Malformed file?");
+            }
+
+            return { size };
+        }
+    };
+
+    [[nodiscard]] inline std::uint16_t signedToUnsigned(std::int16_t a)
+    {
+        std::uint16_t r;
+        std::memcpy(&r, &a, sizeof(std::uint16_t));
+        if (r & 0x8000)
+        {
+            r ^= 0x7FFF;
+        }
+        r = (r << 1) | (r >> 15);
+        return r;
+    }
+
+    [[nodiscard]] inline std::int16_t unsignedToSigned(std::uint16_t r)
+    {
+        std::int16_t a;
+        r = (r << 15) | (r >> 1);
+        if (r & 0x8000)
+        {
+            r ^= 0x7FFF;
+        }
+        std::memcpy(&a, &r, sizeof(std::uint16_t));
+        return a;
+    }
+
+    struct TrainingDataEntry
+    {
+        chess::Position pos;
+        chess::Move move;
+        std::int16_t score;
+        std::uint16_t ply;
+        std::int16_t result;
+    };
+
+    [[nodiscard]] inline TrainingDataEntry packedSfenValueToTrainingDataEntry(const nodchip::PackedSfenValue& psv)
+    {
+        TrainingDataEntry ret;
+
+        ret.pos = nodchip::pos_from_packed_sfen(psv.sfen);
+        ret.move = psv.move.toMove();
+        ret.score = psv.score;
+        ret.ply = psv.gamePly;
+        ret.result = psv.game_result;
+
+        return ret;
+    }
+
+    [[nodiscard]] inline nodchip::PackedSfenValue trainingDataEntryToPackedSfenValue(const TrainingDataEntry& plain)
+    {
+        nodchip::PackedSfenValue ret;
+
+        nodchip::SfenPacker sp;
+        sp.data = reinterpret_cast<uint8_t*>(&ret.sfen);
+        sp.pack(plain.pos);
+
+        ret.score = plain.score;
+        ret.move = nodchip::StockfishMove::fromMove(plain.move);
+        ret.gamePly = plain.ply;
+        ret.game_result = plain.result;
+        ret.padding = 0xff; // for consistency with the .bin format.
+
+        return ret;
+    }
+
+    [[nodiscard]] inline bool isContinuation(const TrainingDataEntry& lhs, const TrainingDataEntry& rhs)
+    {
+        return
+            lhs.result == -rhs.result
+            && lhs.ply + 1 == rhs.ply
+            && lhs.pos.afterMove(lhs.move) == rhs.pos;
+    }
+
+    struct PackedTrainingDataEntry
+    {
+        unsigned char bytes[32];
+    };
+
+    [[nodiscard]] inline std::size_t usedBitsSafe(std::size_t value)
+    {
+        if (value == 0) return 0;
+        return chess::util::usedBits(value - 1);
+    }
+
+    static constexpr std::size_t scoreVleBlockSize = 4;
+
+    struct PackedMoveScoreListReader
+    {
+        TrainingDataEntry entry;
+        std::uint16_t numPlies;
+        unsigned char* movetext;
+
+        PackedMoveScoreListReader(const TrainingDataEntry& entry, unsigned char* movetext, std::uint16_t numPlies) :
+            entry(entry),
+            movetext(movetext),
+            numPlies(numPlies),
+            m_lastScore(-entry.score)
+        {
+
+        }
+
+        [[nodiscard]] std::uint8_t extractBitsLE8(std::size_t count)
+        {
+            if (count == 0) return 0;
+
+            if (m_readBitsLeft == 0)
+            {
+                m_readOffset += 1;
+                m_readBitsLeft = 8;
+            }
+
+            const std::uint8_t byte = movetext[m_readOffset] << (8 - m_readBitsLeft);
+            std::uint8_t bits = byte >> (8 - count);
+
+            if (count > m_readBitsLeft)
+            {
+                const auto spillCount = count - m_readBitsLeft;
+                bits |= movetext[m_readOffset + 1] >> (8 - spillCount);
+
+                m_readBitsLeft += 8;
+                m_readOffset += 1;
+            }
+
+            m_readBitsLeft -= count;
+
+            return bits;
+        }
+
+        [[nodiscard]] std::uint16_t extractVle16(std::size_t blockSize)
+        {
+            auto mask = (1 << blockSize) - 1;
+            std::uint16_t v = 0;
+            std::size_t offset = 0;
+            for(;;)
+            {
+                std::uint16_t block = extractBitsLE8(blockSize + 1);
+                v |= ((block & mask) << offset);
+                if (!(block >> blockSize))
+                {
+                    break;
+                }
+
+                offset += blockSize;
+            }
+            return v;
+        }
+
+        [[nodiscard]] TrainingDataEntry nextEntry()
+        {
+            entry.pos.doMove(entry.move);
+            auto [move, score] = nextMoveScore(entry.pos);
+            entry.move = move;
+            entry.score = score;
+            entry.ply += 1;
+            entry.result = -entry.result;
+            return entry;
+        }
+
+        [[nodiscard]] bool hasNext() const
+        {
+            return m_numReadPlies < numPlies;
+        }
+
+        [[nodiscard]] std::pair<chess::Move, std::int16_t> nextMoveScore(const chess::Position& pos)
+        {
+            chess::Move move;
+            std::int16_t score;
+
+            const chess::Color sideToMove = pos.sideToMove();
+            const chess::Bitboard ourPieces = pos.piecesBB(sideToMove);
+            const chess::Bitboard theirPieces = pos.piecesBB(!sideToMove);
+            const chess::Bitboard occupied = ourPieces | theirPieces;
+
+            const auto pieceId = extractBitsLE8(usedBitsSafe(ourPieces.count()));
+            const auto from = chess::Square(chess::nthSetBitIndex(ourPieces.bits(), pieceId));
+
+            const auto pt = pos.pieceAt(from).type();
+            switch (pt)
+            {
+            case chess::PieceType::Pawn:
+            {
+                const chess::Rank promotionRank = pos.sideToMove() == chess::Color::White ? chess::rank7 : chess::rank2;
+                const chess::Rank startRank = pos.sideToMove() == chess::Color::White ? chess::rank2 : chess::rank7;
+                const auto forward = sideToMove == chess::Color::White ? chess::FlatSquareOffset(0, 1) : chess::FlatSquareOffset(0, -1);
+
+                const chess::Square epSquare = pos.epSquare();
+
+                chess::Bitboard attackTargets = theirPieces;
+                if (epSquare != chess::Square::none())
+                {
+                    attackTargets |= epSquare;
+                }
+
+                chess::Bitboard destinations = chess::bb::pawnAttacks(chess::Bitboard::square(from), sideToMove) & attackTargets;
+
+                const chess::Square sqForward = from + forward;
+                if (!occupied.isSet(sqForward))
+                {
+                    destinations |= sqForward;
+
+                    const chess::Square sqForward2 = sqForward + forward;
+                    if (
+                        from.rank() == startRank
+                        && !occupied.isSet(sqForward2)
+                        )
+                    {
+                        destinations |= sqForward2;
+                    }
+                }
+
+                const auto destinationsCount = destinations.count();
+                if (from.rank() == promotionRank)
+                {
+                    const auto moveId = extractBitsLE8(usedBitsSafe(destinationsCount * 4ull));
+                    const chess::Piece promotedPiece = chess::Piece(
+                        chess::fromOrdinal<chess::PieceType>(ordinal(chess::PieceType::Knight) + (moveId % 4ull)),
+                        sideToMove
+                    );
+                    const auto to = chess::Square(chess::nthSetBitIndex(destinations.bits(), moveId / 4ull));
+
+                    move = chess::Move::promotion(from, to, promotedPiece);
+                    break;
+                }
+                else
+                {
+                    auto moveId = extractBitsLE8(usedBitsSafe(destinationsCount));
+                    const auto to = chess::Square(chess::nthSetBitIndex(destinations.bits(), moveId));
+                    if (to == epSquare)
+                    {
+                        move = chess::Move::enPassant(from, to);
+                        break;
+                    }
+                    else
+                    {
+                        move = chess::Move::normal(from, to);
+                        break;
+                    }
+                }
+            }
+            case chess::PieceType::King:
+            {
+                const chess::CastlingRights ourCastlingRightsMask =
+                    sideToMove == chess::Color::White
+                    ? chess::CastlingRights::White
+                    : chess::CastlingRights::Black;
+
+                const chess::CastlingRights castlingRights = pos.castlingRights();
+
+                const chess::Bitboard attacks = chess::bb::pseudoAttacks<chess::PieceType::King>(from) & ~ourPieces;
+                const std::size_t attacksSize = attacks.count();
+                const std::size_t numCastlings = chess::intrin::popcount(ordinal(castlingRights & ourCastlingRightsMask));
+
+                const auto moveId = extractBitsLE8(usedBitsSafe(attacksSize + numCastlings));
+
+                if (moveId >= attacksSize)
+                {
+                    const std::size_t idx = moveId - attacksSize;
+
+                    const chess::CastleType castleType =
+                        idx == 0
+                        && chess::contains(castlingRights, chess::CastlingTraits::castlingRights[sideToMove][chess::CastleType::Long])
+                        ? chess::CastleType::Long
+                        : chess::CastleType::Short;
+
+                    move = chess::Move::castle(castleType, sideToMove);
+                    break;
+                }
+                else
+                {
+                    auto to = chess::Square(chess::nthSetBitIndex(attacks.bits(), moveId));
+                    move = chess::Move::normal(from, to);
+                    break;
+                }
+                break;
+            }
+            default:
+            {
+                const chess::Bitboard attacks = chess::bb::attacks(pt, from, occupied) & ~ourPieces;
+                const auto moveId = extractBitsLE8(usedBitsSafe(attacks.count()));
+                auto to = chess::Square(chess::nthSetBitIndex(attacks.bits(), moveId));
+                move = chess::Move::normal(from, to);
+                break;
+            }
+            }
+
+            score = m_lastScore + unsignedToSigned(extractVle16(scoreVleBlockSize));
+            m_lastScore = -score;
+
+            ++m_numReadPlies;
+
+            return {move, score};
+        }
+
+        [[nodiscard]] std::size_t numReadBytes()
+        {
+            return m_readOffset + (m_readBitsLeft != 8);
+        }
+
+    private:
+        std::size_t m_readBitsLeft = 8;
+        std::size_t m_readOffset = 0;
+        std::int16_t m_lastScore = 0;
+        std::uint16_t m_numReadPlies = 0;
+    };
+
+    struct PackedMoveScoreList
+    {
+        std::uint16_t numPlies = 0;
+        std::vector<unsigned char> movetext;
+
+        void clear(const TrainingDataEntry& e)
+        {
+            numPlies = 0;
+            movetext.clear();
+            m_bitsLeft = 0;
+            m_lastScore = -e.score;
+        }
+
+        void addBitsLE8(std::uint8_t bits, std::size_t count)
+        {
+            if (count == 0) return;
+
+            if (m_bitsLeft == 0)
+            {
+                movetext.emplace_back(bits << (8 - count));
+                m_bitsLeft = 8;
+            }
+            else if (count <= m_bitsLeft)
+            {
+                movetext.back() |= bits << (m_bitsLeft - count);
+            }
+            else
+            {
+                const auto spillCount = count - m_bitsLeft;
+                movetext.back() |= bits >> spillCount;
+                movetext.emplace_back(bits << (8 - spillCount));
+                m_bitsLeft += 8;
+            }
+
+            m_bitsLeft -= count;
+        }
+
+        void addBitsVle16(std::uint16_t v, std::size_t blockSize)
+        {
+            auto mask = (1 << blockSize) - 1;
+            for(;;)
+            {
+                std::uint8_t block = (v & mask) | ((v > mask) << blockSize);
+                addBitsLE8(block, blockSize + 1);
+                v >>= blockSize;
+                if (v == 0) break;
+            }
+        }
+
+
+        void addMoveScore(const chess::Position& pos, chess::Move move, std::int16_t score)
+        {
+            const chess::Color sideToMove = pos.sideToMove();
+            const chess::Bitboard ourPieces = pos.piecesBB(sideToMove);
+            const chess::Bitboard theirPieces = pos.piecesBB(!sideToMove);
+            const chess::Bitboard occupied = ourPieces | theirPieces;
+
+            const std::uint8_t pieceId = (pos.piecesBB(sideToMove) & chess::bb::before(move.from)).count();
+            std::size_t numMoves = 0;
+            int moveId = 0;
+            const auto pt = pos.pieceAt(move.from).type();
+            switch (pt)
+            {
+            case chess::PieceType::Pawn:
+            {
+                const chess::Rank secondToLastRank = pos.sideToMove() == chess::Color::White ? chess::rank7 : chess::rank2;
+                const chess::Rank startRank = pos.sideToMove() == chess::Color::White ? chess::rank2 : chess::rank7;
+                const auto forward = sideToMove == chess::Color::White ? chess::FlatSquareOffset(0, 1) : chess::FlatSquareOffset(0, -1);
+
+                const chess::Square epSquare = pos.epSquare();
+
+                chess::Bitboard attackTargets = theirPieces;
+                if (epSquare != chess::Square::none())
+                {
+                    attackTargets |= epSquare;
+                }
+
+                chess::Bitboard destinations = chess::bb::pawnAttacks(chess::Bitboard::square(move.from), sideToMove) & attackTargets;
+
+                const chess::Square sqForward = move.from + forward;
+                if (!occupied.isSet(sqForward))
+                {
+                    destinations |= sqForward;
+
+                    const chess::Square sqForward2 = sqForward + forward;
+                    if (
+                        move.from.rank() == startRank
+                        && !occupied.isSet(sqForward2)
+                        )
+                    {
+                        destinations |= sqForward2;
+                    }
+                }
+
+                moveId = (destinations & chess::bb::before(move.to)).count();
+                numMoves = destinations.count();
+                if (move.from.rank() == secondToLastRank)
+                {
+                    const auto promotionIndex = (ordinal(move.promotedPiece.type()) - ordinal(chess::PieceType::Knight));
+                    moveId = moveId * 4 + promotionIndex;
+                    numMoves *= 4;
+                }
+
+                break;
+            }
+            case chess::PieceType::King:
+            {
+                const chess::CastlingRights ourCastlingRightsMask =
+                    sideToMove == chess::Color::White
+                    ? chess::CastlingRights::White
+                    : chess::CastlingRights::Black;
+
+                const chess::CastlingRights castlingRights = pos.castlingRights();
+
+                const chess::Bitboard attacks = chess::bb::pseudoAttacks<chess::PieceType::King>(move.from) & ~ourPieces;
+                const auto attacksSize = attacks.count();
+                const auto numCastlingRights = chess::intrin::popcount(ordinal(castlingRights & ourCastlingRightsMask));
+
+                numMoves += attacksSize;
+                numMoves += numCastlingRights;
+
+                if (move.type == chess::MoveType::Castle)
+                {
+                    const auto longCastlingRights = chess::CastlingTraits::castlingRights[sideToMove][chess::CastleType::Long];
+
+                    moveId = attacksSize - 1;
+
+                    if (chess::contains(castlingRights, longCastlingRights))
+                    {
+                        // We have to add one no matter if it's the used one or not.
+                        moveId += 1;
+                    }
+
+                    if (chess::CastlingTraits::moveCastlingType(move) == chess::CastleType::Short)
+                    {
+                        moveId += 1;
+                    }
+                }
+                else
+                {
+                    moveId = (attacks & chess::bb::before(move.to)).count();
+                }
+                break;
+            }
+            default:
+            {
+                const chess::Bitboard attacks = chess::bb::attacks(pt, move.from, occupied) & ~ourPieces;
+
+                moveId = (attacks & chess::bb::before(move.to)).count();
+                numMoves = attacks.count();
+            }
+            }
+
+            const std::size_t numPieces = ourPieces.count();
+            addBitsLE8(pieceId, usedBitsSafe(numPieces));
+            addBitsLE8(moveId, usedBitsSafe(numMoves));
+
+            std::uint16_t scoreDelta = signedToUnsigned(score - m_lastScore);
+            addBitsVle16(scoreDelta, scoreVleBlockSize);
+            m_lastScore = -score;
+
+            ++numPlies;
+        }
+
+    private:
+        std::size_t m_bitsLeft = 0;
+        std::int16_t m_lastScore = 0;
+    };
+
+
+    [[nodiscard]] inline PackedTrainingDataEntry packEntry(const TrainingDataEntry& plain)
+    {
+        PackedTrainingDataEntry packed;
+
+        auto compressedPos = plain.pos.compress();
+        auto compressedMove = plain.move.compress();
+
+        static_assert(sizeof(compressedPos) + sizeof(compressedMove) + 6 == sizeof(PackedTrainingDataEntry));
+
+        std::size_t offset = 0;
+        compressedPos.writeToBigEndian(packed.bytes);
+        offset += sizeof(compressedPos);
+        compressedMove.writeToBigEndian(packed.bytes + offset);
+        offset += sizeof(compressedMove);
+        std::uint16_t pr = plain.ply | (signedToUnsigned(plain.result) << 14);
+        packed.bytes[offset++] = signedToUnsigned(plain.score) >> 8;
+        packed.bytes[offset++] = signedToUnsigned(plain.score);
+        packed.bytes[offset++] = pr >> 8;
+        packed.bytes[offset++] = pr;
+        packed.bytes[offset++] = plain.pos.rule50Counter() >> 8;
+        packed.bytes[offset++] = plain.pos.rule50Counter();
+
+        return packed;
+    }
+
+    [[nodiscard]] inline TrainingDataEntry unpackEntry(const PackedTrainingDataEntry& packed)
+    {
+        TrainingDataEntry plain;
+
+        std::size_t offset = 0;
+        auto compressedPos = chess::CompressedPosition::readFromBigEndian(packed.bytes);
+        plain.pos = compressedPos.decompress();
+        offset += sizeof(compressedPos);
+        auto compressedMove = chess::CompressedMove::readFromBigEndian(packed.bytes + offset);
+        plain.move = compressedMove.decompress();
+        offset += sizeof(compressedMove);
+        plain.score = unsignedToSigned((packed.bytes[offset] << 8) | packed.bytes[offset+1]);
+        offset += 2;
+        std::uint16_t pr = (packed.bytes[offset] << 8) | packed.bytes[offset+1];
+        plain.ply = pr & 0x3FFF;
+        plain.pos.setPly(plain.ply);
+        plain.result = unsignedToSigned(pr >> 14);
+        offset += 2;
+        plain.pos.setRule50Counter((packed.bytes[offset] << 8) | packed.bytes[offset+1]);
+
+        return plain;
+    }
+
+    struct CompressedTrainingDataEntryWriter
+    {
+        static constexpr std::size_t chunkSize = suggestedChunkSize;
+
+        CompressedTrainingDataEntryWriter(std::string path, std::ios_base::openmode om = std::ios_base::app) :
+            m_outputFile(path, om),
+            m_lastEntry{},
+            m_movelist{},
+            m_packedSize(0),
+            m_packedEntries(chunkSize + maxMovelistSize),
+            m_isFirst(true)
+        {
+            m_lastEntry.ply = 0xFFFF; // so it's never a continuation
+            m_lastEntry.result = 0x7FFF;
+        }
+
+        void addTrainingDataEntry(const TrainingDataEntry& e)
+        {
+            bool isCont = isContinuation(m_lastEntry, e);
+            if (isCont)
+            {
+                // add to movelist
+                m_movelist.addMoveScore(e.pos, e.move, e.score);
+            }
+            else
+            {
+                if (!m_isFirst)
+                {
+                    writeMovelist();
+                }
+
+                if (m_packedSize >= chunkSize)
+                {
+                    m_outputFile.append(m_packedEntries.data(), m_packedSize);
+                    m_packedSize = 0;
+                }
+
+                auto packed = packEntry(e);
+                std::memcpy(m_packedEntries.data() + m_packedSize, &packed, sizeof(PackedTrainingDataEntry));
+                m_packedSize += sizeof(PackedTrainingDataEntry);
+
+                m_movelist.clear(e);
+
+                m_isFirst = false;
+            }
+
+            m_lastEntry = e;
+        }
+
+        ~CompressedTrainingDataEntryWriter()
+        {
+            if (m_packedSize > 0)
+            {
+                if (!m_isFirst)
+                {
+                    writeMovelist();
+                }
+
+                m_outputFile.append(m_packedEntries.data(), m_packedSize);
+                m_packedSize = 0;
+            }
+        }
+
+    private:
+        CompressedTrainingDataFile m_outputFile;
+        TrainingDataEntry m_lastEntry;
+        PackedMoveScoreList m_movelist;
+        std::size_t m_packedSize;
+        std::vector<char> m_packedEntries;
+        bool m_isFirst;
+
+        void writeMovelist()
+        {
+            m_packedEntries[m_packedSize++] = m_movelist.numPlies >> 8;
+            m_packedEntries[m_packedSize++] = m_movelist.numPlies;
+            if (m_movelist.numPlies > 0)
+            {
+                std::memcpy(m_packedEntries.data() + m_packedSize, m_movelist.movetext.data(), m_movelist.movetext.size());
+                m_packedSize += m_movelist.movetext.size();
+            }
+        };
+    };
+
+    struct CompressedTrainingDataEntryReader
+    {
+        static constexpr std::size_t chunkSize = suggestedChunkSize;
+
+        CompressedTrainingDataEntryReader(std::string path, std::ios_base::openmode om = std::ios_base::app) :
+            m_inputFile(path, om),
+            m_chunk(),
+            m_movelistReader(std::nullopt),
+            m_offset(0),
+            m_isEnd(false)
+        {
+            if (!m_inputFile.hasNextChunk())
+            {
+                m_isEnd = true;
+            }
+            else
+            {
+                m_chunk = m_inputFile.readNextChunk();
+            }
+        }
+
+        [[nodiscard]] bool hasNext()
+        {
+            return !m_isEnd;
+        }
+
+        [[nodiscard]] TrainingDataEntry next()
+        {
+            if (m_movelistReader.has_value())
+            {
+                const auto e = m_movelistReader->nextEntry();
+
+                if (!m_movelistReader->hasNext())
+                {
+                    m_offset += m_movelistReader->numReadBytes();
+                    m_movelistReader.reset();
+
+                    fetchNextChunkIfNeeded();
+                }
+
+                return e;
+            }
+
+            PackedTrainingDataEntry packed;
+            std::memcpy(&packed, m_chunk.data() + m_offset, sizeof(PackedTrainingDataEntry));
+            m_offset += sizeof(PackedTrainingDataEntry);
+
+            const std::uint16_t numPlies = (m_chunk[m_offset] << 8) | m_chunk[m_offset + 1];
+            m_offset += 2;
+
+            const auto e = unpackEntry(packed);
+
+            if (numPlies > 0)
+            {
+                m_movelistReader.emplace(e, reinterpret_cast<unsigned char*>(m_chunk.data()) + m_offset, numPlies);
+            }
+            else
+            {
+                fetchNextChunkIfNeeded();
+            }
+
+            return e;
+        }
+
+    private:
+        CompressedTrainingDataFile m_inputFile;
+        std::vector<unsigned char> m_chunk;
+        std::optional<PackedMoveScoreListReader> m_movelistReader;
+        std::size_t m_offset;
+        bool m_isEnd;
+
+        void fetchNextChunkIfNeeded()
+        {
+            if (m_offset + sizeof(PackedTrainingDataEntry) + 2 > m_chunk.size())
+            {
+                if (m_inputFile.hasNextChunk())
+                {
+                    m_chunk = m_inputFile.readNextChunk();
+                    m_offset = 0;
+                }
+                else
+                {
+                    m_isEnd = true;
+                }
+            }
+        }
+    };
+
+    inline void emitPlainEntry(std::string& buffer, const TrainingDataEntry& plain)
+    {
+        buffer += "fen ";
+        buffer += plain.pos.fen();
+        buffer += '\n';
+
+        buffer += "move ";
+        buffer += chess::uci::moveToUci(plain.pos, plain.move);
+        buffer += '\n';
+
+        buffer += "score ";
+        buffer += std::to_string(plain.score);
+        buffer += '\n';
+
+        buffer += "ply ";
+        buffer += std::to_string(plain.ply);
+        buffer += '\n';
+
+        buffer += "result ";
+        buffer += std::to_string(plain.result);
+        buffer += "\ne\n";
+    }
+
+    inline void emitBinEntry(std::vector<char>& buffer, const TrainingDataEntry& plain)
+    {
+        auto psv = trainingDataEntryToPackedSfenValue(plain);
+        const char* data = reinterpret_cast<const char*>(&psv);
+        buffer.insert(buffer.end(), data, data+sizeof(psv));
+    }
+
+    inline void convertPlainToBinpack(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
+    {
+        constexpr std::size_t reportEveryNPositions = 100'000;
+
+        std::cout << "Compressing " << inputPath << " to " << outputPath << '\n';
+
+        CompressedTrainingDataEntryWriter writer(outputPath, om);
+        TrainingDataEntry e;
+
+        std::string key;
+        std::string value;
+        std::string move;
+
+        std::ifstream inputFile(inputPath);
+        const auto base = inputFile.tellg();
+        std::size_t numProcessedPositions = 0;
+
+        for(;;)
+        {
+            inputFile >> key;
+            if (!inputFile)
+            {
+                break;
+            }
+
+            if (key == "e"sv)
+            {
+                e.move = chess::uci::uciToMove(e.pos, move);
+
+                writer.addTrainingDataEntry(e);
+
+                ++numProcessedPositions;
+                const auto cur = inputFile.tellg();
+                if (numProcessedPositions % reportEveryNPositions == 0)
+                {
+                    std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+                }
+
+                continue;
+            }
+
+            inputFile >> std::ws;
+            std::getline(inputFile, value, '\n');
+
+            if (key == "fen"sv) e.pos = chess::Position::fromFen(value.c_str());
+            if (key == "move"sv) move = value;
+            if (key == "score"sv) e.score = std::stoi(value);
+            if (key == "ply"sv) e.ply = std::stoi(value);
+            if (key == "result"sv) e.result = std::stoi(value);
+        }
+    }
+
+    inline void convertBinpackToPlain(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
+    {
+        constexpr std::size_t bufferSize = MiB;
+
+        std::cout << "Decompressing " << inputPath << " to " << outputPath << '\n';
+
+        CompressedTrainingDataEntryReader reader(inputPath);
+        std::ofstream outputFile(outputPath, om);
+        const auto base = outputFile.tellp();
+        std::size_t numProcessedPositions = 0;
+        std::string buffer;
+        buffer.reserve(bufferSize * 2);
+
+        while(reader.hasNext())
+        {
+            emitPlainEntry(buffer, reader.next());
+
+            ++numProcessedPositions;
+
+            if (buffer.size() > bufferSize)
+            {
+                outputFile << buffer;
+                buffer.clear();
+
+                const auto cur = outputFile.tellp();
+                std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+            }
+        }
+
+        if (!buffer.empty())
+        {
+            outputFile << buffer;
+
+            const auto cur = outputFile.tellp();
+            std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+        }
+    }
+
+
+    inline void convertBinToBinpack(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
+    {
+        constexpr std::size_t reportEveryNPositions = 100'000;
+
+        std::cout << "Compressing " << inputPath << " to " << outputPath << '\n';
+
+        CompressedTrainingDataEntryWriter writer(outputPath, om);
+        TrainingDataEntry e;
+
+        std::string key;
+        std::string value;
+        std::string move;
+
+        std::ifstream inputFile(inputPath, std::ios_base::binary);
+        const auto base = inputFile.tellg();
+        std::size_t numProcessedPositions = 0;
+
+        nodchip::PackedSfenValue psv;
+        for(;;)
+        {
+            inputFile.read(reinterpret_cast<char*>(&psv), sizeof(psv));
+            if (inputFile.gcount() != 40)
+            {
+                break;
+            }
+
+            writer.addTrainingDataEntry(packedSfenValueToTrainingDataEntry(psv));
+
+            ++numProcessedPositions;
+            const auto cur = inputFile.tellg();
+            if (numProcessedPositions % reportEveryNPositions == 0)
+            {
+                std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+            }
+        }
+    }
+
+    inline void convertBinpackToBin(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
+    {
+        constexpr std::size_t bufferSize = MiB;
+
+        std::cout << "Decompressing " << inputPath << " to " << outputPath << '\n';
+
+        CompressedTrainingDataEntryReader reader(inputPath);
+        std::ofstream outputFile(outputPath, std::ios_base::binary | om);
+        const auto base = outputFile.tellp();
+        std::size_t numProcessedPositions = 0;
+        std::vector<char> buffer;
+        buffer.reserve(bufferSize * 2);
+
+        while(reader.hasNext())
+        {
+            emitBinEntry(buffer, reader.next());
+
+            ++numProcessedPositions;
+
+            if (buffer.size() > bufferSize)
+            {
+                outputFile.write(buffer.data(), buffer.size());
+                buffer.clear();
+
+                const auto cur = outputFile.tellp();
+                std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+            }
+        }
+
+        if (!buffer.empty())
+        {
+            outputFile.write(buffer.data(), buffer.size());
+
+            const auto cur = outputFile.tellp();
+            std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+        }
+    }
+
+    inline void convertBinToPlain(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
+    {
+        constexpr std::size_t reportEveryNPositions = 100'000;
+        constexpr std::size_t bufferSize = MiB;
+
+        std::cout << "Converting " << inputPath << " to " << outputPath << '\n';
+
+        TrainingDataEntry e;
+
+        std::string key;
+        std::string value;
+        std::string move;
+
+        std::ifstream inputFile(inputPath, std::ios_base::binary);
+        const auto base = inputFile.tellg();
+        std::size_t numProcessedPositions = 0;
+
+        std::ofstream outputFile(outputPath, om);
+        std::string buffer;
+        buffer.reserve(bufferSize * 2);
+
+        nodchip::PackedSfenValue psv;
+        for(;;)
+        {
+            inputFile.read(reinterpret_cast<char*>(&psv), sizeof(psv));
+            if (inputFile.gcount() != 40)
+            {
+                break;
+            }
+
+            emitPlainEntry(buffer, packedSfenValueToTrainingDataEntry(psv));
+
+            ++numProcessedPositions;
+
+            if (buffer.size() > bufferSize)
+            {
+                outputFile << buffer;
+                buffer.clear();
+
+                const auto cur = outputFile.tellp();
+                std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+            }
+        }
+
+        if (!buffer.empty())
+        {
+            outputFile << buffer;
+
+            const auto cur = outputFile.tellp();
+            std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+        }
+    }
+
+    inline void convertPlainToBin(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
+    {
+        constexpr std::size_t reportEveryNPositions = 100'000;
+        constexpr std::size_t bufferSize = MiB;
+
+        std::cout << "Compressing " << inputPath << " to " << outputPath << '\n';
+
+        std::ofstream outputFile(outputPath, std::ios_base::binary | om);
+        std::vector<char> buffer;
+        buffer.reserve(bufferSize * 2);
+
+        TrainingDataEntry e;
+
+        std::string key;
+        std::string value;
+        std::string move;
+
+        std::ifstream inputFile(inputPath);
+        const auto base = inputFile.tellg();
+        std::size_t numProcessedPositions = 0;
+
+        for(;;)
+        {
+            inputFile >> key;
+            if (!inputFile)
+            {
+                break;
+            }
+
+            if (key == "e"sv)
+            {
+                e.move = chess::uci::uciToMove(e.pos, move);
+
+                emitBinEntry(buffer, e);
+
+                ++numProcessedPositions;
+
+                if (buffer.size() > bufferSize)
+                {
+                    outputFile.write(buffer.data(), buffer.size());
+                    buffer.clear();
+
+                    const auto cur = outputFile.tellp();
+                    std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+                }
+
+                continue;
+            }
+
+            inputFile >> std::ws;
+            std::getline(inputFile, value, '\n');
+
+            if (key == "fen"sv) e.pos = chess::Position::fromFen(value.c_str());
+            if (key == "move"sv) move = value;
+            if (key == "score"sv) e.score = std::stoi(value);
+            if (key == "ply"sv) e.ply = std::stoi(value);
+            if (key == "result"sv) e.result = std::stoi(value);
+        }
+
+        if (!buffer.empty())
+        {
+            outputFile.write(buffer.data(), buffer.size());
+
+            const auto cur = outputFile.tellp();
+            std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 84feabb0..530c660b 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -11,6 +11,8 @@
 #include "learn.h"
 #include "multi_think.h"
 
+#include "../extra/nnue_data_binpack_format.h"
+
 #include <chrono>
 #include <climits>
 #include <cmath>
@@ -32,6 +34,12 @@ using namespace std;
 
 namespace Learner
 {
+    enum struct SfenOutputType
+    {
+        Bin,
+        Binpack
+    };
+
     static bool write_out_draw_game_in_training_data_generation = false;
     static bool detect_draw_by_consecutive_low_score = false;
     static bool detect_draw_by_insufficient_mating_material = false;
@@ -42,6 +50,94 @@ namespace Learner
     // https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
     extern bool use_raw_nnue_eval;
 
+    static SfenOutputType sfen_output_type = SfenOutputType::Bin;
+
+    static bool ends_with(const std::string& lhs, const std::string& end)
+    {
+        if (end.size() > lhs.size()) return false;
+
+        return std::equal(end.rbegin(), end.rend(), lhs.rbegin());
+    }
+
+    static std::string filename_with_extension(const std::string& filename, const std::string& ext)
+    {
+        if (ends_with(filename, ext))
+        {
+            return filename;
+        }
+        else
+        {
+            return filename + "." + ext;
+        }
+    }
+
+    struct BasicSfenOutputStream
+    {
+        virtual void write(const PSVector& sfens) = 0;
+        virtual ~BasicSfenOutputStream() {}
+    };
+
+    struct BinSfenOutputStream : BasicSfenOutputStream
+    {
+        static constexpr auto openmode = ios::out | ios::binary | ios::app;
+        static inline const std::string extension = "bin";
+
+        BinSfenOutputStream(std::string filename) :
+            m_stream(filename_with_extension(filename, extension), openmode)
+        {
+        }
+
+        void write(const PSVector& sfens) override
+        {
+            m_stream.write(reinterpret_cast<const char*>(sfens.data()), sizeof(PackedSfenValue) * sfens.size());
+        }
+
+        ~BinSfenOutputStream() override {}
+
+    private:
+        fstream m_stream;
+    };
+
+    struct BinpackSfenOutputStream : BasicSfenOutputStream
+    {
+        static constexpr auto openmode = ios::out | ios::binary | ios::app;
+        static inline const std::string extension = "binpack";
+
+        BinpackSfenOutputStream(std::string filename) :
+            m_stream(filename_with_extension(filename, extension), openmode)
+        {
+        }
+
+        void write(const PSVector& sfens) override
+        {
+            static_assert(sizeof(binpack::nodchip::PackedSfenValue) == sizeof(PackedSfenValue));
+
+            for(auto& sfen : sfens)
+            {
+                // The library uses a type that's different but layout-compatibile.
+                binpack::nodchip::PackedSfenValue e;
+                std::memcpy(&e, &sfen, sizeof(binpack::nodchip::PackedSfenValue));
+                m_stream.addTrainingDataEntry(binpack::packedSfenValueToTrainingDataEntry(e));
+            }
+        }
+
+        ~BinpackSfenOutputStream() override {}
+
+    private:
+        binpack::CompressedTrainingDataEntryWriter m_stream;
+    };
+
+    static std::unique_ptr<BasicSfenOutputStream> create_new_sfen_output(const std::string& filename)
+    {
+        switch(sfen_output_type)
+        {
+            case SfenOutputType::Bin:
+                return std::make_unique<BinSfenOutputStream>(filename);
+            default:
+                return std::make_unique<BinpackSfenOutputStream>(filename);
+        }
+    }
+
     // Helper class for exporting Sfen
     struct SfenWriter
     {
@@ -58,7 +154,7 @@ namespace Learner
             sfen_buffers_pool.reserve((size_t)thread_num * 10);
             sfen_buffers.resize(thread_num);
 
-            output_file_stream.open(filename_, ios::out | ios::binary | ios::app);
+            output_file_stream = create_new_sfen_output(filename_);
             filename = filename_;
 
             finished = false;
@@ -68,7 +164,7 @@ namespace Learner
         {
             finished = true;
             file_worker_thread.join();
-            output_file_stream.close();
+            output_file_stream.reset();
 
 #if defined(_DEBUG)
             {
@@ -137,9 +233,6 @@ namespace Learner
             {
                 // Also output the current time to console.
                 sync_cout << endl << sfen_write_count << " sfens , at " << now_string() << sync_endl;
-
-                // This is enough for flush().
-                output_file_stream.flush();
             };
 
             while (!finished || sfen_buffers_pool.size())
@@ -163,7 +256,7 @@ namespace Learner
                 {
                     for (auto& buf : buffers)
                     {
-                        output_file_stream.write(reinterpret_cast<const char*>(buf->data()), sizeof(PackedSfenValue) * buf->size());
+                        output_file_stream->write(*buf);
 
                         sfen_write_count += buf->size();
 
@@ -174,8 +267,6 @@ namespace Learner
                         {
                             sfen_write_count_current_file = 0;
 
-                            output_file_stream.close();
-
                             // Sequential number attached to the file
                             int n = (int)(sfen_write_count / save_every);
 
@@ -183,7 +274,7 @@ namespace Learner
                             // Add ios::app in consideration of overwriting.
                             // (Depending on the operation, it may not be necessary.)
                             string new_filename = filename + "_" + std::to_string(n);
-                            output_file_stream.open(new_filename, ios::out | ios::binary | ios::app);
+                            output_file_stream = create_new_sfen_output(new_filename);
                             cout << endl << "output sfen file = " << new_filename << endl;
                         }
 
@@ -217,7 +308,7 @@ namespace Learner
 
     private:
 
-        fstream output_file_stream;
+        std::unique_ptr<BasicSfenOutputStream> output_file_stream;
 
         // A new net is saved after every save_every sfens are processed.
         uint64_t save_every = std::numeric_limits<uint64_t>::max();
@@ -951,7 +1042,7 @@ namespace Learner
         int write_maxply = 400;
 
         // File name to write
-        string output_file_name = "generated_kifu.bin";
+        string output_file_name = "generated_kifu";
 
         string token;
 
@@ -962,6 +1053,8 @@ namespace Learner
         // Add a random number to the end of the file name.
         bool random_file_name = false;
 
+        std::string sfen_format;
+
         while (true)
         {
             token = "";
@@ -1017,10 +1110,24 @@ namespace Learner
                 is >> detect_draw_by_insufficient_mating_material;
             else if (token == "use_raw_nnue_eval")
                 is >> use_raw_nnue_eval;
+            else if (token == "sfen_format")
+                is >> sfen_format;
             else
                 cout << "Error! : Illegal token " << token << endl;
         }
 
+        if (!sfen_format.empty())
+        {
+            if (sfen_format == "bin")
+                sfen_output_type = SfenOutputType::Bin;
+            else if (sfen_format == "binpack")
+                sfen_output_type = SfenOutputType::Binpack;
+            else
+            {
+                cout << "Unknown sfen format `" << sfen_format << "`. Using bin\n";
+            }
+        }
+
         // If search depth2 is not set, leave it the same as search depth.
         if (search_depth_max == INT_MIN)
             search_depth_max = search_depth_min;

From 6b76ebc2ca3b66003424d73f8561fb4906657fde Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 10 Sep 2020 00:31:38 +0200
Subject: [PATCH 086/398] Support for binpack format in sfenreader in learner.
 Automatically detect file extension and choose the correct reader (bin or
 binpack)

---
 src/extra/nnue_data_binpack_format.h | 231 +++++-----------
 src/learn/gensfen.cpp                |   5 +-
 src/learn/learner.cpp                | 396 ++++++++++++++++++---------
 3 files changed, 328 insertions(+), 304 deletions(-)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index 9f810a3b..bec0e9ad 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -2745,9 +2745,6 @@ namespace chess
                 0x000200282410A102ull,
                 0x4048240043802106ull
                     } };
-            alignas(64) extern EnumArray<Square, Bitboard> g_rookMasks;
-            alignas(64) extern EnumArray<Square, std::uint8_t> g_rookShifts;
-            alignas(64) extern EnumArray<Square, const Bitboard*> g_rookAttacks;
 
             alignas(64) constexpr EnumArray<Square, std::uint64_t> g_bishopMagics{ {
                 0x40106000A1160020ull,
@@ -2815,9 +2812,17 @@ namespace chess
                 0x0300404822C08200ull,
                 0x48081010008A2A80ull
             } };
-            alignas(64) extern EnumArray<Square, Bitboard> g_bishopMasks;
-            alignas(64) extern EnumArray<Square, std::uint8_t> g_bishopShifts;
-            alignas(64) extern EnumArray<Square, const Bitboard*> g_bishopAttacks;
+
+            alignas(64) static EnumArray<Square, Bitboard> g_rookMasks;
+            alignas(64) static EnumArray<Square, std::uint8_t> g_rookShifts;
+            alignas(64) static EnumArray<Square, const Bitboard*> g_rookAttacks;
+
+            alignas(64) static EnumArray<Square, Bitboard> g_bishopMasks;
+            alignas(64) static EnumArray<Square, std::uint8_t> g_bishopShifts;
+            alignas(64) static EnumArray<Square, const Bitboard*> g_bishopAttacks;
+
+            alignas(64) static std::array<Bitboard, 102400> g_allRookAttacks;
+            alignas(64) static std::array<Bitboard, 5248> g_allBishopAttacks;
 
             inline Bitboard bishopAttacks(Square s, Bitboard occupied)
             {
@@ -3402,17 +3407,6 @@ namespace chess
                 Bishop
             };
 
-            alignas(64) EnumArray<Square, Bitboard> g_rookMasks;
-            alignas(64) EnumArray<Square, std::uint8_t> g_rookShifts;
-            alignas(64) EnumArray<Square, const Bitboard*> g_rookAttacks;
-
-            alignas(64) EnumArray<Square, Bitboard> g_bishopMasks;
-            alignas(64) EnumArray<Square, std::uint8_t> g_bishopShifts;
-            alignas(64) EnumArray<Square, const Bitboard*> g_bishopAttacks;
-
-            alignas(64) static std::array<Bitboard, 102400> g_allRookAttacks;
-            alignas(64) static std::array<Bitboard, 5248> g_allBishopAttacks;
-
             template <MagicsType TypeV>
             [[nodiscard]] inline Bitboard slidingAttacks(Square sq, Bitboard occupied)
             {
@@ -3857,7 +3851,7 @@ namespace chess
             return true;
         }
 
-        [[nodiscard]] std::string fen() const;
+        [[nodiscard]] inline std::string fen() const;
 
         [[nodiscard]] inline bool trySet(std::string_view boardState)
         {
@@ -4093,7 +4087,7 @@ namespace chess
 
         // returns captured piece
         // doesn't check validity
-        FORCEINLINE constexpr Piece doMove(Move move)
+        inline constexpr Piece doMove(Move move)
         {
             if (move.type == MoveType::Normal)
             {
@@ -4132,7 +4126,7 @@ namespace chess
             return doMoveColdPath(move);
         }
 
-        NOINLINE constexpr Piece doMoveColdPath(Move move)
+        inline constexpr Piece doMoveColdPath(Move move)
         {
             if (move.type == MoveType::Promotion)
             {
@@ -4333,38 +4327,38 @@ namespace chess
 
         // Returns whether a given square is attacked by any piece
         // of `attackerColor` side.
-        [[nodiscard]] bool isSquareAttacked(Square sq, Color attackerColor) const;
+        [[nodiscard]] inline bool isSquareAttacked(Square sq, Color attackerColor) const;
 
         // Returns whether a given square is attacked by any piece
         // of `attackerColor` side after `move` is made.
         // Move must be pseudo legal.
-        [[nodiscard]] bool isSquareAttackedAfterMove(Move move, Square sq, Color attackerColor) const;
+        [[nodiscard]] inline bool isSquareAttackedAfterMove(Move move, Square sq, Color attackerColor) const;
 
         // Move must be pseudo legal.
         // Must not be a king move.
-        [[nodiscard]] bool createsDiscoveredAttackOnOwnKing(Move move) const;
+        [[nodiscard]] inline bool createsDiscoveredAttackOnOwnKing(Move move) const;
 
         // Returns whether a piece on a given square is attacked
         // by any enemy piece. False if square is empty.
-        [[nodiscard]] bool isPieceAttacked(Square sq) const;
+        [[nodiscard]] inline bool isPieceAttacked(Square sq) const;
 
         // Returns whether a piece on a given square is attacked
         // by any enemy piece after `move` is made. False if square is empty.
         // Move must be pseudo legal.
-        [[nodiscard]] bool isPieceAttackedAfterMove(Move move, Square sq) const;
+        [[nodiscard]] inline bool isPieceAttackedAfterMove(Move move, Square sq) const;
 
         // Returns whether the king of the moving side is attacked
         // by any enemy piece after a move is made.
         // Move must be pseudo legal.
-        [[nodiscard]] bool isOwnKingAttackedAfterMove(Move move) const;
+        [[nodiscard]] inline bool isOwnKingAttackedAfterMove(Move move) const;
 
         // Return a bitboard with all (pseudo legal) attacks by the piece on
         // the given square. Empty if no piece on the square.
-        [[nodiscard]] Bitboard attacks(Square sq) const;
+        [[nodiscard]] inline Bitboard attacks(Square sq) const;
 
         // Returns a bitboard with all squared that have pieces
         // that attack a given square (pseudo legally)
-        [[nodiscard]] Bitboard attackers(Square sq, Color attackerColor) const;
+        [[nodiscard]] inline Bitboard attackers(Square sq, Color attackerColor) const;
 
         [[nodiscard]] constexpr Piece pieceAt(Square sq) const
         {
@@ -4438,20 +4432,6 @@ namespace chess
 
     struct Position;
 
-    struct MoveLegalityChecker
-    {
-        MoveLegalityChecker(const Position& position);
-
-        [[nodiscard]] bool isPseudoLegalMoveLegal(const Move& move) const;
-
-    private:
-        const Position* m_position;
-        Bitboard m_checkers;
-        Bitboard m_ourBlockersForKing;
-        Bitboard m_potentialCheckRemovals;
-        Square m_ksq;
-    };
-
     struct CompressedPosition;
 
     struct PositionHash128
@@ -4484,20 +4464,20 @@ namespace chess
         {
         }
 
-        void set(std::string_view fen);
+        inline void set(std::string_view fen);
 
         // Returns false if the fen was not valid
         // If the returned value was false the position
         // is in unspecified state.
-        [[nodiscard]] bool trySet(std::string_view fen);
+        [[nodiscard]] inline bool trySet(std::string_view fen);
 
-        [[nodiscard]] static Position fromFen(std::string_view fen);
+        [[nodiscard]] static inline Position fromFen(std::string_view fen);
 
-        [[nodiscard]] static std::optional<Position> tryFromFen(std::string_view fen);
+        [[nodiscard]] static inline std::optional<Position> tryFromFen(std::string_view fen);
 
-        [[nodiscard]] static Position startPosition();
+        [[nodiscard]] static inline Position startPosition();
 
-        [[nodiscard]] std::string fen() const;
+        [[nodiscard]] inline std::string fen() const;
 
         constexpr void setEpSquareUnchecked(Square sq)
         {
@@ -4535,7 +4515,7 @@ namespace chess
             m_ply = ply;
         }
 
-        ReverseMove doMove(const Move& move);
+        inline ReverseMove doMove(const Move& move);
 
         constexpr void undoMove(const ReverseMove& reverseMove)
         {
@@ -4559,49 +4539,44 @@ namespace chess
             return m_sideToMove;
         }
 
-        [[nodiscard]] std::uint8_t rule50Counter() const
+        [[nodiscard]] inline std::uint8_t rule50Counter() const
         {
             return m_rule50Counter;
         }
 
-        [[nodiscard]] std::uint16_t ply() const
+        [[nodiscard]] inline std::uint16_t ply() const
         {
             return m_ply;
         }
 
-        [[nodiscard]] std::uint16_t halfMove() const
+        [[nodiscard]] inline std::uint16_t halfMove() const
         {
             return (m_ply + 1) / 2;
         }
 
-        void setHalfMove(std::uint16_t hm)
+        inline void setHalfMove(std::uint16_t hm)
         {
             m_ply = 2 * hm - 1 + (m_sideToMove == Color::Black);
         }
 
-        [[nodiscard]] bool isCheck() const;
+        [[nodiscard]] inline bool isCheck() const;
 
-        [[nodiscard]] Bitboard checkers() const;
+        [[nodiscard]] inline Bitboard checkers() const;
 
-        [[nodiscard]] bool isCheckAfterMove(Move move) const;
+        [[nodiscard]] inline bool isCheckAfterMove(Move move) const;
 
         // Checks whether ANY `move` is legal.
-        [[nodiscard]] bool isMoveLegal(Move move) const;
+        [[nodiscard]] inline bool isMoveLegal(Move move) const;
 
-        [[nodiscard]] bool isPseudoLegalMoveLegal(Move move) const;
+        [[nodiscard]] inline bool isPseudoLegalMoveLegal(Move move) const;
 
-        [[nodiscard]] bool isMovePseudoLegal(Move move) const;
+        [[nodiscard]] inline bool isMovePseudoLegal(Move move) const;
 
         // Returns all pieces that block a slider
         // from attacking our king. When two or more
         // pieces block a single slider then none
         // of these pieces are included.
-        [[nodiscard]] Bitboard blockersForKing(Color color) const;
-
-        [[nodiscard]] MoveLegalityChecker moveLegalityChecker() const
-        {
-            return { *this };
-        }
+        [[nodiscard]] inline Bitboard blockersForKing(Color color) const;
 
         [[nodiscard]] constexpr Square epSquare() const
         {
@@ -4637,7 +4612,7 @@ namespace chess
             return cpy;
         }
 
-        [[nodiscard]] Position afterMove(Move move) const;
+        [[nodiscard]] inline Position afterMove(Move move) const;
 
         [[nodiscard]] constexpr bool isEpPossible() const
         {
@@ -4655,11 +4630,11 @@ namespace chess
 
         static_assert(sizeof(Color) + sizeof(Square) + sizeof(CastlingRights) + sizeof(std::uint8_t) == 4);
 
-        [[nodiscard]] FORCEINLINE bool isEpPossible(Square epSquare, Color sideToMove) const;
+        [[nodiscard]] inline bool isEpPossible(Square epSquare, Color sideToMove) const;
 
-        [[nodiscard]] NOINLINE bool isEpPossibleColdPath(Square epSquare, Bitboard pawnsAttackingEpSquare, Color sideToMove) const;
+        [[nodiscard]] inline bool isEpPossibleColdPath(Square epSquare, Bitboard pawnsAttackingEpSquare, Color sideToMove) const;
 
-        void nullifyEpSquareIfNotPossible();
+        inline void nullifyEpSquareIfNotPossible();
     };
 
     struct CompressedPosition
@@ -5302,7 +5277,7 @@ namespace chess
         return allAttackers;
     }
 
-    const Piece* Board::piecesRaw() const
+    inline const Piece* Board::piecesRaw() const
     {
         return m_pieces.data();
     }
@@ -5330,7 +5305,7 @@ namespace chess
         }();
     }
 
-    [[nodiscard]] std::string Board::fen() const
+    [[nodiscard]] inline std::string Board::fen() const
     {
         std::string fen;
         fen.reserve(96); // longest fen is probably in range of around 88
@@ -5382,79 +5357,6 @@ namespace chess
         return fen;
     }
 
-    MoveLegalityChecker::MoveLegalityChecker(const Position& position) :
-        m_position(&position),
-        m_checkers(position.checkers()),
-        m_ourBlockersForKing(
-            position.blockersForKing(position.sideToMove())
-            & position.piecesBB(position.sideToMove())
-        ),
-        m_ksq(position.kingSquare(position.sideToMove()))
-    {
-        if (m_checkers.exactlyOne())
-        {
-            const Bitboard knightCheckers = m_checkers & bb::pseudoAttacks<PieceType::Knight>(m_ksq);
-            if (knightCheckers.any())
-            {
-                // We're checked by a knight, we have to remove it or move the king.
-                m_potentialCheckRemovals = knightCheckers;
-            }
-            else
-            {
-                // If we're not checked by a knight we can block it.
-                m_potentialCheckRemovals = bb::between(m_ksq, m_checkers.first()) | m_checkers;
-            }
-        }
-        else
-        {
-            // Double check, king has to move.
-            m_potentialCheckRemovals = Bitboard::none();
-        }
-    }
-
-    [[nodiscard]] bool MoveLegalityChecker::isPseudoLegalMoveLegal(const Move& move) const
-    {
-        const Piece movedPiece = m_position->pieceAt(move.from);
-
-        if (m_checkers.any())
-        {
-            if (move.from == m_ksq || move.type == MoveType::EnPassant)
-            {
-                return m_position->isPseudoLegalMoveLegal(move);
-            }
-            else
-            {
-                // This means there's only one check and we either
-                // blocked it or removed the piece that attacked
-                // our king. So the only threat is if it's a discovered check.
-                return
-                    m_potentialCheckRemovals.isSet(move.to)
-                    && !m_ourBlockersForKing.isSet(move.from);
-            }
-        }
-        else
-        {
-            if (move.from == m_ksq)
-            {
-                return m_position->isPseudoLegalMoveLegal(move);
-            }
-            else if (move.type == MoveType::EnPassant)
-            {
-                return !m_position->createsDiscoveredAttackOnOwnKing(move);
-            }
-            else if (m_ourBlockersForKing.isSet(move.from))
-            {
-                // If it was a blocker it may have only moved in line with our king.
-                // Otherwise it's a discovered check.
-                return bb::line(m_ksq, move.from).isSet(move.to);
-            }
-            else
-            {
-                return true;
-            }
-        }
-    }
-
     void Position::set(std::string_view fen)
     {
         (void)trySet(fen);
@@ -5611,7 +5513,7 @@ namespace chess
         }();
     }
 
-    ReverseMove Position::doMove(const Move& move)
+    inline ReverseMove Position::doMove(const Move& move)
     {
         assert(move.from.isOk() && move.to.isOk());
 
@@ -5649,12 +5551,12 @@ namespace chess
         return { move, captured, oldEpSquare, oldCastlingRights };
     }
 
-    [[nodiscard]] bool Position::isCheck() const
+    [[nodiscard]] inline bool Position::isCheck() const
     {
         return BaseType::isSquareAttacked(kingSquare(m_sideToMove), !m_sideToMove);
     }
 
-    [[nodiscard]] Bitboard Position::checkers() const
+    [[nodiscard]] inline Bitboard Position::checkers() const
     {
         return BaseType::attackers(kingSquare(m_sideToMove), !m_sideToMove);
     }
@@ -5664,7 +5566,7 @@ namespace chess
         return BaseType::isSquareAttackedAfterMove(move, kingSquare(!m_sideToMove), m_sideToMove);
     }
 
-    [[nodiscard]] Bitboard Position::blockersForKing(Color color) const
+    [[nodiscard]] inline Bitboard Position::blockersForKing(Color color) const
     {
         const Color attackerColor = !color;
 
@@ -5700,7 +5602,7 @@ namespace chess
         return allBlockers;
     }
 
-    [[nodiscard]] Position Position::afterMove(Move move) const
+    [[nodiscard]] inline Position Position::afterMove(Move move) const
     {
         Position cpy(*this);
         auto pc = cpy.doMove(move);
@@ -5711,7 +5613,7 @@ namespace chess
         return cpy;
     }
 
-    [[nodiscard]] FORCEINLINE bool Position::isEpPossible(Square epSquare, Color sideToMove) const
+    [[nodiscard]] inline bool Position::isEpPossible(Square epSquare, Color sideToMove) const
     {
         const Bitboard pawnsAttackingEpSquare =
             bb::pawnAttacks(Bitboard::square(epSquare), !sideToMove)
@@ -5725,7 +5627,7 @@ namespace chess
         return isEpPossibleColdPath(epSquare, pawnsAttackingEpSquare, sideToMove);
     }
 
-    [[nodiscard]] NOINLINE bool Position::isEpPossibleColdPath(Square epSquare, Bitboard pawnsAttackingEpSquare, Color sideToMove) const
+    [[nodiscard]] inline bool Position::isEpPossibleColdPath(Square epSquare, Bitboard pawnsAttackingEpSquare, Color sideToMove) const
     {
         // only set m_epSquare when it matters, ie. when
         // the opposite side can actually capture
@@ -5772,7 +5674,7 @@ namespace chess
         return false;
     }
 
-    void Position::nullifyEpSquareIfNotPossible()
+    inline void Position::nullifyEpSquareIfNotPossible()
     {
         if (m_epSquare != Square::none() && !isEpPossible(m_epSquare, m_sideToMove))
         {
@@ -5782,12 +5684,12 @@ namespace chess
 
     namespace uci
     {
-        [[nodiscard]] std::string moveToUci(const Position& pos, const Move& move);
-        [[nodiscard]] Move uciToMove(const Position& pos, std::string_view sv);
+        [[nodiscard]] inline std::string moveToUci(const Position& pos, const Move& move);
+        [[nodiscard]] inline Move uciToMove(const Position& pos, std::string_view sv);
 
-        [[nodiscard]] std::optional<Move> tryUciToMove(const Position& pos, std::string_view sv);
+        [[nodiscard]] inline std::optional<Move> tryUciToMove(const Position& pos, std::string_view sv);
 
-        [[nodiscard]] std::string moveToUci(const Position& pos, const Move& move)
+        [[nodiscard]] inline std::string moveToUci(const Position& pos, const Move& move)
         {
             std::string s;
 
@@ -5814,7 +5716,7 @@ namespace chess
             return s;
         }
 
-        [[nodiscard]] Move uciToMove(const Position& pos, std::string_view sv)
+        [[nodiscard]] inline Move uciToMove(const Position& pos, std::string_view sv)
         {
             const Square from = parser_bits::parseSquare(sv.data());
             const Square to = parser_bits::parseSquare(sv.data() + 2);
@@ -5850,7 +5752,7 @@ namespace chess
             }
         }
 
-        [[nodiscard]] std::optional<Move> tryUciToMove(const Position& pos, std::string_view sv)
+        [[nodiscard]] inline std::optional<Move> tryUciToMove(const Position& pos, std::string_view sv)
         {
             if (sv.size() < 4 || sv.size() > 5)
             {
@@ -6300,7 +6202,7 @@ namespace binpack
         };
 
 
-        [[nodiscard]] chess::Position pos_from_packed_sfen(const PackedSfen& sfen)
+        [[nodiscard]] inline chess::Position pos_from_packed_sfen(const PackedSfen& sfen)
         {
             SfenPacker packer;
             auto& stream = packer.stream;
@@ -6655,14 +6557,12 @@ namespace binpack
                 if (!occupied.isSet(sqForward))
                 {
                     destinations |= sqForward;
-
-                    const chess::Square sqForward2 = sqForward + forward;
                     if (
                         from.rank() == startRank
-                        && !occupied.isSet(sqForward2)
+                        && !occupied.isSet(sqForward + forward)
                         )
                     {
-                        destinations |= sqForward2;
+                        destinations |= sqForward + forward;
                     }
                 }
 
@@ -6845,13 +6745,12 @@ namespace binpack
                 {
                     destinations |= sqForward;
 
-                    const chess::Square sqForward2 = sqForward + forward;
                     if (
                         move.from.rank() == startRank
-                        && !occupied.isSet(sqForward2)
+                        && !occupied.isSet(sqForward + forward)
                         )
                     {
-                        destinations |= sqForward2;
+                        destinations |= sqForward + forward;
                     }
                 }
 
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 530c660b..99a783bb 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -133,9 +133,12 @@ namespace Learner
         {
             case SfenOutputType::Bin:
                 return std::make_unique<BinSfenOutputStream>(filename);
-            default:
+            case SfenOutputType::Binpack:
                 return std::make_unique<BinpackSfenOutputStream>(filename);
         }
+
+        assert(false);
+        return nullptr;
     }
 
     // Helper class for exporting Sfen
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 15f0825d..7cc04406 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -30,6 +30,8 @@
 #include "learn.h"
 #include "multi_think.h"
 
+#include "../extra/nnue_data_binpack_format.h"
+
 #include <chrono>
 #include <climits>
 #include <cmath>    // std::exp(),std::pow(),std::log()
@@ -85,8 +87,8 @@ namespace Learner
     static double dest_score_min_value = 0.0;
     static double dest_score_max_value = 1.0;
 
-    // Assume teacher signals are the scores of deep searches, 
-    // and convert them into winning probabilities in the trainer. 
+    // Assume teacher signals are the scores of deep searches,
+    // and convert them into winning probabilities in the trainer.
     // Sometimes we want to use the winning probabilities in the training
     // data directly. In those cases, we set false to this variable.
     static bool convert_teacher_signal_to_winning_probability = true;
@@ -125,19 +127,19 @@ namespace Learner
     // A function that converts the evaluation value to the winning rate [0,1]
     double winning_percentage(double value, int ply)
     {
-        if (use_wdl) 
+        if (use_wdl)
         {
             return winning_percentage_wdl(value, ply);
         }
-        else 
+        else
         {
             return winning_percentage(value);
         }
     }
 
     double calc_cross_entropy_of_winning_percentage(
-        double deep_win_rate, 
-        double shallow_eval, 
+        double deep_win_rate,
+        double shallow_eval,
         int ply)
     {
         const double p = deep_win_rate;
@@ -146,8 +148,8 @@ namespace Learner
     }
 
     double calc_d_cross_entropy_of_winning_percentage(
-        double deep_win_rate, 
-        double shallow_eval, 
+        double deep_win_rate,
+        double shallow_eval,
         int ply)
     {
         constexpr double epsilon = 0.000001;
@@ -158,7 +160,7 @@ namespace Learner
         const double y2 = calc_cross_entropy_of_winning_percentage(
             deep_win_rate, shallow_eval + epsilon, ply);
 
-        // Divide by the winning_probability_coefficient to 
+        // Divide by the winning_probability_coefficient to
         // match scale with the sigmoidal win rate
         return ((y2 - y1) / epsilon) / winning_probability_coefficient;
     }
@@ -195,7 +197,7 @@ namespace Learner
         const double scaled_teacher_signal = get_scaled_signal(teacher_signal);
 
         double p = scaled_teacher_signal;
-        if (convert_teacher_signal_to_winning_probability) 
+        if (convert_teacher_signal_to_winning_probability)
         {
             p = winning_percentage(scaled_teacher_signal, ply);
         }
@@ -217,7 +219,7 @@ namespace Learner
 
     double calculate_t(int game_result)
     {
-        // Use 1 as the correction term if the expected win rate is 1, 
+        // Use 1 as the correction term if the expected win rate is 1,
         // 0 if you lose, and 0.5 if you draw.
         // game_result = 1,0,-1 so add 1 and divide by 2.
         const double t = double(game_result + 1) * 0.5;
@@ -235,13 +237,13 @@ namespace Learner
         const double lambda = calculate_lambda(teacher_signal);
 
         double grad;
-        if (use_wdl) 
+        if (use_wdl)
         {
             const double dce_p = calc_d_cross_entropy_of_winning_percentage(p, shallow, psv.gamePly);
             const double dce_t = calc_d_cross_entropy_of_winning_percentage(t, shallow, psv.gamePly);
             grad = lambda * dce_p + (1.0 - lambda) * dce_t;
         }
-        else 
+        else
         {
             // Use the actual win rate as a correction term.
             // This is the idea of ​​elmo (WCSC27), modern O-parts.
@@ -252,18 +254,18 @@ namespace Learner
     }
 
     // Calculate cross entropy during learning
-    // The individual cross entropy of the win/loss term and win 
-    // rate term of the elmo expression is returned 
+    // The individual cross entropy of the win/loss term and win
+    // rate term of the elmo expression is returned
     // to the arguments cross_entropy_eval and cross_entropy_win.
     void calc_cross_entropy(
-        Value teacher_signal, 
-        Value shallow, 
+        Value teacher_signal,
+        Value shallow,
         const PackedSfenValue& psv,
-        double& cross_entropy_eval, 
-        double& cross_entropy_win, 
+        double& cross_entropy_eval,
+        double& cross_entropy_win,
         double& cross_entropy,
-        double& entropy_eval, 
-        double& entropy_win, 
+        double& entropy_eval,
+        double& entropy_win,
         double& entropy)
     {
         // Teacher winning probability.
@@ -292,24 +294,133 @@ namespace Learner
     }
 
     // Other objective functions may be considered in the future...
-    double calc_grad(Value shallow, const PackedSfenValue& psv) 
+    double calc_grad(Value shallow, const PackedSfenValue& psv)
     {
         return calc_grad((Value)psv.score, shallow, psv);
     }
 
+    struct BasicSfenInputStream
+    {
+        virtual std::optional<PackedSfenValue> next() = 0;
+        virtual bool eof() const = 0;
+        virtual ~BasicSfenInputStream() {}
+    };
+
+    struct BinSfenInputStream : BasicSfenInputStream
+    {
+        static constexpr auto openmode = ios::in | ios::binary;
+        static inline const std::string extension = "bin";
+
+        BinSfenInputStream(std::string filename) :
+            m_stream(filename, openmode),
+            m_eof(!m_stream)
+        {
+        }
+
+        std::optional<PackedSfenValue> next() override
+        {
+            PackedSfenValue e;
+            if(m_stream.read(reinterpret_cast<char*>(&e), sizeof(PackedSfenValue)))
+            {
+                return e;
+            }
+            else
+            {
+                m_eof = true;
+                return std::nullopt;
+            }
+        }
+
+        bool eof() const override
+        {
+            return m_eof;
+        }
+
+        ~BinSfenInputStream() override {}
+
+    private:
+        fstream m_stream;
+        bool m_eof;
+    };
+
+    struct BinpackSfenInputStream : BasicSfenInputStream
+    {
+        static constexpr auto openmode = ios::in | ios::binary;
+        static inline const std::string extension = "binpack";
+
+        BinpackSfenInputStream(std::string filename) :
+            m_stream(filename, openmode),
+            m_eof(!m_stream.hasNext())
+        {
+        }
+
+        std::optional<PackedSfenValue> next() override
+        {
+            static_assert(sizeof(binpack::nodchip::PackedSfenValue) == sizeof(PackedSfenValue));
+
+            if (!m_stream.hasNext())
+            {
+                m_eof = true;
+                return std::nullopt;
+            }
+
+            auto training_data_entry = m_stream.next();
+            auto v = binpack::trainingDataEntryToPackedSfenValue(training_data_entry);
+            PackedSfenValue psv;
+            // same layout, different types. One is from generic library.
+            std::memcpy(&psv, &v, sizeof(PackedSfenValue));
+
+            return psv;
+        }
+
+        bool eof() const override
+        {
+            return m_eof;
+        }
+
+        ~BinpackSfenInputStream() override {}
+
+    private:
+        binpack::CompressedTrainingDataEntryReader m_stream;
+        bool m_eof;
+    };
+
+    static bool ends_with(const std::string& lhs, const std::string& end)
+    {
+        if (end.size() > lhs.size()) return false;
+
+        return std::equal(end.rbegin(), end.rend(), lhs.rbegin());
+    }
+
+    static bool has_extension(const std::string& filename, const std::string& extension)
+    {
+        return ends_with(filename, "." + extension);
+    }
+
+    static std::unique_ptr<BasicSfenInputStream> open_sfen_input_file(const std::string& filename)
+    {
+        if (has_extension(filename, BinSfenInputStream::extension))
+            return std::make_unique<BinSfenInputStream>(filename);
+        else if (has_extension(filename, BinpackSfenInputStream::extension))
+            return std::make_unique<BinpackSfenInputStream>(filename);
+
+        assert(false);
+        return nullptr;
+    }
+
     // Sfen reader
     struct SfenReader
     {
         // Number of phases used for calculation such as mse
         // mini-batch size = 1M is standard, so 0.2% of that should be negligible in terms of time.
-        // Since search() is performed with depth = 1 in calculation of 
+        // Since search() is performed with depth = 1 in calculation of
         // move match rate, simple comparison is not possible...
         static constexpr uint64_t sfen_for_mse_size = 2000;
 
         // Number of phases buffered by each thread 0.1M phases. 4M phase at 40HT
         static constexpr size_t THREAD_BUFFER_SIZE = 10 * 1000;
 
-        // Buffer for reading files (If this is made larger, 
+        // Buffer for reading files (If this is made larger,
         // the shuffle becomes larger and the phases may vary.
         // If it is too large, the memory consumption will increase.
         // SFEN_READ_SIZE is a multiple of THREAD_BUFFER_SIZE.
@@ -322,7 +433,7 @@ namespace Learner
 
         // Do not use std::random_device().
         // Because it always the same integers on MinGW.
-        SfenReader(int thread_num) : 
+        SfenReader(int thread_num) :
             prng(std::chrono::system_clock::now().time_since_epoch().count())
         {
             packed_sfens.resize(thread_num);
@@ -369,13 +480,15 @@ namespace Learner
 
         void read_validation_set(const string& file_name, int eval_limit)
         {
-            ifstream input(file_name, ios::binary);
+            auto input = open_sfen_input_file(file_name);
 
-            while (input)
+            while(!input->eof())
             {
-                PackedSfenValue p;
-                if (input.read(reinterpret_cast<char*>(&p), sizeof(PackedSfenValue)))
+                std::optional<PackedSfenValue> p_opt = input->next();
+                if (p_opt.has_value())
                 {
+                    auto& p = *p_opt;
+
                     if (eval_limit < abs(p.score))
                         continue;
 
@@ -398,7 +511,7 @@ namespace Learner
             // then retrieve one and return it.
             auto& thread_ps = packed_sfens[thread_id];
 
-            // Fill the read buffer if there is no remaining buffer, 
+            // Fill the read buffer if there is no remaining buffer,
             // but if it doesn't even exist, finish.
             // If the buffer is empty, fill it.
             if ((thread_ps == nullptr || thread_ps->empty())
@@ -406,7 +519,7 @@ namespace Learner
                 return false;
 
             // read_to_thread_buffer_impl() returned true,
-            // Since the filling of the thread buffer with the 
+            // Since the filling of the thread buffer with the
             // phase has been completed successfully
             // thread_ps->rbegin() is alive.
 
@@ -458,33 +571,42 @@ namespace Learner
         // Start a thread that loads the phase file in the background.
         void start_file_read_worker()
         {
-            file_worker_thread = std::thread([&] { 
-                this->file_read_worker(); 
+            file_worker_thread = std::thread([&] {
+                this->file_read_worker();
                 });
         }
 
         void file_read_worker()
         {
             auto open_next_file = [&]() {
-                if (fs.is_open())
-                    fs.close();
-
                 // no more
-                if (filenames.empty())
-                    return false;
+                for(;;)
+                {
+                    sfen_input_stream.reset();
 
-                // Get the next file name.
-                string filename = filenames.back();
-                filenames.pop_back();
+                    if (filenames.empty())
+                        return false;
 
-                fs.open(filename, ios::in | ios::binary);
-                cout << "open filename = " << filename << endl;
+                    // Get the next file name.
+                    string filename = filenames.back();
+                    filenames.pop_back();
 
-                assert(fs);
+                    sfen_input_stream = open_sfen_input_file(filename);
+                    cout << "open filename = " << filename << endl;
 
-                return true;
+                    // in case the file is empty or was deleted.
+                    if (!sfen_input_stream->eof())
+                        return true;
+                }
             };
 
+            if (sfen_input_stream == nullptr && !open_next_file())
+            {
+                cout << "..end of files." << endl;
+                end_of_files = true;
+                return;
+            }
+
             while (true)
             {
                 // Wait for the buffer to run out.
@@ -501,10 +623,10 @@ namespace Learner
                 // Read from the file into the file buffer.
                 while (sfens.size() < SFEN_READ_SIZE)
                 {
-                    PackedSfenValue p;
-                    if (fs.read(reinterpret_cast<char*>(&p), sizeof(PackedSfenValue)))
+                    std::optional<PackedSfenValue> p = sfen_input_stream->next();
+                    if (p.has_value())
                     {
-                        sfens.push_back(p);
+                        sfens.push_back(*p);
                     }
                     else if(!open_next_file())
                     {
@@ -535,8 +657,8 @@ namespace Learner
                     auto buf = std::make_unique<PSVector>();
                     buf->resize(THREAD_BUFFER_SIZE);
                     memcpy(
-                        buf->data(), 
-                        &sfens[i * THREAD_BUFFER_SIZE], 
+                        buf->data(),
+                        &sfens[i * THREAD_BUFFER_SIZE],
                         sizeof(PackedSfenValue) * THREAD_BUFFER_SIZE);
 
                     buffers.emplace_back(std::move(buf));
@@ -545,7 +667,7 @@ namespace Learner
                 {
                     std::unique_lock<std::mutex> lk(mutex);
 
-                    // The mutex lock is required because the 
+                    // The mutex lock is required because the%
                     // contents of packed_sfens_pool are changed.
 
                     for (auto& buf : buffers)
@@ -600,7 +722,7 @@ namespace Learner
         atomic<bool> end_of_files;
 
         // handle of sfen file
-        std::fstream fs;
+        std::unique_ptr<BasicSfenInputStream> sfen_input_stream;
 
         // sfen for each thread
         // (When the thread is used up, the thread should call delete to release it.)
@@ -621,9 +743,9 @@ namespace Learner
     // Class to generate sfen with multiple threads
     struct LearnerThink : public MultiThink
     {
-        LearnerThink(SfenReader& sr_) : 
-            sr(sr_), 
-            stop_flag(false), 
+        LearnerThink(SfenReader& sr_) :
+            sr(sr_),
+            stop_flag(false),
             save_only_once(false)
         {
             learn_sum_cross_entropy_eval = 0.0;
@@ -644,9 +766,9 @@ namespace Learner
         virtual void thread_worker(size_t thread_id);
 
         // Start a thread that loads the phase file in the background.
-        void start_file_read_worker() 
-        { 
-            sr.start_file_read_worker(); 
+        void start_file_read_worker()
+        {
+            sr.start_file_read_worker();
         }
 
         Value get_shallow_value(Position& task_pos);
@@ -674,7 +796,7 @@ namespace Learner
         // Option not to learn kk/kkp/kpp/kppp
         std::array<bool, 4> freeze;
 
-        // If the absolute value of the evaluation value of the deep search 
+        // If the absolute value of the evaluation value of the deep search
         // of the teacher phase exceeds this value, discard the teacher phase.
         int eval_limit;
 
@@ -742,7 +864,7 @@ namespace Learner
 
     void LearnerThink::calc_loss(size_t thread_id, uint64_t done)
     {
-        // There is no point in hitting the replacement table, 
+        // There is no point in hitting the replacement table,
         // so at this timing the generation of the replacement table is updated.
         // It doesn't matter if you have disabled the substitution table.
         TT.new_search();
@@ -766,7 +888,7 @@ namespace Learner
         atomic<double> sum_norm;
         sum_norm = 0;
 
-        // The number of times the pv first move of deep 
+        // The number of times the pv first move of deep
         // search matches the pv first move of search(1).
         atomic<int> move_accord_count;
         move_accord_count = 0;
@@ -778,7 +900,7 @@ namespace Learner
         pos.set(StartFEN, false, &si, th);
         std::cout << "hirate eval = " << Eval::evaluate(pos);
 
-        // It's better to parallelize here, but it's a bit 
+        // It's better to parallelize here, but it's a bit
         // troublesome because the search before slave has not finished.
         // I created a mechanism to call task, so I will use it.
 
@@ -792,7 +914,7 @@ namespace Learner
         {
             // Assign work to each thread using TaskDispatcher.
             // A task definition for that.
-            // It is not possible to capture pos used in ↑, 
+            // It is not possible to capture pos used in ↑,
             // so specify the variables you want to capture one by one.
             auto task =
                 [
@@ -823,7 +945,7 @@ namespace Learner
                 // Evaluation value of deep search
                 auto deep_value = (Value)ps.score;
 
-                // Note) This code does not consider when 
+                // Note) This code does not consider when
                 //       eval_limit is specified in the learn command.
 
                 // --- calculation of cross entropy
@@ -834,14 +956,14 @@ namespace Learner
                 double test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy;
                 double test_entropy_eval, test_entropy_win, test_entropy;
                 calc_cross_entropy(
-                    deep_value, 
-                    shallow_value, 
-                    ps, 
-                    test_cross_entropy_eval, 
-                    test_cross_entropy_win, 
-                    test_cross_entropy, 
-                    test_entropy_eval, 
-                    test_entropy_win, 
+                    deep_value,
+                    shallow_value,
+                    ps,
+                    test_cross_entropy_eval,
+                    test_cross_entropy_win,
+                    test_cross_entropy,
+                    test_entropy_eval,
+                    test_entropy_win,
                     test_entropy);
 
                 // The total cross entropy need not be abs() by definition.
@@ -878,9 +1000,9 @@ namespace Learner
         latest_loss_sum += test_sum_cross_entropy - test_sum_entropy;
         latest_loss_count += sr.sfen_for_mse.size();
 
-        // learn_cross_entropy may be called train cross 
+        // learn_cross_entropy may be called train cross
         // entropy in the world of machine learning,
-        // When omitting the acronym, it is nice to be able to 
+        // When omitting the acronym, it is nice to be able to
         // distinguish it from test cross entropy(tce) by writing it as lce.
 
         if (sr.sfen_for_mse.size() && done)
@@ -907,7 +1029,7 @@ namespace Learner
             }
             cout << endl;
         }
-        else 
+        else
         {
             cout << "Error! : sr.sfen_for_mse.size() = " << sr.sfen_for_mse.size() << " ,  done = " << done << endl;
         }
@@ -977,7 +1099,7 @@ namespace Learner
                     {
                         sr.save_count = 0;
 
-                        // During this time, as the gradient calculation proceeds, 
+                        // During this time, as the gradient calculation proceeds,
                         // the value becomes too large and I feel annoyed, so stop other threads.
                         const bool converged = save();
                         if (converged)
@@ -1007,11 +1129,11 @@ namespace Learner
                         sr.last_done = sr.total_done;
                     }
 
-                    // Next time, I want you to do this series of 
+                    // Next time, I want you to do this series of
                     // processing again when you process only mini_batch_size.
                     sr.next_update_weights += mini_batch_size;
 
-                    // Since I was waiting for the update of this 
+                    // Since I was waiting for the update of this
                     // sr.next_update_weights except the main thread,
                     // Once this value is updated, it will start moving again.
                 }
@@ -1048,16 +1170,16 @@ namespace Learner
             if (pos.set_from_packed_sfen(ps.sfen, &si, th, mirror) != 0)
             {
                 // I got a strange sfen. Should be debugged!
-                // Since it is an illegal sfen, it may not be 
+                // Since it is an illegal sfen, it may not be
                 // displayed with pos.sfen(), but it is better than not.
                 cout << "Error! : illigal packed sfen = " << pos.fen() << endl;
                 goto RETRY_READ;
             }
 
             // There is a possibility that all the pieces are blocked and stuck.
-            // Also, the declaration win phase is excluded from 
+            // Also, the declaration win phase is excluded from
             // learning because you cannot go to leaf with PV moves.
-            // (shouldn't write out such teacher aspect itself, 
+            // (shouldn't write out such teacher aspect itself,
             // but may have written it out with an old generation routine)
             // Skip the position if there are no legal moves (=checkmated or stalemate).
             if (MoveList<LEGAL>(pos).size() == 0)
@@ -1073,7 +1195,7 @@ namespace Learner
             const auto deep_value = (Value)ps.score;
 
             // I feel that the mini batch has a better gradient.
-            // Go to the leaf node as it is, add only to the gradient array, 
+            // Go to the leaf node as it is, add only to the gradient array,
             // and later try AdaGrad at the time of rmse aggregation.
 
             const auto rootColor = pos.side_to_move();
@@ -1088,30 +1210,30 @@ namespace Learner
             auto pos_add_grad = [&]() {
                 // Use the value of evaluate in leaf as shallow_value.
                 // Using the return value of qsearch() as shallow_value,
-                // If PV is interrupted in the middle, the phase where 
-                // evaluate() is called to calculate the gradient, 
-                // and I don't think this is a very desirable property, 
+                // If PV is interrupted in the middle, the phase where
+                // evaluate() is called to calculate the gradient,
+                // and I don't think this is a very desirable property,
                 // as the aspect that gives that gradient will be different.
-                // I have turned off the substitution table, but since 
+                // I have turned off the substitution table, but since
                 // the pv array has not been updated due to one stumbling block etc...
 
-                const Value shallow_value = 
-                    (rootColor == pos.side_to_move()) 
-                    ? Eval::evaluate(pos) 
+                const Value shallow_value =
+                    (rootColor == pos.side_to_move())
+                    ? Eval::evaluate(pos)
                     : -Eval::evaluate(pos);
 
                 // Calculate loss for training data
                 double learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy;
                 double learn_entropy_eval, learn_entropy_win, learn_entropy;
                 calc_cross_entropy(
-                    deep_value, 
-                    shallow_value, 
-                    ps, 
-                    learn_cross_entropy_eval, 
-                    learn_cross_entropy_win, 
-                    learn_cross_entropy, 
-                    learn_entropy_eval, 
-                    learn_entropy_win, 
+                    deep_value,
+                    shallow_value,
+                    ps,
+                    learn_cross_entropy_eval,
+                    learn_cross_entropy_win,
+                    learn_cross_entropy,
+                    learn_entropy_eval,
+                    learn_entropy_win,
                     learn_entropy);
 
                 learn_sum_cross_entropy_eval += learn_cross_entropy_eval;
@@ -1154,7 +1276,7 @@ namespace Learner
                 Eval::NNUE::update_eval(pos);
             }
 
-            if (illegal_move) 
+            if (illegal_move)
             {
                 sync_cout << "An illegal move was detected... Excluded the position from the learning data..." << sync_endl;
                 continue;
@@ -1182,12 +1304,12 @@ namespace Learner
             // Do not dig a subfolder because I want to save it only once.
             Eval::save_eval("");
         }
-        else if (is_final) 
+        else if (is_final)
         {
             Eval::save_eval("final");
             return true;
         }
-        else 
+        else
         {
             static int dir_number = 0;
             const std::string dir_name = std::to_string(dir_number++);
@@ -1199,27 +1321,27 @@ namespace Learner
                 latest_loss_sum = 0.0;
                 latest_loss_count = 0;
                 cout << "loss: " << latest_loss;
-                if (latest_loss < best_loss) 
+                if (latest_loss < best_loss)
                 {
                     cout << " < best (" << best_loss << "), accepted" << endl;
                     best_loss = latest_loss;
                     best_nn_directory = Path::Combine((std::string)Options["EvalSaveDir"], dir_name);
                     trials = newbob_num_trials;
                 }
-                else 
+                else
                 {
                     cout << " >= best (" << best_loss << "), rejected" << endl;
-                    if (best_nn_directory.empty()) 
+                    if (best_nn_directory.empty())
                     {
                         cout << "WARNING: no improvement from initial model" << endl;
                     }
-                    else 
+                    else
                     {
                         cout << "restoring parameters from " << best_nn_directory << endl;
                         Eval::NNUE::RestoreParameters(best_nn_directory);
                     }
 
-                    if (--trials > 0 && !is_final) 
+                    if (--trials > 0 && !is_final)
                     {
                         cout
                             << "reducing learning rate scale from " << newbob_scale
@@ -1230,8 +1352,8 @@ namespace Learner
                         Eval::NNUE::SetGlobalLearningRateScale(newbob_scale);
                     }
                 }
-                
-                if (trials == 0) 
+
+                if (trials == 0)
                 {
                     cout << "converged" << endl;
                     return true;
@@ -1247,9 +1369,9 @@ namespace Learner
     // sfen_file_streams: fstream of each teacher phase file
     // sfen_count_in_file: The number of teacher positions present in each file.
     void shuffle_write(
-        const string& output_file_name, 
-        PRNG& prng, 
-        vector<fstream>& sfen_file_streams, 
+        const string& output_file_name,
+        PRNG& prng,
+        vector<fstream>& sfen_file_streams,
         vector<uint64_t>& sfen_count_in_file)
     {
         uint64_t total_sfen_count = 0;
@@ -1323,7 +1445,7 @@ namespace Learner
         // Temporary file is written to tmp/ folder for each buffer_size phase.
         // For example, if buffer_size = 20M, you need a buffer of 20M*40bytes = 800MB.
         // In a PC with a small memory, it would be better to reduce this.
-        // However, if the number of files increases too much, 
+        // However, if the number of files increases too much,
         // it will not be possible to open at the same time due to OS restrictions.
         // There should have been a limit of 512 per process on Windows, so you can open here as 500,
         // The current setting is 500 files x 20M = 10G = 10 billion phases.
@@ -1377,7 +1499,7 @@ namespace Learner
 
             // Read in units of sizeof(PackedSfenValue),
             // Ignore the last remaining fraction. (Fails in fs.read, so exit while)
-            // (The remaining fraction seems to be half-finished data 
+            // (The remaining fraction seems to be half-finished data
             // that was created because it was stopped halfway during teacher generation.)
         }
 
@@ -1385,14 +1507,14 @@ namespace Learner
             write_buffer(buf_write_marker);
 
         // Only shuffled files have been written write_file_count.
-        // As a second pass, if you open all of them at the same time, 
+        // As a second pass, if you open all of them at the same time,
         // select one at random and load one phase at a time
         // Now you have shuffled.
 
-        // Original file for shirt full + tmp file + file to write 
+        // Original file for shirt full + tmp file + file to write
         // requires 3 times the storage capacity of the original file.
         // 1 billion SSD is not enough for shuffling because it is 400GB for 10 billion phases.
-        // If you want to delete (or delete by hand) the 
+        // If you want to delete (or delete by hand) the
         // original file at this point after writing to tmp,
         // The storage capacity is about twice that of the original file.
         // So, maybe we should have an option to delete the original file.
@@ -1477,11 +1599,11 @@ namespace Learner
 
         std::cout << "write : " << output_file_name << endl;
 
-        // If the file to be written exceeds 2GB, it cannot be 
+        // If the file to be written exceeds 2GB, it cannot be
         // written in one shot with fstream::write, so use wrapper.
         write_memory_to_file(
-            output_file_name, 
-            (void*)&buf[0], 
+            output_file_name,
+            (void*)&buf[0],
             sizeof(PackedSfenValue) * buf.size());
 
         std::cout << "..shuffle_on_memory done." << std::endl;
@@ -1521,10 +1643,10 @@ namespace Learner
         uint64_t buffer_size = 20000000;
         // fast shuffling assuming each file is shuffled
         bool shuffle_quick = false;
-        // A function to read the entire file in memory and shuffle it. 
+        // A function to read the entire file in memory and shuffle it.
         // (Requires file size memory)
         bool shuffle_on_memory = false;
-        // Conversion of packed sfen. In plain, it consists of sfen(string), 
+        // Conversion of packed sfen. In plain, it consists of sfen(string),
         // evaluation value (integer), move (eg 7g7f, string), result (loss-1, win 1, draw 0)
         bool use_convert_plain = false;
         // convert plain format teacher to Yaneura King's bin
@@ -1541,15 +1663,15 @@ namespace Learner
         // File name to write in those cases (default is "shuffled_sfen.bin")
         string output_file_name = "shuffled_sfen.bin";
 
-        // If the absolute value of the evaluation value 
-        // in the deep search of the teacher phase exceeds this value, 
+        // If the absolute value of the evaluation value
+        // in the deep search of the teacher phase exceeds this value,
         // that phase is discarded.
         int eval_limit = 32000;
 
         // Flag to save the evaluation function file only once near the end.
         bool save_only_once = false;
 
-        // Shuffle about what you are pre-reading on the teacher aspect. 
+        // Shuffle about what you are pre-reading on the teacher aspect.
         // (Shuffle of about 10 million phases)
         // Turn on if you want to pass a pre-shuffled file.
         bool no_shuffle = false;
@@ -1559,8 +1681,8 @@ namespace Learner
         ELMO_LAMBDA2 = 0.33;
         ELMO_LAMBDA_LIMIT = 32000;
 
-        // Discount rate. If this is set to a value other than 0, 
-        // the slope will be added even at other than the PV termination. 
+        // Discount rate. If this is set to a value other than 0,
+        // the slope will be added even at other than the PV termination.
         // (At that time, apply this discount rate)
         double discount_rate = 0;
 
@@ -1620,18 +1742,18 @@ namespace Learner
             else if (option == "eta2_epoch") is >> eta2_epoch;
 
             // Accept also the old option name.
-            else if (option == "use_draw_in_training" 
-                  || option == "use_draw_games_in_training") 
+            else if (option == "use_draw_in_training"
+                  || option == "use_draw_games_in_training")
                 is >> use_draw_games_in_training;
 
             // Accept also the old option name.
-            else if (option == "use_draw_in_validation" 
-                  || option == "use_draw_games_in_validation") 
+            else if (option == "use_draw_in_validation"
+                  || option == "use_draw_games_in_validation")
                 is >> use_draw_games_in_validation;
 
             // Accept also the old option name.
-            else if (option == "use_hash_in_training" 
-                  || option == "skip_duplicated_positions_in_training") 
+            else if (option == "use_hash_in_training"
+                  || option == "skip_duplicated_positions_in_training")
                 is >> skip_duplicated_positions_in_training;
 
             else if (option == "winning_probability_coefficient") is >> winning_probability_coefficient;
@@ -1792,9 +1914,9 @@ namespace Learner
             Eval::init_NNUE();
             cout << "convert_bin_from_pgn-extract.." << endl;
             convert_bin_from_pgn_extract(
-                filenames, 
-                output_file_name, 
-                pgn_eval_side_to_move, 
+                filenames,
+                output_file_name,
+                pgn_eval_side_to_move,
                 convert_no_eval_fens_as_score_zero);
 
             return;
@@ -1808,7 +1930,7 @@ namespace Learner
         // Insert the file name for the number of loops.
         for (int i = 0; i < loop; ++i)
         {
-            // sfen reader, I'll read it in reverse 
+            // sfen reader, I'll read it in reverse
             // order so I'll reverse it here. I'm sorry.
             for (auto it = filenames.rbegin(); it != filenames.rend(); ++it)
             {
@@ -1891,12 +2013,12 @@ namespace Learner
 
         learn_think.mini_batch_size = mini_batch_size;
 
-        if (validation_set_file_name.empty()) 
+        if (validation_set_file_name.empty())
         {
             // Get about 10,000 data for mse calculation.
             sr.read_for_mse();
         }
-        else 
+        else
         {
             sr.read_validation_set(validation_set_file_name, eval_limit);
         }

From 585a5351bf1dee8c3fb56f74a53f3d035781189f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 10 Sep 2020 00:42:24 +0200
Subject: [PATCH 087/398] Fix warnings.

---
 src/extra/nnue_data_binpack_format.h | 234 ++++++++-------------------
 1 file changed, 66 insertions(+), 168 deletions(-)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index bec0e9ad..9b7a868e 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -501,14 +501,14 @@ namespace chess
 
         [[nodiscard]] constexpr ValueType& operator[](const KeyType& dir)
         {
-            assert(ordinal(dir) < SizeV);
+            assert(static_cast<int>(ordinal(dir)) < static_cast<int>(SizeV));
 
             return elements[ordinal(dir)];
         }
 
         [[nodiscard]] constexpr const ValueType& operator[](const KeyType& dir) const
         {
-            assert(ordinal(dir) < SizeV);
+            assert(static_cast<int>(ordinal(dir)) < static_cast<int>(SizeV));
 
             return elements[ordinal(dir)];
         }
@@ -1141,9 +1141,9 @@ namespace chess
         {
         }
 
-        constexpr Offset(int files, int ranks) :
-            files(files),
-            ranks(ranks)
+        constexpr Offset(int files_, int ranks_) :
+            files(files_),
+            ranks(ranks_)
         {
         }
 
@@ -1328,7 +1328,7 @@ namespace chess
         [[nodiscard]] constexpr Color color() const
         {
             assert(isOk());
-            return !fromOrdinal<Color>(ordinal(rank()) + ordinal(file()) & 1);
+            return !fromOrdinal<Color>((ordinal(rank()) + ordinal(file())) & 1);
         }
 
         constexpr void flipVertically()
@@ -1887,11 +1887,11 @@ namespace chess
         {
         }
 
-        constexpr ReverseMove(const Move& move, Piece capturedPiece, Square oldEpSquare, CastlingRights oldCastlingRights) :
-            move(move),
-            capturedPiece(capturedPiece),
-            oldEpSquare(oldEpSquare),
-            oldCastlingRights(oldCastlingRights)
+        constexpr ReverseMove(const Move& move_, Piece capturedPiece_, Square oldEpSquare_, CastlingRights oldCastlingRights_) :
+            move(move_),
+            capturedPiece(capturedPiece_),
+            oldEpSquare(oldEpSquare_),
+            oldCastlingRights(oldCastlingRights_)
         {
         }
 
@@ -3100,13 +3100,13 @@ namespace chess
                 return bbs;
             }
 
-            [[nodiscard]] static Bitboard generateSliderPseudoAttacks(const std::array<Offset, 4> & offsets, Square fromSq)
+            [[nodiscard]] static Bitboard generateSliderPseudoAttacks(const std::array<Offset, 4> & offsets_, Square fromSq)
             {
                 assert(fromSq.isOk());
 
                 Bitboard bb{};
 
-                for (auto&& offset : offsets)
+                for (auto&& offset : offsets_)
                 {
                     SquareCoords fromSqC = fromSq.coords();
 
@@ -3370,32 +3370,32 @@ namespace chess
 
             static const EnumArray2<Square, Square, Bitboard> between = []()
             {
-                EnumArray2<Square, Square, Bitboard> between;
+                EnumArray2<Square, Square, Bitboard> between_;
 
                 for (Square s1 : values<Square>())
                 {
                     for (Square s2 : values<Square>())
                     {
-                        between[s1][s2] = generateBetween(s1, s2);
+                        between_[s1][s2] = generateBetween(s1, s2);
                     }
                 }
 
-                return between;
+                return between_;
             }();
 
             static const EnumArray2<Square, Square, Bitboard> line = []()
             {
-                EnumArray2<Square, Square, Bitboard> line;
+                EnumArray2<Square, Square, Bitboard> line_;
 
                 for (Square s1 : values<Square>())
                 {
                     for (Square s2 : values<Square>())
                     {
-                        line[s1][s2] = generateLine(s1, s2);
+                        line_[s1][s2] = generateLine(s1, s2);
                     }
                 }
 
-                return line;
+                return line_;
             }();
         }
 
@@ -4262,12 +4262,12 @@ namespace chess
             else if (move.type == MoveType::EnPassant)
             {
                 const Piece movedPiece = m_pieces[move.to];
-                const Piece capturedPiece(PieceType::Pawn, !movedPiece.color());
+                const Piece capturedPiece_(PieceType::Pawn, !movedPiece.color());
                 const Square capturedPieceSq(move.to.file(), move.from.rank());
 
                 m_pieces[move.to] = Piece::none();
                 m_pieces[move.from] = movedPiece;
-                m_pieces[capturedPieceSq] = capturedPiece;
+                m_pieces[capturedPieceSq] = capturedPiece_;
 
                 m_pieceBB[movedPiece] ^= move.from;
                 m_pieceBB[movedPiece] ^= move.to;
@@ -4276,14 +4276,14 @@ namespace chess
                 m_pieceBB[Piece::none()] ^= move.to;
 
                 // on ep move there are 3 squares involved
-                m_pieceBB[capturedPiece] ^= capturedPieceSq;
+                m_pieceBB[capturedPiece_] ^= capturedPieceSq;
                 m_pieceBB[Piece::none()] ^= capturedPieceSq;
 
                 m_piecesByColorBB[movedPiece.color()] ^= move.to;
                 m_piecesByColorBB[movedPiece.color()] ^= move.from;
-                m_piecesByColorBB[capturedPiece.color()] ^= capturedPieceSq;
+                m_piecesByColorBB[capturedPiece_.color()] ^= capturedPieceSq;
 
-                ++m_pieceCount[capturedPiece];
+                ++m_pieceCount[capturedPiece_];
                 --m_pieceCount[Piece::none()];
             }
             else // if (move.type == MoveType::Castle)
@@ -4565,9 +4565,6 @@ namespace chess
 
         [[nodiscard]] inline bool isCheckAfterMove(Move move) const;
 
-        // Checks whether ANY `move` is legal.
-        [[nodiscard]] inline bool isMoveLegal(Move move) const;
-
         [[nodiscard]] inline bool isPseudoLegalMoveLegal(Move move) const;
 
         [[nodiscard]] inline bool isMovePseudoLegal(Move move) const;
@@ -4806,7 +4803,7 @@ namespace chess
             }
         }
 
-        [[nodiscard]] FORCEINLINE constexpr std::uint8_t compressKing(const Position& position, Square sq, Piece piece)
+        [[nodiscard]] FORCEINLINE constexpr std::uint8_t compressKing(const Position& position, Square /* sq */, Piece piece)
         {
             const Color color = piece.color();
             const Color sideToMove = position.sideToMove();
@@ -4829,19 +4826,19 @@ namespace chess
     namespace detail::lookup
     {
         static constexpr EnumArray<PieceType, std::uint8_t(*)(const Position&, Square, Piece)> pieceCompressorFunc = []() {
-            EnumArray<PieceType, std::uint8_t(*)(const Position&, Square, Piece)> pieceCompressorFunc{};
+            EnumArray<PieceType, std::uint8_t(*)(const Position&, Square, Piece)> pieceCompressorFunc_{};
 
-            pieceCompressorFunc[PieceType::Knight] = detail::compressOrdinaryPiece;
-            pieceCompressorFunc[PieceType::Bishop] = detail::compressOrdinaryPiece;
-            pieceCompressorFunc[PieceType::Queen] = detail::compressOrdinaryPiece;
+            pieceCompressorFunc_[PieceType::Knight] = detail::compressOrdinaryPiece;
+            pieceCompressorFunc_[PieceType::Bishop] = detail::compressOrdinaryPiece;
+            pieceCompressorFunc_[PieceType::Queen] = detail::compressOrdinaryPiece;
 
-            pieceCompressorFunc[PieceType::Pawn] = detail::compressPawn;
-            pieceCompressorFunc[PieceType::Rook] = detail::compressRook;
-            pieceCompressorFunc[PieceType::King] = detail::compressKing;
+            pieceCompressorFunc_[PieceType::Pawn] = detail::compressPawn;
+            pieceCompressorFunc_[PieceType::Rook] = detail::compressRook;
+            pieceCompressorFunc_[PieceType::King] = detail::compressKing;
 
-            pieceCompressorFunc[PieceType::None] = [](const Position&, Square, Piece) -> std::uint8_t { /* should never happen */ return 0; };
+            pieceCompressorFunc_[PieceType::None] = [](const Position&, Square, Piece) -> std::uint8_t { /* should never happen */ return 0; };
 
-            return pieceCompressorFunc;
+            return pieceCompressorFunc_;
         }();
     }
 
@@ -5089,6 +5086,8 @@ namespace chess
                     king ^= occupiedChange;
                 }
             }
+            case PieceType::None:
+                assert(false);
             }
         }
 
@@ -5285,23 +5284,23 @@ namespace chess
     namespace detail::lookup
     {
         static constexpr EnumArray<Piece, char> fenPiece = []() {
-            EnumArray<Piece, char> fenPiece{};
+            EnumArray<Piece, char> fenPiece_{};
 
-            fenPiece[whitePawn] = 'P';
-            fenPiece[blackPawn] = 'p';
-            fenPiece[whiteKnight] = 'N';
-            fenPiece[blackKnight] = 'n';
-            fenPiece[whiteBishop] = 'B';
-            fenPiece[blackBishop] = 'b';
-            fenPiece[whiteRook] = 'R';
-            fenPiece[blackRook] = 'r';
-            fenPiece[whiteQueen] = 'Q';
-            fenPiece[blackQueen] = 'q';
-            fenPiece[whiteKing] = 'K';
-            fenPiece[blackKing] = 'k';
-            fenPiece[Piece::none()] = 'X';
+            fenPiece_[whitePawn] = 'P';
+            fenPiece_[blackPawn] = 'p';
+            fenPiece_[whiteKnight] = 'N';
+            fenPiece_[blackKnight] = 'n';
+            fenPiece_[whiteBishop] = 'B';
+            fenPiece_[blackBishop] = 'b';
+            fenPiece_[whiteRook] = 'R';
+            fenPiece_[blackRook] = 'r';
+            fenPiece_[whiteQueen] = 'Q';
+            fenPiece_[blackQueen] = 'q';
+            fenPiece_[whiteKing] = 'K';
+            fenPiece_[blackKing] = 'k';
+            fenPiece_[Piece::none()] = 'X';
 
-            return fenPiece;
+            return fenPiece_;
         }();
     }
 
@@ -5495,21 +5494,21 @@ namespace chess
     namespace detail::lookup
     {
         static constexpr EnumArray<Square, CastlingRights> preservedCastlingRights = []() {
-            EnumArray<Square, CastlingRights> preservedCastlingRights{};
-            for (CastlingRights& rights : preservedCastlingRights)
+            EnumArray<Square, CastlingRights> preservedCastlingRights_{};
+            for (CastlingRights& rights : preservedCastlingRights_)
             {
                 rights = ~CastlingRights::None;
             }
 
-            preservedCastlingRights[e1] = ~CastlingRights::White;
-            preservedCastlingRights[e8] = ~CastlingRights::Black;
+            preservedCastlingRights_[e1] = ~CastlingRights::White;
+            preservedCastlingRights_[e8] = ~CastlingRights::Black;
 
-            preservedCastlingRights[h1] = ~CastlingRights::WhiteKingSide;
-            preservedCastlingRights[a1] = ~CastlingRights::WhiteQueenSide;
-            preservedCastlingRights[h8] = ~CastlingRights::BlackKingSide;
-            preservedCastlingRights[a8] = ~CastlingRights::BlackQueenSide;
+            preservedCastlingRights_[h1] = ~CastlingRights::WhiteKingSide;
+            preservedCastlingRights_[a1] = ~CastlingRights::WhiteQueenSide;
+            preservedCastlingRights_[h8] = ~CastlingRights::BlackKingSide;
+            preservedCastlingRights_[a8] = ~CastlingRights::BlackQueenSide;
 
-            return preservedCastlingRights;
+            return preservedCastlingRights_;
         }();
     }
 
@@ -5687,8 +5686,6 @@ namespace chess
         [[nodiscard]] inline std::string moveToUci(const Position& pos, const Move& move);
         [[nodiscard]] inline Move uciToMove(const Position& pos, std::string_view sv);
 
-        [[nodiscard]] inline std::optional<Move> tryUciToMove(const Position& pos, std::string_view sv);
-
         [[nodiscard]] inline std::string moveToUci(const Position& pos, const Move& move)
         {
             std::string s;
@@ -5751,103 +5748,6 @@ namespace chess
                 }
             }
         }
-
-        [[nodiscard]] inline std::optional<Move> tryUciToMove(const Position& pos, std::string_view sv)
-        {
-            if (sv.size() < 4 || sv.size() > 5)
-            {
-                return std::nullopt;
-            }
-
-            const auto from = parser_bits::tryParseSquare(sv.substr(0, 2));
-            const auto to = parser_bits::tryParseSquare(sv.substr(2, 2));
-
-            Move move{};
-
-            if (!from.has_value() || !to.has_value())
-            {
-                return std::nullopt;
-            }
-
-            if (sv.size() == 5)
-            {
-                const auto promotedPieceType = fromChar<PieceType>(sv[4]);
-                if (!promotedPieceType.has_value())
-                {
-                    return std::nullopt;
-                }
-
-                if (
-                    *promotedPieceType != PieceType::Knight
-                    && *promotedPieceType != PieceType::Bishop
-                    && *promotedPieceType != PieceType::Rook
-                    && *promotedPieceType != PieceType::Queen
-                    )
-                {
-                    return std::nullopt;
-                }
-
-                move = Move::promotion(*from, *to, Piece(*promotedPieceType, pos.sideToMove()));
-            }
-            else // sv.size() == 4
-            {
-
-                if (
-                    pos.pieceAt(*from).type() == PieceType::King
-                    && std::abs(from->file() - to->file()) > 1
-                    )
-                {
-                    // uci king destinations are on files C or G.
-
-                    if (pos.sideToMove() == Color::White)
-                    {
-                        if (*from != e1)
-                        {
-                            return std::nullopt;
-                        }
-
-                        if (*to != c1 && *to != g1)
-                        {
-                            return std::nullopt;
-                        }
-                    }
-                    else
-                    {
-                        if (*from != e8)
-                        {
-                            return std::nullopt;
-                        }
-
-                        if (*to != c8 && *to != g8)
-                        {
-                            return std::nullopt;
-                        }
-                    }
-
-                    const CastleType castleType =
-                        (to->file() == fileG)
-                        ? CastleType::Short
-                        : CastleType::Long;
-
-                    move = Move::castle(castleType, pos.sideToMove());
-                }
-                else if (to == pos.epSquare())
-                {
-                    move = Move::enPassant(*from, *to);
-                }
-                else
-                {
-                    move = Move::normal(*from, *to);
-                }
-            }
-
-            if (!pos.isMoveLegal(move))
-            {
-                return std::nullopt;
-            }
-
-            return move;
-        }
     }
 }
 
@@ -6206,7 +6106,7 @@ namespace binpack
         {
             SfenPacker packer;
             auto& stream = packer.stream;
-            stream.set_data((uint8_t*)&sfen);
+            stream.set_data(const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(&sfen)));
 
             chess::Position pos{};
 
@@ -6450,11 +6350,11 @@ namespace binpack
         std::uint16_t numPlies;
         unsigned char* movetext;
 
-        PackedMoveScoreListReader(const TrainingDataEntry& entry, unsigned char* movetext, std::uint16_t numPlies) :
-            entry(entry),
-            movetext(movetext),
-            numPlies(numPlies),
-            m_lastScore(-entry.score)
+        PackedMoveScoreListReader(const TrainingDataEntry& entry_, unsigned char* movetext_, std::uint16_t numPlies_) :
+            entry(entry_),
+            numPlies(numPlies_),
+            movetext(movetext_),
+            m_lastScore(-entry_.score)
         {
 
         }
@@ -7247,7 +7147,6 @@ namespace binpack
 
     inline void convertBinToPlain(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
     {
-        constexpr std::size_t reportEveryNPositions = 100'000;
         constexpr std::size_t bufferSize = MiB;
 
         std::cout << "Converting " << inputPath << " to " << outputPath << '\n';
@@ -7300,7 +7199,6 @@ namespace binpack
 
     inline void convertPlainToBin(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
     {
-        constexpr std::size_t reportEveryNPositions = 100'000;
         constexpr std::size_t bufferSize = MiB;
 
         std::cout << "Compressing " << inputPath << " to " << outputPath << '\n';

From a7ca8265937f8ead43355284c608893cf68ffbb5 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 10 Sep 2020 01:02:56 +0200
Subject: [PATCH 088/398] MIT license/copyright notice in the library file.

---
 src/extra/nnue_data_binpack_format.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index 9b7a868e..5dd5819c 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -1,3 +1,29 @@
+/*
+
+Copyright 2020 Tomasz Sobczyk
+
+Permission is hereby granted, free of charge,
+to any person obtaining a copy of this software
+and associated documentation files (the "Software"),
+to deal in the Software without restriction,
+including without limitation the rights to use, copy,
+modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall
+be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH
+THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+*/
+
 #pragma once
 
 #include <cstdio>

From 53ad4d8b5613ff005d737d42b5ef25fcd88f38f9 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 10 Sep 2020 01:48:48 +0200
Subject: [PATCH 089/398] A speculative build fix for linux.

---
 src/extra/nnue_data_binpack_format.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index 5dd5819c..c86a55c2 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -40,7 +40,13 @@ THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include <cassert>
 #include <array>
 #include <immintrin.h>
+
+#ifdef linux
+#include <x86intrin.h>
+#else
 #include <intrin.h>
+#endif
+
 #include <nmmintrin.h>
 #include <limits>
 

From 7e6901af27effddcf75ac293377e7879eb2d517f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 10 Sep 2020 12:36:39 +0200
Subject: [PATCH 090/398] Remove unused immintring. Include intrin.h only on
 some platforms, otherwise builtins are used.

---
 src/extra/nnue_data_binpack_format.h | 109 +--------------------------
 1 file changed, 4 insertions(+), 105 deletions(-)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index c86a55c2..3204b4b4 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -39,17 +39,11 @@ THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include <cstdio>
 #include <cassert>
 #include <array>
-#include <immintrin.h>
-
-#ifdef linux
-#include <x86intrin.h>
-#else
-#include <intrin.h>
-#endif
-
-#include <nmmintrin.h>
 #include <limits>
 
+#if (defined(_MSC_VER) || defined(__INTEL_COMPILER)) && !defined(__clang__)
+#include <intrin.h>
+#endif
 
 namespace chess
 {
@@ -177,87 +171,12 @@ namespace chess
     #endif
     }
 
-
-    template <typename IntT>
-    [[nodiscard]] constexpr IntT mulSaturate(IntT lhs, IntT rhs)
-    {
-        static_assert(std::is_unsigned_v<IntT>); // currently no support for signed
-
-    #if defined (_MSC_VER)
-
-        if (lhs == 0) return 0;
-
-        const IntT result = lhs * rhs;
-        return result / lhs == rhs ? result : std::numeric_limits<IntT>::max();
-
-    #elif defined (__GNUC__)
-
-        IntT result{};
-        return __builtin_mul_overflow(lhs, rhs, &result) ? std::numeric_limits<IntT>::max() : result;
-
-    #endif
-    }
-
-    template <typename IntT>
-    [[nodiscard]] constexpr IntT addSaturate(IntT lhs, IntT rhs)
-    {
-        static_assert(std::is_unsigned_v<IntT>); // currently no support for signed
-
-    #if defined (_MSC_VER)
-
-        const IntT result = lhs + rhs;
-        return result >= lhs ? result : std::numeric_limits<IntT>::max();
-
-    #elif defined (__GNUC__)
-
-        IntT result{};
-        return __builtin_add_overflow(lhs, rhs, &result) ? std::numeric_limits<IntT>::max() : result;
-
-    #endif
-    }
-
-    template <typename IntT>
-    [[nodiscard]] constexpr bool addOverflows(IntT lhs, IntT rhs)
-    {
-    #if defined (_MSC_VER)
-
-        return static_cast<IntT>(lhs + rhs) < lhs;
-
-    #elif defined (__GNUC__)
-
-        IntT result{};
-        __builtin_add_overflow(lhs, rhs, &result);
-        return result;
-
-    #endif
-    }
-
     template <typename IntT>
     [[nodiscard]] constexpr IntT floorLog2(IntT value)
     {
         return intrin::msb_constexpr(value);
     }
 
-    template <typename IntT>
-    constexpr std::size_t maxFibonacciNumberIndexForType()
-    {
-        static_assert(std::is_unsigned_v<IntT>);
-
-        switch (sizeof(IntT))
-        {
-        case 8:
-            return 93;
-        case 4:
-            return 47;
-        case 2:
-            return 24;
-        case 1:
-            return 13;
-        }
-
-        return 0;
-    }
-
     template <typename IntT>
     constexpr auto computeMasks()
     {
@@ -278,26 +197,6 @@ namespace chess
     template <typename IntT>
     constexpr auto nbitmask = computeMasks<IntT>();
 
-    template <typename IntT>
-    constexpr auto computeFibonacciNumbers()
-    {
-        constexpr std::size_t size = maxFibonacciNumberIndexForType<IntT>() + 1;
-        std::array<IntT, size> numbers{};
-        numbers[0] = 0;
-        numbers[1] = 1;
-
-        for (std::size_t i = 2; i < size; ++i)
-        {
-            numbers[i] = numbers[i - 1] + numbers[i - 2];
-        }
-
-        return numbers;
-    }
-
-    // F(0) = 0, F(1) = 1
-    template <typename IntT>
-    constexpr auto fibonacciNumbers = computeFibonacciNumbers<IntT>();
-
     template <std::size_t N, typename FromT, typename ToT = std::make_signed_t<FromT>>
     inline ToT signExtend(FromT value)
     {
@@ -2700,7 +2599,7 @@ namespace chess
         return Bitboard::square(sq0) | sq1;
     }
 
-    [[nodiscard]] constexpr Bitboard operator""_bb(std::uint64_t bits)
+    [[nodiscard]] constexpr Bitboard operator""_bb(unsigned long long bits)
     {
         return Bitboard::fromBits(bits);
     }

From 59402d4a6de1fc27a0253c8f7d3c2d604a5236fb Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 10 Sep 2020 13:02:45 +0200
Subject: [PATCH 091/398] Include <climits> for CHAR_BIT. Test both formats in
 instrumented learn.

---
 src/extra/nnue_data_binpack_format.h | 1 +
 tests/instrumented_learn.sh          | 8 ++++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index 3204b4b4..839fc17c 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -40,6 +40,7 @@ THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include <cassert>
 #include <array>
 #include <limits>
+#include <climits>
 
 #if (defined(_MSC_VER) || defined(__INTEL_COMPILER)) && !defined(__clang__)
 #include <intrin.h>
diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
index 756569e6..147c0c97 100755
--- a/tests/instrumented_learn.sh
+++ b/tests/instrumented_learn.sh
@@ -78,7 +78,9 @@ cat << EOF > gensfen01.exp
  send "setoption name Threads value $threads\n"
  send "setoption name Use NNUE value false\n"
  send "isready\n"
- send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data_01/training_data.bin use_raw_nnue_eval 0\n"
+ send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data_01/training_data.bin use_raw_nnue_eval 0 sfen_format bin\n"
+ expect "gensfen finished."
+ send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data_01/training_data.binpack use_raw_nnue_eval 0 sfen_format binpack\n"
  expect "gensfen finished."
 
  send "quit\n"
@@ -100,7 +102,9 @@ cat << EOF > gensfen02.exp
  send "setoption name Threads value $threads\n"
  send "setoption name Use NNUE value true\n"
  send "isready\n"
- send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data_02/training_data.bin use_raw_nnue_eval 0\n"
+ send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data_01/training_data.bin use_raw_nnue_eval 0 sfen_format bin\n"
+ expect "gensfen finished."
+ send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data_01/training_data.binpack use_raw_nnue_eval 0 sfen_format binpack\n"
  expect "gensfen finished."
 
  send "quit\n"

From ac6e6f73f281458e6c5488debf4d96d7a50c8bf4 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Thu, 10 Sep 2020 20:54:47 +0900
Subject: [PATCH 092/398] Added EnableTranspositionTable UCI option to
 enable/disable transposition table.

---
 src/tt.cpp        | 11 +++++++++++
 src/tt.h          |  4 ++++
 src/ucioption.cpp |  7 ++++++-
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/src/tt.cpp b/src/tt.cpp
index 60a3a5f1..fc8ab3b1 100644
--- a/src/tt.cpp
+++ b/src/tt.cpp
@@ -28,6 +28,10 @@
 
 TranspositionTable TT; // Our global transposition table
 
+#ifdef EVAL_LEARN
+bool TranspositionTable::enable_transposition_table = true;
+#endif
+
 /// TTEntry::save() populates the TTEntry with a new node's data, possibly
 /// overwriting an old position. Update is not atomic and can be racy.
 
@@ -116,6 +120,13 @@ void TranspositionTable::clear() {
 
 TTEntry* TranspositionTable::probe(const Key key, bool& found) const {
 
+#ifdef EVAL_LEARN
+  if (!enable_transposition_table) {
+      found = false;
+      return first_entry(0);
+  }
+#endif
+
   TTEntry* const tte = first_entry(key);
   const uint16_t key16 = (uint16_t)key;  // Use the low 16 bits as key inside the cluster
 
diff --git a/src/tt.h b/src/tt.h
index fdfd6769..e83b6f3c 100644
--- a/src/tt.h
+++ b/src/tt.h
@@ -84,6 +84,10 @@ public:
     return &table[mul_hi64(key, clusterCount)].entry[0];
   }
 
+#ifdef EVAL_LEARN
+  static bool enable_transposition_table;
+#endif
+
 private:
   friend struct TTEntry;
 
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 0e561416..b24d8d78 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -44,7 +44,10 @@ void on_use_NNUE(const Option& ) { Eval::init_NNUE(); }
 void on_eval_file(const Option& ) { Eval::init_NNUE(); }
 #ifdef EVAL_LEARN
 void on_prune_at_shallow_depth_on_pv_node(const Option& o) {
-  Search::prune_at_shallow_depth_on_pv_node = o;
+    Search::prune_at_shallow_depth_on_pv_node = o;
+}
+void on_enable_transposition_table(const Option& o) {
+    TranspositionTable::enable_transposition_table = o;
 }
 #endif
 
@@ -102,6 +105,8 @@ void init(OptionsMap& o) {
   o["EvalSaveDir"] << Option("evalsave");
   // Prune at shallow depth on PV nodes. Setting this value to true gains elo in shallow search.
   o["PruneAtShallowDepthOnPvNode"] << Option(false, on_prune_at_shallow_depth_on_pv_node);
+  // Enable transposition table.
+  o["EnableTranspositionTable"] << Option(true, on_enable_transposition_table);
 #endif
 }
 

From c76bb34a96ed36511e360a60bd9f33e364617139 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 10 Sep 2020 15:18:47 +0200
Subject: [PATCH 093/398] Add convert UCI function that allows conversion of
 files between any of plain, bin, and binpack. Usage: convert infile outfile
 [append].

---
 src/extra/nnue_data_binpack_format.h |  33 +++++----
 src/learn/convert.cpp                | 105 +++++++++++++++++++++++++++
 src/uci.cpp                          |   3 +
 3 files changed, 125 insertions(+), 16 deletions(-)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index 839fc17c..2c555939 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -6915,7 +6915,7 @@ namespace binpack
     {
         constexpr std::size_t reportEveryNPositions = 100'000;
 
-        std::cout << "Compressing " << inputPath << " to " << outputPath << '\n';
+        std::cout << "Converting " << inputPath << " to " << outputPath << '\n';
 
         CompressedTrainingDataEntryWriter writer(outputPath, om);
         TrainingDataEntry e;
@@ -6961,13 +6961,15 @@ namespace binpack
             if (key == "ply"sv) e.ply = std::stoi(value);
             if (key == "result"sv) e.result = std::stoi(value);
         }
+
+        std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
     }
 
     inline void convertBinpackToPlain(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
     {
         constexpr std::size_t bufferSize = MiB;
 
-        std::cout << "Decompressing " << inputPath << " to " << outputPath << '\n';
+        std::cout << "Converting " << inputPath << " to " << outputPath << '\n';
 
         CompressedTrainingDataEntryReader reader(inputPath);
         std::ofstream outputFile(outputPath, om);
@@ -6999,6 +7001,8 @@ namespace binpack
             const auto cur = outputFile.tellp();
             std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
         }
+
+        std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
     }
 
 
@@ -7006,14 +7010,9 @@ namespace binpack
     {
         constexpr std::size_t reportEveryNPositions = 100'000;
 
-        std::cout << "Compressing " << inputPath << " to " << outputPath << '\n';
+        std::cout << "Converting " << inputPath << " to " << outputPath << '\n';
 
         CompressedTrainingDataEntryWriter writer(outputPath, om);
-        TrainingDataEntry e;
-
-        std::string key;
-        std::string value;
-        std::string move;
 
         std::ifstream inputFile(inputPath, std::ios_base::binary);
         const auto base = inputFile.tellg();
@@ -7037,13 +7036,15 @@ namespace binpack
                 std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
             }
         }
+
+        std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
     }
 
     inline void convertBinpackToBin(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
     {
         constexpr std::size_t bufferSize = MiB;
 
-        std::cout << "Decompressing " << inputPath << " to " << outputPath << '\n';
+        std::cout << "Converting " << inputPath << " to " << outputPath << '\n';
 
         CompressedTrainingDataEntryReader reader(inputPath);
         std::ofstream outputFile(outputPath, std::ios_base::binary | om);
@@ -7075,6 +7076,8 @@ namespace binpack
             const auto cur = outputFile.tellp();
             std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
         }
+
+        std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
     }
 
     inline void convertBinToPlain(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
@@ -7083,12 +7086,6 @@ namespace binpack
 
         std::cout << "Converting " << inputPath << " to " << outputPath << '\n';
 
-        TrainingDataEntry e;
-
-        std::string key;
-        std::string value;
-        std::string move;
-
         std::ifstream inputFile(inputPath, std::ios_base::binary);
         const auto base = inputFile.tellg();
         std::size_t numProcessedPositions = 0;
@@ -7127,13 +7124,15 @@ namespace binpack
             const auto cur = outputFile.tellp();
             std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
         }
+
+        std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
     }
 
     inline void convertPlainToBin(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
     {
         constexpr std::size_t bufferSize = MiB;
 
-        std::cout << "Compressing " << inputPath << " to " << outputPath << '\n';
+        std::cout << "Converting " << inputPath << " to " << outputPath << '\n';
 
         std::ofstream outputFile(outputPath, std::ios_base::binary | om);
         std::vector<char> buffer;
@@ -7194,5 +7193,7 @@ namespace binpack
             const auto cur = outputFile.tellp();
             std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
         }
+
+        std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
     }
 }
\ No newline at end of file
diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
index d07fc00c..364ad3dd 100644
--- a/src/learn/convert.cpp
+++ b/src/learn/convert.cpp
@@ -12,6 +12,8 @@
 #include "../position.h"
 #include "../tt.h"
 
+#include "../extra/nnue_data_binpack_format.h"
+
 #include <sstream>
 #include <fstream>
 #include <unordered_set>
@@ -497,5 +499,108 @@ namespace Learner
         ofs.close();
         std::cout << "all done" << std::endl;
     }
+
+    static inline const std::string plain_extension = ".plain";
+    static inline const std::string bin_extension = ".bin";
+    static inline const std::string binpack_extension = ".binpack";
+
+    static bool file_exists(const std::string& name)
+    {
+        std::ifstream f(name);
+        return f.good();
+    }
+
+    static bool ends_with(const std::string& lhs, const std::string& end)
+    {
+        if (end.size() > lhs.size()) return false;
+
+        return std::equal(end.rbegin(), end.rend(), lhs.rbegin());
+    }
+
+    static bool is_convert_of_type(
+        const std::string& input_path,
+        const std::string& output_path,
+        const std::string& expected_input_extension,
+        const std::string& expected_output_extension)
+    {
+        return ends_with(input_path, expected_input_extension)
+            && ends_with(output_path, expected_output_extension);
+    }
+
+    using ConvertFunctionType = void(std::string inputPath, std::string outputPath, std::ios_base::openmode om);
+
+    static ConvertFunctionType* get_convert_function(const std::string& input_path, const std::string& output_path)
+    {
+        if (is_convert_of_type(input_path, output_path, plain_extension, bin_extension))
+            return binpack::convertPlainToBin;
+        if (is_convert_of_type(input_path, output_path, plain_extension, binpack_extension))
+            return binpack::convertPlainToBinpack;
+
+        if (is_convert_of_type(input_path, output_path, bin_extension, plain_extension))
+            return binpack::convertBinToPlain;
+        if (is_convert_of_type(input_path, output_path, bin_extension, binpack_extension))
+            return binpack::convertBinToBinpack;
+
+        if (is_convert_of_type(input_path, output_path, binpack_extension, plain_extension))
+            return binpack::convertBinpackToPlain;
+        if (is_convert_of_type(input_path, output_path, binpack_extension, bin_extension))
+            return binpack::convertBinpackToBin;
+
+        return nullptr;
+    }
+
+    static void convert(const std::string& input_path, const std::string& output_path, std::ios_base::openmode om)
+    {
+        if(!file_exists(input_path))
+        {
+            std::cerr << "Input file does not exist.\n";
+            return;
+        }
+
+        auto func = get_convert_function(input_path, output_path);
+        if (func != nullptr)
+        {
+            func(input_path, output_path, om);
+        }
+        else
+        {
+            std::cerr << "Conversion between files of these types is not supported.\n";
+        }
+    }
+
+    static void convert(const std::vector<std::string>& args)
+    {
+        if (args.size() < 2 || args.size() > 3)
+        {
+            std::cerr << "Invalid arguments.\n";
+            std::cerr << "Usage: convert from_path to_path [append]\n";
+            return;
+        }
+
+        const bool append = (args.size() == 3) && (args[2] == "append");
+        const std::ios_base::openmode openmode =
+            append
+            ? std::ios_base::app
+            : std::ios_base::trunc;
+
+        convert(args[0], args[1], openmode);
+    }
+
+    void convert(istringstream& is)
+    {
+        std::vector<std::string> args;
+
+        while (true)
+        {
+            std::string token = "";
+            is >> token;
+            if (token == "")
+                break;
+
+            args.push_back(token);
+        }
+
+        convert(args);
+    }
 }
 #endif
diff --git a/src/uci.cpp b/src/uci.cpp
index 6675f2e0..96adf927 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -50,6 +50,8 @@ namespace Learner
   // Learning from the generated game record
   void learn(Position& pos, istringstream& is);
 
+  void convert(istringstream& is);
+
   // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
   typedef std::pair<Value, std::vector<Move> > ValueAndPV;
 
@@ -352,6 +354,7 @@ void UCI::loop(int argc, char* argv[]) {
 #if defined (EVAL_LEARN)
       else if (token == "gensfen") Learner::gen_sfen(pos, is);
       else if (token == "learn") Learner::learn(pos, is);
+      else if (token == "convert") Learner::convert(is);
 
       // Command to call qsearch(),search() directly for testing
       else if (token == "qsearch") qsearch_cmd(pos);

From c6f5f6a082592a2402f14908224fd33f9ad6fc0e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 11 Sep 2020 11:02:00 +0200
Subject: [PATCH 094/398] Replace "use_raw_nnue_eval" with an uci option "Use
 NNUE pure"

---
 src/evaluate.cpp      | 44 +++++++++++++++++++++++++------------------
 src/evaluate.h        | 11 ++++++++++-
 src/learn/gensfen.cpp |  8 --------
 src/learn/learner.cpp |  7 -------
 src/position.cpp      | 10 +++++-----
 src/ucioption.cpp     |  6 +++++-
 6 files changed, 46 insertions(+), 40 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 8edc9bb8..94581998 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -32,23 +32,32 @@
 #include "thread.h"
 #include "uci.h"
 
-#ifdef EVAL_LEARN
-namespace Learner
-{
-    extern bool use_raw_nnue_eval;
-}
-#endif
-
 namespace Eval {
 
-  bool useNNUE;
+  UseNNUEMode useNNUE;
   std::string eval_file_loaded="None";
 
+  static UseNNUEMode nnue_mode_from_option(const std::string& mode)
+  {
+    if (mode == "false")
+      return UseNNUEMode::False;
+    else if (mode == "true")
+      return UseNNUEMode::True;
+
+#ifdef EVAL_LEARN
+    else if (mode == "pure")
+      return UseNNUEMode::Pure;
+#endif
+
+    return UseNNUEMode::False;
+  }
+
   void init_NNUE() {
 
-    useNNUE = Options["Use NNUE"];
+    useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
+
     std::string eval_file = std::string(Options["EvalFile"]);
-    if (useNNUE && eval_file_loaded != eval_file)
+    if (useNNUE != UseNNUEMode::False && eval_file_loaded != eval_file)
         if (Eval::NNUE::load_eval_file(eval_file))
             eval_file_loaded = eval_file;
   }
@@ -56,8 +65,7 @@ namespace Eval {
   void verify_NNUE() {
 
     std::string eval_file = std::string(Options["EvalFile"]);
-    if (useNNUE && eval_file_loaded != eval_file)
-    {
+    if (useNNUE != UseNNUEMode::False && eval_file_loaded != eval_file)    {
         UCI::OptionsMap defaults;
         UCI::init(defaults);
 
@@ -69,7 +77,7 @@ namespace Eval {
         std::exit(EXIT_FAILURE);
     }
 
-    if (useNNUE)
+    if (useNNUE != UseNNUEMode::False)
         sync_cout << "info string NNUE evaluation using " << eval_file << " enabled." << sync_endl;
     else
         sync_cout << "info string classical evaluation enabled." << sync_endl;
@@ -948,17 +956,17 @@ make_v:
 
 Value Eval::evaluate(const Position& pos) {
 #ifdef EVAL_LEARN
-  if (Learner::use_raw_nnue_eval) {
+  if (useNNUE == UseNNUEMode::Pure) {
       return NNUE::evaluate(pos);
   }
 #endif
 
-  bool classical = !Eval::useNNUE
-                ||  abs(eg_value(pos.psq_score())) * 16 > NNUEThreshold1 * (16 + pos.rule50_count());
+  bool classical = useNNUE == UseNNUEMode::False
+                || abs(eg_value(pos.psq_score())) * 16 > NNUEThreshold1 * (16 + pos.rule50_count());
   Value v = classical ? Evaluation<NO_TRACE>(pos).value()
                       : NNUE::evaluate(pos) * 5 / 4 + Tempo;
 
-  if (classical && Eval::useNNUE && abs(v) * 16 < NNUEThreshold2 * (16 + pos.rule50_count()))
+  if (classical && useNNUE != UseNNUEMode::False && abs(v) * 16 < NNUEThreshold2 * (16 + pos.rule50_count()))
       v = NNUE::evaluate(pos) * 5 / 4 + Tempo;
 
   // Damp down the evaluation linearly when shuffling
@@ -1015,7 +1023,7 @@ std::string Eval::trace(const Position& pos) {
 
   ss << "\nClassical evaluation: " << to_cp(v) << " (white side)\n";
 
-  if (Eval::useNNUE)
+  if (useNNUE != UseNNUEMode::False)
   {
       v = NNUE::evaluate(pos);
       v = pos.side_to_move() == WHITE ? v : -v;
diff --git a/src/evaluate.h b/src/evaluate.h
index e808068d..61052e90 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -26,11 +26,20 @@
 class Position;
 
 namespace Eval {
+  enum struct UseNNUEMode
+  {
+    False,
+    True
+
+#ifdef EVAL_LEARN
+    ,Pure
+#endif
+  };
 
   std::string trace(const Position& pos);
   Value evaluate(const Position& pos);
 
-  extern bool useNNUE;
+  extern UseNNUEMode useNNUE;
   extern std::string eval_file_loaded;
   void init_NNUE();
   void verify_NNUE();
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 99a783bb..9088fd81 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -44,12 +44,6 @@ namespace Learner
     static bool detect_draw_by_consecutive_low_score = false;
     static bool detect_draw_by_insufficient_mating_material = false;
 
-    // Use raw NNUE eval value in the Eval::evaluate().
-    // If hybrid eval is enabled, training data
-    // generation and training don't work well.
-    // https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
-    extern bool use_raw_nnue_eval;
-
     static SfenOutputType sfen_output_type = SfenOutputType::Bin;
 
     static bool ends_with(const std::string& lhs, const std::string& end)
@@ -1111,8 +1105,6 @@ namespace Learner
                 is >> detect_draw_by_consecutive_low_score;
             else if (token == "detect_draw_by_insufficient_mating_material")
                 is >> detect_draw_by_insufficient_mating_material;
-            else if (token == "use_raw_nnue_eval")
-                is >> use_raw_nnue_eval;
             else if (token == "sfen_format")
                 is >> sfen_format;
             else
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 7cc04406..da093192 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -93,12 +93,6 @@ namespace Learner
     // data directly. In those cases, we set false to this variable.
     static bool convert_teacher_signal_to_winning_probability = true;
 
-    // Use raw NNUE eval value in the Eval::evaluate(). If hybrid eval is enabled, training data
-    // generation and training don't work well.
-    // https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
-    // This CANNOT be static since it's used elsewhere.
-    bool use_raw_nnue_eval = false;
-
     // Using stockfish's WDL with win rate model instead of sigmoid
     static bool use_wdl = false;
 
@@ -1811,7 +1805,6 @@ namespace Learner
             else if (option == "dest_score_min_value") is >> dest_score_min_value;
             else if (option == "dest_score_max_value") is >> dest_score_max_value;
             else if (option == "convert_teacher_signal_to_winning_probability") is >> convert_teacher_signal_to_winning_probability;
-            else if (option == "use_raw_nnue_eval") is >> use_raw_nnue_eval;
 
             // Otherwise, it's a filename.
             else
diff --git a/src/position.cpp b/src/position.cpp
index fe89b753..5ac461bc 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -755,7 +755,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
       else
           st->nonPawnMaterial[them] -= PieceValue[MG][captured];
 
-      if (Eval::useNNUE)
+      if (Eval::useNNUE != Eval::UseNNUEMode::False)
       {
           dp.dirty_num = 2;  // 1 piece moved, 1 piece captured
           dp.piece[1] = captured;
@@ -799,7 +799,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   // Move the piece. The tricky Chess960 castling is handled earlier
   if (type_of(m) != CASTLING)
   {
-      if (Eval::useNNUE)
+      if (Eval::useNNUE != Eval::UseNNUEMode::False)
       {
           dp.piece[0] = pc;
           dp.from[0] = from;
@@ -830,7 +830,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
           remove_piece(to);
           put_piece(promotion, to);
 
-          if (Eval::useNNUE)
+          if (Eval::useNNUE != Eval::UseNNUEMode::False)
           {
               // Promoting pawn to SQ_NONE, promoted piece from SQ_NONE
               dp.to[0] = SQ_NONE;
@@ -968,7 +968,7 @@ void Position::do_castling(Color us, Square from, Square& to, Square& rfrom, Squ
   rto = relative_square(us, kingSide ? SQ_F1 : SQ_D1);
   to = relative_square(us, kingSide ? SQ_G1 : SQ_C1);
 
-  if (Do && Eval::useNNUE)
+  if (Do && Eval::useNNUE != Eval::UseNNUEMode::False)
   {
       auto& dp = st->dirtyPiece;
       dp.piece[0] = make_piece(us, KING);
@@ -997,7 +997,7 @@ void Position::do_null_move(StateInfo& newSt) {
   assert(!checkers());
   assert(&newSt != st);
 
-  if (Eval::useNNUE)
+  if (Eval::useNNUE != Eval::UseNNUEMode::False)
   {
       std::memcpy(&newSt, st, sizeof(StateInfo));
       st->accumulator.computed_score = false;
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index b24d8d78..61e47539 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -86,7 +86,11 @@ void init(OptionsMap& o) {
   o["SyzygyProbeDepth"]      << Option(1, 1, 100);
   o["Syzygy50MoveRule"]      << Option(true);
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
-  o["Use NNUE"]              << Option(true, on_use_NNUE);
+#ifdef EVAL_LEARN
+  o["Use NNUE"]              << Option("true var true var false var pure", "true", on_use_NNUE);
+#else
+  o["Use NNUE"]              << Option("true var true var false", "true", on_use_NNUE);
+#endif
   // The default must follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work.
   o["EvalFile"]              << Option("nn-82215d0fd0df.nnue", on_eval_file);

From 683c6146ce7217df8693ba83ff9a27a941915aaf Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 11 Sep 2020 12:05:46 +0200
Subject: [PATCH 095/398] Move declarations around and split them.

---
 src/Makefile                         |   2 +-
 src/extra/sfen_packer.cpp            | 587 +++++++++++++--------------
 src/extra/sfen_packer.h              |  23 ++
 src/learn/convert.cpp                |   3 +-
 src/learn/convert.h                  |  37 ++
 src/learn/gensfen.cpp                |   4 +-
 src/learn/gensfen.h                  |  16 +
 src/learn/{learner.cpp => learn.cpp} |   5 +-
 src/learn/learn.h                    | 118 ++----
 src/learn/packed_sfen.h              |  49 +++
 src/position.cpp                     |  41 ++
 src/position.h                       |  14 +-
 src/search.h                         |  11 +
 src/uci.cpp                          |  25 +-
 14 files changed, 511 insertions(+), 424 deletions(-)
 create mode 100644 src/extra/sfen_packer.h
 create mode 100644 src/learn/convert.h
 create mode 100644 src/learn/gensfen.h
 rename src/learn/{learner.cpp => learn.cpp} (99%)
 create mode 100644 src/learn/packed_sfen.h

diff --git a/src/Makefile b/src/Makefile
index 49c6c1b3..88d759d2 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -56,7 +56,7 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	nnue/features/enpassant.cpp \
 	nnue/nnue_test_command.cpp \
 	extra/sfen_packer.cpp \
-	learn/learner.cpp \
+	learn/learn.cpp \
 	learn/gensfen.cpp \
 	learn/convert.cpp \
 	learn/learning_tools.cpp \
diff --git a/src/extra/sfen_packer.cpp b/src/extra/sfen_packer.cpp
index 1d82111d..b58ad5dd 100644
--- a/src/extra/sfen_packer.cpp
+++ b/src/extra/sfen_packer.cpp
@@ -1,5 +1,9 @@
 ﻿#if defined (EVAL_LEARN)
 
+#include "sfen_packer.h"
+
+#include "../learn/packed_sfen.h"
+
 #include "../misc.h"
 #include "../position.h"
 
@@ -9,153 +13,166 @@
 
 using namespace std;
 
-// -----------------------------------
-// stage compression/decompression
-// -----------------------------------
+namespace Learner {
 
-// Class that handles bitstream
-// useful when doing aspect encoding
-struct BitStream
-{
-  // Set the memory to store the data in advance.
-  // Assume that memory is cleared to 0.
-  void  set_data(uint8_t* data_) { data = data_; reset(); }
-
-  // Get the pointer passed in set_data().
-  uint8_t* get_data() const { return data; }
-
-  // Get the cursor.
-  int get_cursor() const { return bit_cursor; }
-
-  // reset the cursor
-  void reset() { bit_cursor = 0; }
-
-  // Write 1bit to the stream.
-  // If b is non-zero, write out 1. If 0, write 0.
-  void write_one_bit(int b)
+  // Class that handles bitstream
+  // useful when doing aspect encoding
+  struct BitStream
   {
-    if (b)
-      data[bit_cursor / 8] |= 1 << (bit_cursor & 7);
+    // Set the memory to store the data in advance.
+    // Assume that memory is cleared to 0.
+    void set_data(std::uint8_t* data_) { data = data_; reset(); }
 
-    ++bit_cursor;
-  }
+    // Get the pointer passed in set_data().
+    uint8_t* get_data() const { return data; }
 
-  // Get 1 bit from the stream.
-  int read_one_bit()
+    // Get the cursor.
+    int get_cursor() const { return bit_cursor; }
+
+    // reset the cursor
+    void reset() { bit_cursor = 0; }
+
+    // Write 1bit to the stream.
+    // If b is non-zero, write out 1. If 0, write 0.
+    void write_one_bit(int b)
+    {
+      if (b)
+        data[bit_cursor / 8] |= 1 << (bit_cursor & 7);
+
+      ++bit_cursor;
+    }
+
+    // Get 1 bit from the stream.
+    int read_one_bit()
+    {
+      int b = (data[bit_cursor / 8] >> (bit_cursor & 7)) & 1;
+      ++bit_cursor;
+
+      return b;
+    }
+
+    // write n bits of data
+    // Data shall be written out from the lower order of d.
+    void write_n_bit(int d, int n)
+    {
+      for (int i = 0; i <n; ++i)
+        write_one_bit(d & (1 << i));
+    }
+
+    // read n bits of data
+    // Reverse conversion of write_n_bit().
+    int read_n_bit(int n)
+    {
+      int result = 0;
+      for (int i = 0; i < n; ++i)
+        result |= read_one_bit() ? (1 << i) : 0;
+
+      return result;
+    }
+
+  private:
+    // Next bit position to read/write.
+    int bit_cursor;
+
+    // data entity
+    std::uint8_t* data;
+  };
+
+  // Class for compressing/decompressing sfen
+  // sfen can be packed to 256bit (32bytes) by Huffman coding.
+  // This is proven by mini. The above is Huffman coding.
+  //
+  // Internal format = 1-bit turn + 7-bit king position *2 + piece on board (Huffman coding) + hand piece (Huffman coding)
+  // Side to move (White = 0, Black = 1) (1bit)
+  // White King Position (6 bits)
+  // Black King Position (6 bits)
+  // Huffman Encoding of the board
+  // Castling availability (1 bit x 4)
+  // En passant square (1 or 1 + 6 bits)
+  // Rule 50 (6 bits)
+  // Game play (8 bits)
+  //
+  // TODO(someone): Rename SFEN to FEN.
+  //
+  struct SfenPacker
   {
-    int b = (data[bit_cursor / 8] >> (bit_cursor & 7)) & 1;
-    ++bit_cursor;
+    void pack(const Position& pos);
 
-    return b;
-  }
+    // sfen packed by pack() (256bit = 32bytes)
+    // Or sfen to decode with unpack()
+    uint8_t *data; // uint8_t[32];
 
-  // write n bits of data
-  // Data shall be written out from the lower order of d.
-  void write_n_bit(int d, int n)
+    BitStream stream;
+
+    // Output the board pieces to stream.
+    void write_board_piece_to_stream(Piece pc);
+
+    // Read one board piece from stream
+    Piece read_board_piece_from_stream();
+  };
+
+
+  // Huffman coding
+  // * is simplified from mini encoding to make conversion easier.
+  //
+  // 1 box on the board (other than NO_PIECE) = 2 to 6 bits (+ 1-bit flag + 1-bit forward and backward)
+  // 1 piece of hand piece = 1-5bit (+ 1-bit flag + 1bit ahead and behind)
+  //
+  // empty xxxxx0 + 0 (none)
+  // step xxxx01 + 2 xxxx0 + 2
+  // incense xx0011 + 2 xx001 + 2
+  // Katsura xx1011 + 2 xx101 + 2
+  // silver xx0111 + 2 xx011 + 2
+  // Gold x01111 + 1 x0111 + 1 // Gold is valid and has no flags.
+  // corner 011111 + 2 01111 + 2
+  // Fly 111111 + 2 11111 + 2
+  //
+  // Assuming all pieces are on the board,
+  // Sky 81-40 pieces = 41 boxes = 41bit
+  // Walk 4bit*18 pieces = 72bit
+  // Incense 6bit*4 pieces = 24bit
+  // Katsura 6bit*4 pieces = 24bit
+  // Silver 6bit*4 pieces = 24bit
+  // Gold 6bit* 4 pieces = 24bit
+  // corner 8bit* 2 pieces = 16bit
+  // Fly 8bit* 2 pieces = 16bit
+  // -------
+  // 241bit + 1bit (turn) + 7bit × 2 (King's position after) = 256bit
+  //
+  // When the piece on the board moves to the hand piece, the piece on the board becomes empty, so the box on the board can be expressed with 1 bit,
+  // Since the hand piece can be expressed by 1 bit less than the piece on the board, the total number of bits does not change in the end.
+  // Therefore, in this expression, any aspect can be expressed by this bit number.
+  // It is a hand piece and no flag is required, but if you include this, the bit number of the piece on the board will be -1
+  // Since the total number of bits can be fixed, we will include this as well.
+
+  // Huffman Encoding
+  //
+  // Empty  xxxxxxx0
+  // Pawn   xxxxx001 + 1 bit (Side to move)
+  // Knight xxxxx011 + 1 bit (Side to move)
+  // Bishop xxxxx101 + 1 bit (Side to move)
+  // Rook   xxxxx111 + 1 bit (Side to move)
+
+  struct HuffmanedPiece
   {
-    for (int i = 0; i <n; ++i)
-      write_one_bit(d & (1 << i));
-  }
+    int code; // how it will be coded
+    int bits; // How many bits do you have
+  };
 
-  // read n bits of data
-  // Reverse conversion of write_n_bit().
-  int read_n_bit(int n)
+  constexpr HuffmanedPiece huffman_table[] =
   {
-    int result = 0;
-    for (int i = 0; i < n; ++i)
-      result |= read_one_bit() ? (1 << i) : 0;
+    {0b0000,1}, // NO_PIECE
+    {0b0001,4}, // PAWN
+    {0b0011,4}, // KNIGHT
+    {0b0101,4}, // BISHOP
+    {0b0111,4}, // ROOK
+    {0b1001,4}, // QUEEN
+  };
 
-    return result;
-  }
-
-private:
-  // Next bit position to read/write.
-  int bit_cursor;
-
-  // data entity
-  uint8_t* data;
-};
-
-
-// Huffman coding
-// * is simplified from mini encoding to make conversion easier.
-//
-// 1 box on the board (other than NO_PIECE) = 2 to 6 bits (+ 1-bit flag + 1-bit forward and backward)
-// 1 piece of hand piece = 1-5bit (+ 1-bit flag + 1bit ahead and behind)
-//
-// empty xxxxx0 + 0 (none)
-// step xxxx01 + 2 xxxx0 + 2
-// incense xx0011 + 2 xx001 + 2
-// Katsura xx1011 + 2 xx101 + 2
-// silver xx0111 + 2 xx011 + 2
-// Gold x01111 + 1 x0111 + 1 // Gold is valid and has no flags.
-// corner 011111 + 2 01111 + 2
-// Fly 111111 + 2 11111 + 2
-//
-// Assuming all pieces are on the board,
-// Sky 81-40 pieces = 41 boxes = 41bit
-// Walk 4bit*18 pieces = 72bit
-// Incense 6bit*4 pieces = 24bit
-// Katsura 6bit*4 pieces = 24bit
-// Silver 6bit*4 pieces = 24bit
-// Gold 6bit* 4 pieces = 24bit
-// corner 8bit* 2 pieces = 16bit
-// Fly 8bit* 2 pieces = 16bit
-// -------
-// 241bit + 1bit (turn) + 7bit × 2 (King's position after) = 256bit
-//
-// When the piece on the board moves to the hand piece, the piece on the board becomes empty, so the box on the board can be expressed with 1 bit,
-// Since the hand piece can be expressed by 1 bit less than the piece on the board, the total number of bits does not change in the end.
-// Therefore, in this expression, any aspect can be expressed by this bit number.
-// It is a hand piece and no flag is required, but if you include this, the bit number of the piece on the board will be -1
-// Since the total number of bits can be fixed, we will include this as well.
-
-// Huffman Encoding
-//
-// Empty  xxxxxxx0
-// Pawn   xxxxx001 + 1 bit (Side to move)
-// Knight xxxxx011 + 1 bit (Side to move)
-// Bishop xxxxx101 + 1 bit (Side to move)
-// Rook   xxxxx111 + 1 bit (Side to move)
-
-struct HuffmanedPiece
-{
-  int code; // how it will be coded
-  int bits; // How many bits do you have
-};
-
-HuffmanedPiece huffman_table[] =
-{
-  {0b0000,1}, // NO_PIECE
-  {0b0001,4}, // PAWN
-  {0b0011,4}, // KNIGHT
-  {0b0101,4}, // BISHOP
-  {0b0111,4}, // ROOK
-  {0b1001,4}, // QUEEN
-};
-
-// Class for compressing/decompressing sfen
-// sfen can be packed to 256bit (32bytes) by Huffman coding.
-// This is proven by mini. The above is Huffman coding.
-//
-// Internal format = 1-bit turn + 7-bit king position *2 + piece on board (Huffman coding) + hand piece (Huffman coding)
-// Side to move (White = 0, Black = 1) (1bit)
-// White King Position (6 bits)
-// Black King Position (6 bits)
-// Huffman Encoding of the board
-// Castling availability (1 bit x 4)
-// En passant square (1 or 1 + 6 bits)
-// Rule 50 (6 bits)
-// Game play (8 bits)
-//
-// TODO(someone): Rename SFEN to FEN.
-//
-struct SfenPacker
-{
   // Pack sfen and store in data[32].
-  void pack(const Position& pos)
+  void SfenPacker::pack(const Position& pos)
   {
-// cout << pos;
+  // cout << pos;
 
     memset(data, 0, 32 /* 256bit */);
     stream.set_data(data);
@@ -202,17 +219,8 @@ struct SfenPacker
     assert(stream.get_cursor() <= 256);
   }
 
-  // sfen packed by pack() (256bit = 32bytes)
-  // Or sfen to decode with unpack()
-  uint8_t *data; // uint8_t[32];
-
-//private:
-  // Position::set_from_packed_sfen(uint8_t data[32]) I want to use these functions, so the line is bad, but I want to keep it public.
-
-  BitStream stream;
-
   // Output the board pieces to stream.
-  void write_board_piece_to_stream(Piece pc)
+  void SfenPacker::write_board_piece_to_stream(Piece pc)
   {
     // piece type
     PieceType pr = type_of(pc);
@@ -227,7 +235,7 @@ struct SfenPacker
   }
 
   // Read one board piece from stream
-  Piece read_board_piece_from_stream()
+  Piece SfenPacker::read_board_piece_from_stream()
   {
     PieceType pr = NO_PIECE_TYPE;
     int code = 0, bits = 0;
@@ -252,181 +260,148 @@ struct SfenPacker
 
     return make_piece(c, pr);
   }
-};
 
-
-// -----------------------------------
-// Add to Position class
-// -----------------------------------
-
-// Add a function that directly unpacks for speed. It's pretty tough.
-// Write it by combining packer::unpack() and Position::set().
-// If there is a problem with the passed phase and there is an error, non-zero is returned.
-int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thread* th, bool mirror)
-{
-	SfenPacker packer;
-	auto& stream = packer.stream;
-
-  // TODO: separate streams for writing and reading. Here we actually have to
-  // const_cast which is not safe in the long run.
-	stream.set_data(const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(&sfen)));
-
-	std::memset(this, 0, sizeof(Position));
-	std::memset(si, 0, sizeof(StateInfo));
-  std::fill_n(&pieceList[0][0], sizeof(pieceList) / sizeof(Square), SQ_NONE);
-  st = si;
-
-	// Active color
-	sideToMove = (Color)stream.read_one_bit();
-
-  pieceList[W_KING][0] = SQUARE_NB;
-  pieceList[B_KING][0] = SQUARE_NB;
-
-	// First the position of the ball
-	if (mirror)
-	{
-		for (auto c : Colors)
-			board[flip_file((Square)stream.read_n_bit(6))] = make_piece(c, KING);
-	}
-	else
-	{
-		for (auto c : Colors)
-			board[stream.read_n_bit(6)] = make_piece(c, KING);
-	}
-
-  // Piece placement
-  for (Rank r = RANK_8; r >= RANK_1; --r)
+  int set_from_packed_sfen(Position& pos, const PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror)
   {
-    for (File f = FILE_A; f <= FILE_H; ++f)
+    SfenPacker packer;
+    auto& stream = packer.stream;
+
+    // TODO: separate streams for writing and reading. Here we actually have to
+    // const_cast which is not safe in the long run.
+    stream.set_data(const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(&sfen)));
+
+    std::memset(&pos, 0, sizeof(Position));
+    std::memset(si, 0, sizeof(StateInfo));
+    std::fill_n(&pos.pieceList[0][0], sizeof(pos.pieceList) / sizeof(Square), SQ_NONE);
+    pos.st = si;
+
+    // Active color
+    pos.sideToMove = (Color)stream.read_one_bit();
+
+    pos.pieceList[W_KING][0] = SQUARE_NB;
+    pos.pieceList[B_KING][0] = SQUARE_NB;
+
+    // First the position of the ball
+    if (mirror)
     {
-      auto sq = make_square(f, r);
+      for (auto c : Colors)
+        pos.board[flip_file((Square)stream.read_n_bit(6))] = make_piece(c, KING);
+    }
+    else
+    {
+      for (auto c : Colors)
+        pos.board[stream.read_n_bit(6)] = make_piece(c, KING);
+    }
+
+    // Piece placement
+    for (Rank r = RANK_8; r >= RANK_1; --r)
+    {
+      for (File f = FILE_A; f <= FILE_H; ++f)
+      {
+        auto sq = make_square(f, r);
+        if (mirror) {
+          sq = flip_file(sq);
+        }
+
+        // it seems there are already balls
+        Piece pc;
+        if (type_of(pos.board[sq]) != KING)
+        {
+          assert(pos.board[sq] == NO_PIECE);
+          pc = packer.read_board_piece_from_stream();
+        }
+        else
+        {
+          pc = pos.board[sq];
+          // put_piece() will catch ASSERT unless you remove it all.
+          pos.board[sq] = NO_PIECE;
+        }
+
+        // There may be no pieces, so skip in that case.
+        if (pc == NO_PIECE)
+          continue;
+
+        pos.put_piece(Piece(pc), sq);
+
+        if (stream.get_cursor()> 256)
+          return 1;
+
+        //assert(stream.get_cursor() <= 256);
+      }
+    }
+
+    // Castling availability.
+    // TODO(someone): Support chess960.
+    pos.st->castlingRights = 0;
+    if (stream.read_one_bit()) {
+      Square rsq;
+      for (rsq = relative_square(WHITE, SQ_H1); pos.piece_on(rsq) != W_ROOK; --rsq) {}
+      pos.set_castling_right(WHITE, rsq);
+    }
+    if (stream.read_one_bit()) {
+      Square rsq;
+      for (rsq = relative_square(WHITE, SQ_A1); pos.piece_on(rsq) != W_ROOK; ++rsq) {}
+      pos.set_castling_right(WHITE, rsq);
+    }
+    if (stream.read_one_bit()) {
+      Square rsq;
+      for (rsq = relative_square(BLACK, SQ_H1); pos.piece_on(rsq) != B_ROOK; --rsq) {}
+      pos.set_castling_right(BLACK, rsq);
+    }
+    if (stream.read_one_bit()) {
+      Square rsq;
+      for (rsq = relative_square(BLACK, SQ_A1); pos.piece_on(rsq) != B_ROOK; ++rsq) {}
+      pos.set_castling_right(BLACK, rsq);
+    }
+
+    // En passant square. Ignore if no pawn capture is possible
+    if (stream.read_one_bit()) {
+      Square ep_square = static_cast<Square>(stream.read_n_bit(6));
       if (mirror) {
-        sq = flip_file(sq);
+        ep_square = flip_file(ep_square);
       }
+      pos.st->epSquare = ep_square;
 
-      // it seems there are already balls
-      Piece pc;
-      if (type_of(board[sq]) != KING)
-      {
-        assert(board[sq] == NO_PIECE);
-        pc = packer.read_board_piece_from_stream();
-      }
-      else
-      {
-        pc = board[sq];
-        board[sq] = NO_PIECE; // put_piece() will catch ASSERT unless you remove it all.
-      }
-
-      // There may be no pieces, so skip in that case.
-      if (pc == NO_PIECE)
-        continue;
-
-      put_piece(Piece(pc), sq);
-
-      //cout << sq << ' ' << board[sq] << ' ' << stream.get_cursor() << endl;
-
-      if (stream.get_cursor()> 256)
-        return 1;
-      //assert(stream.get_cursor() <= 256);
-
+      if (!(pos.attackers_to(pos.st->epSquare) & pos.pieces(pos.sideToMove, PAWN))
+        || !(pos.pieces(~pos.sideToMove, PAWN) & (pos.st->epSquare + pawn_push(~pos.sideToMove))))
+        pos.st->epSquare = SQ_NONE;
     }
-  }
-
-  // Castling availability.
-  // TODO(someone): Support chess960.
-  st->castlingRights = 0;
-  if (stream.read_one_bit()) {
-    Square rsq;
-    for (rsq = relative_square(WHITE, SQ_H1); piece_on(rsq) != W_ROOK; --rsq) {}
-    set_castling_right(WHITE, rsq);
-  }
-  if (stream.read_one_bit()) {
-    Square rsq;
-    for (rsq = relative_square(WHITE, SQ_A1); piece_on(rsq) != W_ROOK; ++rsq) {}
-    set_castling_right(WHITE, rsq);
-  }
-  if (stream.read_one_bit()) {
-    Square rsq;
-    for (rsq = relative_square(BLACK, SQ_H1); piece_on(rsq) != B_ROOK; --rsq) {}
-    set_castling_right(BLACK, rsq);
-  }
-  if (stream.read_one_bit()) {
-    Square rsq;
-    for (rsq = relative_square(BLACK, SQ_A1); piece_on(rsq) != B_ROOK; ++rsq) {}
-    set_castling_right(BLACK, rsq);
-  }
-
-  // En passant square. Ignore if no pawn capture is possible
-  if (stream.read_one_bit()) {
-    Square ep_square = static_cast<Square>(stream.read_n_bit(6));
-    if (mirror) {
-      ep_square = flip_file(ep_square);
+    else {
+      pos.st->epSquare = SQ_NONE;
     }
-    st->epSquare = ep_square;
 
-    if (!(attackers_to(st->epSquare) & pieces(sideToMove, PAWN))
-      || !(pieces(~sideToMove, PAWN) & (st->epSquare + pawn_push(~sideToMove))))
-      st->epSquare = SQ_NONE;
-  }
-  else {
-    st->epSquare = SQ_NONE;
+    // Halfmove clock
+    pos.st->rule50 = static_cast<Square>(stream.read_n_bit(6));
+
+    // Fullmove number
+    pos.gamePly = static_cast<Square>(stream.read_n_bit(8));
+
+    // Convert from fullmove starting from 1 to gamePly starting from 0,
+    // handle also common incorrect FEN with fullmove = 0.
+    pos.gamePly = std::max(2 * (pos.gamePly - 1), 0) + (pos.sideToMove == BLACK);
+
+    assert(stream.get_cursor() <= 256);
+
+    pos.chess960 = false;
+    pos.thisThread = th;
+    pos.set_state(pos.st);
+
+    assert(pos_is_ok());
+
+    return 0;
   }
 
-  // Halfmove clock
-  st->rule50 = static_cast<Square>(stream.read_n_bit(6));
+  PackedSfen sfen_pack(Position& pos)
+  {
+    PackedSfen sfen;
 
-  // Fullmove number
-  gamePly = static_cast<Square>(stream.read_n_bit(8));
-  // Convert from fullmove starting from 1 to gamePly starting from 0,
-  // handle also common incorrect FEN with fullmove = 0.
-  gamePly = std::max(2 * (gamePly - 1), 0) + (sideToMove == BLACK);
+    SfenPacker sp;
+    sp.data = (uint8_t*)&sfen;
+    sp.pack(pos);
 
-  assert(stream.get_cursor() <= 256);
-
-  chess960 = false;
-  thisThread = th;
-set_state(st);
-
-  //std::cout << *this << std::endl;
-
-  assert(pos_is_ok());
-
-	return 0;
+    return sfen;
+  }
 }
 
-// Give the board, hand piece, and turn, and return the sfen.
-//std::string Position::sfen_from_rawdata(Piece board[81], Hand hands[2], Color turn, int gamePly_)
-//{
-// // Copy it to an internal structure and call sfen() if the conversion process depends only on it
-// // Maybe it will be converted normally...
-//  Position pos;
-//
-//  memcpy(pos.board, board, sizeof(Piece) * 81);
-//  memcpy(pos.hand, hands, sizeof(Hand) * 2);
-//  pos.sideToMove = turn;
-//  pos.gamePly = gamePly_;
-//
-//  return pos.sfen();
-//
-// // Implementation of ↑ is beautiful, but slow.
-// // This is a bottleneck when learning a large amount of game records, so write a function to unpack directly.
-//}
-
-// Get the packed sfen. Returns to the buffer specified in the argument.
-void Position::sfen_pack(PackedSfen& sfen)
-{
-  SfenPacker sp;
-  sp.data = (uint8_t*)&sfen;
-  sp.pack(*this);
-}
-
-//// Unpack the packed sfen. Returns an sfen string.
-//std::string Position::sfen_unpack(const PackedSfen& sfen)
-//{
-// SfenPacker sp;
-// sp.data = (uint8_t*)&sfen;
-// return sp.unpack();
-//}
-
 
 #endif // USE_SFEN_PACKER
diff --git a/src/extra/sfen_packer.h b/src/extra/sfen_packer.h
new file mode 100644
index 00000000..c3832db2
--- /dev/null
+++ b/src/extra/sfen_packer.h
@@ -0,0 +1,23 @@
+#ifndef _SFEN_PACKER_H_
+#define _SFEN_PACKER_H_
+
+#if defined(EVAL_LEARN)
+
+#include <cstdint>
+
+#include "../types.h"
+
+#include "../learn/packed_sfen.h"
+class Position;
+struct StateInfo;
+class Thread;
+
+namespace Learner {
+
+    int set_from_packed_sfen(Position& pos, const PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror);
+    PackedSfen sfen_pack(Position& pos);
+}
+
+#endif
+
+#endif
\ No newline at end of file
diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
index 364ad3dd..d50233eb 100644
--- a/src/learn/convert.cpp
+++ b/src/learn/convert.cpp
@@ -1,9 +1,10 @@
 #if defined(EVAL_LEARN)
 
+#include "convert.h"
+
 // evaluate header for learning
 #include "../eval/evaluate_common.h"
 
-#include "learn.h"
 #include "multi_think.h"
 #include "../uci.h"
 #include "../syzygy/tbprobe.h"
diff --git a/src/learn/convert.h b/src/learn/convert.h
new file mode 100644
index 00000000..a79820a3
--- /dev/null
+++ b/src/learn/convert.h
@@ -0,0 +1,37 @@
+#ifndef _CONVERT_H_
+#define _CONVERT_H_
+
+#include <vector>
+#include <string>
+#include <sstream>
+
+#if defined(EVAL_LEARN)
+namespace Learner {
+    void convert_bin_from_pgn_extract(
+        const std::vector<std::string>& filenames,
+        const std::string& output_file_name,
+        const bool pgn_eval_side_to_move,
+        const bool convert_no_eval_fens_as_score_zero);
+
+    void convert_bin(
+        const std::vector<std::string>& filenames,
+        const std::string& output_file_name,
+        const int ply_minimum,
+        const int ply_maximum,
+        const int interpolate_eval,
+        const int src_score_min_value,
+        const int src_score_max_value,
+        const int dest_score_min_value,
+        const int dest_score_max_value,
+        const bool check_invalid_fen,
+        const bool check_illegal_move);
+
+    void convert_plain(
+        const std::vector<std::string>& filenames,
+        const std::string& output_file_name);
+
+    void convert(std::istringstream& is);
+}
+#endif
+
+#endif
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 9088fd81..9f53e983 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -1,5 +1,8 @@
 ﻿#if defined(EVAL_LEARN)
 
+#include "gensfen.h"
+#include "packed_sfen.h"
+
 #include "../eval/evaluate_common.h"
 #include "../misc.h"
 #include "../nnue/evaluate_nnue_learner.h"
@@ -8,7 +11,6 @@
 #include "../thread.h"
 #include "../tt.h"
 #include "../uci.h"
-#include "learn.h"
 #include "multi_think.h"
 
 #include "../extra/nnue_data_binpack_format.h"
diff --git a/src/learn/gensfen.h b/src/learn/gensfen.h
new file mode 100644
index 00000000..dd0f71fb
--- /dev/null
+++ b/src/learn/gensfen.h
@@ -0,0 +1,16 @@
+#ifndef _GENSFEN_H_
+#define _GENSFEN_H_
+
+#include <sstream>
+
+#include "../position.h"
+
+#if defined(EVAL_LEARN)
+namespace Learner {
+
+    // Automatic generation of teacher position
+    void gen_sfen(Position& pos, std::istringstream& is);
+}
+#endif
+
+#endif
\ No newline at end of file
diff --git a/src/learn/learner.cpp b/src/learn/learn.cpp
similarity index 99%
rename from src/learn/learner.cpp
rename to src/learn/learn.cpp
index da093192..f4f7b409 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learn.cpp
@@ -19,6 +19,9 @@
 
 #if defined(EVAL_LEARN)
 
+#include "learn.h"
+#include "convert.h"
+
 #include "../eval/evaluate_common.h"
 #include "../misc.h"
 #include "../nnue/evaluate_nnue_learner.h"
@@ -27,7 +30,7 @@
 #include "../thread.h"
 #include "../tt.h"
 #include "../uci.h"
-#include "learn.h"
+#include "../search.h"
 #include "multi_think.h"
 
 #include "../extra/nnue_data_binpack_format.h"
diff --git a/src/learn/learn.h b/src/learn/learn.h
index b7ca18e8..b8acc2df 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -14,7 +14,7 @@
 // Even if it is a double type, there is almost no difference in the way of convergence, so fix it to float.
 
 // when using float
-typedef float LearnFloatType;
+using LearnFloatType = float;
 
 // when using double
 //typedef double LearnFloatType;
@@ -36,105 +36,47 @@ typedef float LearnFloatType;
 // ----------------------
 // Definition of struct used in Learner
 // ----------------------
+
+#include "packed_sfen.h"
+
 #include "../position.h"
 
+#include <sstream>
+
 namespace Learner
 {
-	// ----------------------
-	// Settings for learning
-	// ----------------------
+    // ----------------------
+    // Settings for learning
+    // ----------------------
 
-	// mini-batch size.
-	// Calculate the gradient by combining this number of phases.
-	// If you make it smaller, the number of update_weights() will increase and the convergence will be faster. The gradient is incorrect.
-	// If you increase it, the number of update_weights() decreases, so the convergence will be slow. The slope will come out accurately.
-	// I don't think you need to change this value in most cases.
+    // mini-batch size.
+    // Calculate the gradient by combining this number of phases.
+    // If you make it smaller, the number of update_weights() will increase and the convergence will be faster. The gradient is incorrect.
+    // If you increase it, the number of update_weights() decreases, so the convergence will be slow. The slope will come out accurately.
+    // I don't think you need to change this value in most cases.
 
-	constexpr std::size_t LEARN_MINI_BATCH_SIZE = 1000 * 1000 * 1;
+    constexpr std::size_t LEARN_MINI_BATCH_SIZE = 1000 * 1000 * 1;
 
-	// The number of phases to read from the file at one time. After reading this much, shuffle.
-	// It is better to have a certain size, but this number x 40 bytes x 3 times as much memory is consumed. 400MB*3 is consumed in the 10M phase.
-	// Must be a multiple of THREAD_BUFFER_SIZE(=10000).
+    // The number of phases to read from the file at one time. After reading this much, shuffle.
+    // It is better to have a certain size, but this number x 40 bytes x 3 times as much memory is consumed. 400MB*3 is consumed in the 10M phase.
+    // Must be a multiple of THREAD_BUFFER_SIZE(=10000).
 
-	constexpr std::size_t LEARN_SFEN_READ_SIZE = 1000 * 1000 * 10;
+    constexpr std::size_t LEARN_SFEN_READ_SIZE = 1000 * 1000 * 10;
 
-	// Saving interval of evaluation function at learning. Save each time you learn this number of phases.
-	// Needless to say, the longer the saving interval, the shorter the learning time.
-	// Folder name is incremented for each save like 0/, 1/, 2/...
-	// By default, once every 1 billion phases.
-	constexpr std::size_t LEARN_EVAL_SAVE_INTERVAL = 1000000000ULL;
+    // Saving interval of evaluation function at learning. Save each time you learn this number of phases.
+    // Needless to say, the longer the saving interval, the shorter the learning time.
+    // Folder name is incremented for each save like 0/, 1/, 2/...
+    // By default, once every 1 billion phases.
+    constexpr std::size_t LEARN_EVAL_SAVE_INTERVAL = 1000000000ULL;
 
-	// Reduce the output of rmse during learning to 1 for this number of times.
-	// rmse calculation is done in one thread, so it takes some time, so reducing the output is effective.
-	constexpr std::size_t LEARN_RMSE_OUTPUT_INTERVAL = 1;
+    // Reduce the output of rmse during learning to 1 for this number of times.
+    // rmse calculation is done in one thread, so it takes some time, so reducing the output is effective.
+    constexpr std::size_t LEARN_RMSE_OUTPUT_INTERVAL = 1;
 
-	//Structure in which PackedSfen and evaluation value are integrated
-	// If you write different contents for each option, it will be a problem when reusing the teacher game
-	// For the time being, write all the following members regardless of the options.
-	struct PackedSfenValue
-	{
-		// phase
-		PackedSfen sfen;
+    double calc_grad(Value shallow, const PackedSfenValue& psv);
 
-		// Evaluation value returned from Learner::search()
-		int16_t score;
-
-		// PV first move
-		// Used when finding the match rate with the teacher
-		uint16_t move;
-
-		// Trouble of the phase from the initial phase.
-		uint16_t gamePly;
-
-		// 1 if the player on this side ultimately wins the game. -1 if you are losing.
-		// 0 if a draw is reached.
-		// The draw is in the teacher position generation command gensfen,
-		// Only write if LEARN_GENSFEN_DRAW_RESULT is enabled.
-		int8_t game_result;
-
-		// When exchanging the file that wrote the teacher aspect with other people
-		//Because this structure size is not fixed, pad it so that it is 40 bytes in any environment.
-		uint8_t padding;
-
-		// 32 + 2 + 2 + 2 + 1 + 1 = 40bytes
-	};
-
-	// Type that returns the reading line and the evaluation value at that time
-	// Used in Learner::search(), Learner::qsearch().
-	typedef std::pair<Value, std::vector<Move> > ValueAndPV;
-
-	// Phase array: PSVector stands for packed sfen vector.
-	typedef std::vector<PackedSfenValue> PSVector;
-
-	// So far, only Yaneura King 2018 Otafuku has this stub
-	// This stub is required if EVAL_LEARN is defined.
-	extern Learner::ValueAndPV  search(Position& pos, int depth , size_t multiPV = 1 , uint64_t NodesLimit = 0);
-	extern Learner::ValueAndPV qsearch(Position& pos);
-
-	double calc_grad(Value shallow, const PackedSfenValue& psv);
-	
-	void convert_bin_from_pgn_extract(
-		const std::vector<std::string>& filenames,
-		const std::string& output_file_name,
-		const bool pgn_eval_side_to_move,
-		const bool convert_no_eval_fens_as_score_zero);
-	
-	void convert_bin(
-		const std::vector<std::string>& filenames,
-		const std::string& output_file_name,
-		const int ply_minimum,
-		const int ply_maximum,
-		const int interpolate_eval,
-		const int src_score_min_value,
-		const int src_score_max_value,
-		const int dest_score_min_value,
-		const int dest_score_max_value,
-		const bool check_invalid_fen,
-		const bool check_illegal_move);
-
-	void convert_plain(
-		const std::vector<std::string>& filenames,
-		const std::string& output_file_name);
+    // Learning from the generated game record
+    void learn(Position& pos, std::istringstream& is);
 }
 
 #endif
diff --git a/src/learn/packed_sfen.h b/src/learn/packed_sfen.h
new file mode 100644
index 00000000..101e5e34
--- /dev/null
+++ b/src/learn/packed_sfen.h
@@ -0,0 +1,49 @@
+#ifndef _PACKED_SFEN_H_
+#define _PACKED_SFEN_H_
+
+#include <vector>
+#include <cstdint>
+
+#if defined(EVAL_LEARN)
+namespace Learner {
+
+    // packed sfen
+    struct PackedSfen { std::uint8_t data[32]; };
+
+    // Structure in which PackedSfen and evaluation value are integrated
+    // If you write different contents for each option, it will be a problem when reusing the teacher game
+    // For the time being, write all the following members regardless of the options.
+    struct PackedSfenValue
+    {
+        // phase
+        PackedSfen sfen;
+
+        // Evaluation value returned from Learner::search()
+        std::int16_t score;
+
+        // PV first move
+        // Used when finding the match rate with the teacher
+        std::uint16_t move;
+
+        // Trouble of the phase from the initial phase.
+        std::uint16_t gamePly;
+
+        // 1 if the player on this side ultimately wins the game. -1 if you are losing.
+        // 0 if a draw is reached.
+        // The draw is in the teacher position generation command gensfen,
+        // Only write if LEARN_GENSFEN_DRAW_RESULT is enabled.
+        std::int8_t game_result;
+
+        // When exchanging the file that wrote the teacher aspect with other people
+        //Because this structure size is not fixed, pad it so that it is 40 bytes in any environment.
+        std::uint8_t padding;
+
+        // 32 + 2 + 2 + 2 + 1 + 1 = 40bytes
+    };
+
+    // Phase array: PSVector stands for packed sfen vector.
+    using PSVector = std::vector<PackedSfenValue>;
+}
+#endif
+
+#endif
diff --git a/src/position.cpp b/src/position.cpp
index 5ac461bc..a9fc8272 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -32,6 +32,11 @@
 #include "uci.h"
 #include "syzygy/tbprobe.h"
 
+#if defined(EVAL_LEARN)
+#include "learn/packed_sfen.h"
+#include "extra/sfen_packer.h"
+#endif
+
 using std::string;
 
 namespace Zobrist {
@@ -1346,3 +1351,39 @@ bool Position::pos_is_ok() const {
 
   return true;
 }
+
+#if defined(EVAL_LEARN)
+
+// Add a function that directly unpacks for speed. It's pretty tough.
+// Write it by combining packer::unpack() and Position::set().
+// If there is a problem with the passed phase and there is an error, non-zero is returned.
+int Position::set_from_packed_sfen(const Learner::PackedSfen& sfen , StateInfo* si, Thread* th, bool mirror)
+{
+  return Learner::set_from_packed_sfen(*this, sfen, si, th, mirror);
+}
+
+// Give the board, hand piece, and turn, and return the sfen.
+//std::string Position::sfen_from_rawdata(Piece board[81], Hand hands[2], Color turn, int gamePly_)
+//{
+// // Copy it to an internal structure and call sfen() if the conversion process depends only on it
+// // Maybe it will be converted normally...
+//  Position pos;
+//
+//  memcpy(pos.board, board, sizeof(Piece) * 81);
+//  memcpy(pos.hand, hands, sizeof(Hand) * 2);
+//  pos.sideToMove = turn;
+//  pos.gamePly = gamePly_;
+//
+//  return pos.sfen();
+//
+// // Implementation of ↑ is beautiful, but slow.
+// // This is a bottleneck when learning a large amount of game records, so write a function to unpack directly.
+//}
+
+// Get the packed sfen. Returns to the buffer specified in the argument.
+void Position::sfen_pack(Learner::PackedSfen& sfen)
+{
+  sfen = Learner::sfen_pack(*this);
+}
+
+#endif
\ No newline at end of file
diff --git a/src/position.h b/src/position.h
index e3f758e0..382748af 100644
--- a/src/position.h
+++ b/src/position.h
@@ -30,6 +30,11 @@
 
 #include "nnue/nnue_accumulator.h"
 
+#if defined(EVAL_LEARN)
+#include "learn/packed_sfen.h"
+#include "extra/sfen_packer.h"
+#endif
+
 
 /// StateInfo struct stores information needed to restore a Position object to
 /// its previous state when we retract a move. Whenever a move is made on the
@@ -75,9 +80,6 @@ typedef std::unique_ptr<std::deque<StateInfo>> StateListPtr;
 /// traversing the search tree.
 class Thread;
 
-// packed sfen
-struct PackedSfen { uint8_t data[32]; }; 
-
 class Position {
 public:
   static void init();
@@ -178,15 +180,17 @@ public:
 #if defined(EVAL_LEARN)
   // --sfenization helper
 
+  friend int Learner::set_from_packed_sfen(Position& pos, const Learner::PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror);
+
   // Get the packed sfen. Returns to the buffer specified in the argument.
   // Do not include gamePly in pack.
-  void sfen_pack(PackedSfen& sfen);
+  void sfen_pack(Learner::PackedSfen& sfen);
 
   // It is slow to go through sfen, so I made a function to set packed sfen directly.
   // Equivalent to pos.set(sfen_unpack(data),si,th);.
   // If there is a problem with the passed phase and there is an error, non-zero is returned.
   // PackedSfen does not include gamePly so it cannot be restored. If you want to set it, specify it with an argument.
-  int set_from_packed_sfen(const PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror = false);
+  int set_from_packed_sfen(const Learner::PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror = false);
 
   // Give the board, hand piece, and turn, and return the sfen.
   //static std::string sfen_from_rawdata(Piece board[81], Hand hands[2], Color turn, int gamePly);
diff --git a/src/search.h b/src/search.h
index 9d5ce279..5e092273 100644
--- a/src/search.h
+++ b/src/search.h
@@ -117,4 +117,15 @@ void clear();
 
 } // namespace Search
 
+#if defined(EVAL_LEARN)
+namespace Learner {
+
+  // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
+  using ValueAndPV = std::pair<Value, std::vector<Move>>;
+
+  ValueAndPV qsearch(Position& pos);
+  ValueAndPV search(Position& pos, int depth_, size_t multiPV = 1, uint64_t nodesLimit = 0);
+}
+#endif
+
 #endif // #ifndef SEARCH_H_INCLUDED
diff --git a/src/uci.cpp b/src/uci.cpp
index 96adf927..0a28fc1f 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -33,6 +33,10 @@
 #include "tt.h"
 #include "uci.h"
 
+#include "learn/gensfen.h"
+#include "learn/learn.h"
+#include "learn/convert.h"
+
 using namespace std;
 
 extern vector<string> setup_bench(const Position&, istream&);
@@ -40,27 +44,6 @@ extern vector<string> setup_bench(const Position&, istream&);
 // FEN string of the initial position, normal chess
 const char* StartFEN = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1";
 
-// Command to automatically generate a game record
-#if defined (EVAL_LEARN)
-namespace Learner
-{
-  // Automatic generation of teacher position
-  void gen_sfen(Position& pos, istringstream& is);
-
-  // Learning from the generated game record
-  void learn(Position& pos, istringstream& is);
-
-  void convert(istringstream& is);
-
-  // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
-  typedef std::pair<Value, std::vector<Move> > ValueAndPV;
-
-  ValueAndPV qsearch(Position& pos);
-  ValueAndPV search(Position& pos, int depth_, size_t multiPV = 1, uint64_t nodesLimit = 0);
-
-}
-#endif
-
 void test_cmd(Position& pos, istringstream& is)
 {
     // Initialize as it may be searched.

From a059fa86c4aa00e8a2ed96aacdf01684ec50a0b4 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 11 Sep 2020 12:08:26 +0200
Subject: [PATCH 096/398] Move sfen_packer to learn.

---
 src/Makefile              |   2 +-
 src/extra/sfen_packer.cpp | 407 --------------------------------------
 src/extra/sfen_packer.h   |  23 ---
 src/position.cpp          |   2 +-
 src/position.h            |   2 +-
 5 files changed, 3 insertions(+), 433 deletions(-)
 delete mode 100644 src/extra/sfen_packer.cpp
 delete mode 100644 src/extra/sfen_packer.h

diff --git a/src/Makefile b/src/Makefile
index 88d759d2..aa13603a 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -55,7 +55,7 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	nnue/features/castling_right.cpp \
 	nnue/features/enpassant.cpp \
 	nnue/nnue_test_command.cpp \
-	extra/sfen_packer.cpp \
+	learn/sfen_packer.cpp \
 	learn/learn.cpp \
 	learn/gensfen.cpp \
 	learn/convert.cpp \
diff --git a/src/extra/sfen_packer.cpp b/src/extra/sfen_packer.cpp
deleted file mode 100644
index b58ad5dd..00000000
--- a/src/extra/sfen_packer.cpp
+++ /dev/null
@@ -1,407 +0,0 @@
-﻿#if defined (EVAL_LEARN)
-
-#include "sfen_packer.h"
-
-#include "../learn/packed_sfen.h"
-
-#include "../misc.h"
-#include "../position.h"
-
-#include <sstream>
-#include <fstream>
-#include <cstring> // std::memset()
-
-using namespace std;
-
-namespace Learner {
-
-  // Class that handles bitstream
-  // useful when doing aspect encoding
-  struct BitStream
-  {
-    // Set the memory to store the data in advance.
-    // Assume that memory is cleared to 0.
-    void set_data(std::uint8_t* data_) { data = data_; reset(); }
-
-    // Get the pointer passed in set_data().
-    uint8_t* get_data() const { return data; }
-
-    // Get the cursor.
-    int get_cursor() const { return bit_cursor; }
-
-    // reset the cursor
-    void reset() { bit_cursor = 0; }
-
-    // Write 1bit to the stream.
-    // If b is non-zero, write out 1. If 0, write 0.
-    void write_one_bit(int b)
-    {
-      if (b)
-        data[bit_cursor / 8] |= 1 << (bit_cursor & 7);
-
-      ++bit_cursor;
-    }
-
-    // Get 1 bit from the stream.
-    int read_one_bit()
-    {
-      int b = (data[bit_cursor / 8] >> (bit_cursor & 7)) & 1;
-      ++bit_cursor;
-
-      return b;
-    }
-
-    // write n bits of data
-    // Data shall be written out from the lower order of d.
-    void write_n_bit(int d, int n)
-    {
-      for (int i = 0; i <n; ++i)
-        write_one_bit(d & (1 << i));
-    }
-
-    // read n bits of data
-    // Reverse conversion of write_n_bit().
-    int read_n_bit(int n)
-    {
-      int result = 0;
-      for (int i = 0; i < n; ++i)
-        result |= read_one_bit() ? (1 << i) : 0;
-
-      return result;
-    }
-
-  private:
-    // Next bit position to read/write.
-    int bit_cursor;
-
-    // data entity
-    std::uint8_t* data;
-  };
-
-  // Class for compressing/decompressing sfen
-  // sfen can be packed to 256bit (32bytes) by Huffman coding.
-  // This is proven by mini. The above is Huffman coding.
-  //
-  // Internal format = 1-bit turn + 7-bit king position *2 + piece on board (Huffman coding) + hand piece (Huffman coding)
-  // Side to move (White = 0, Black = 1) (1bit)
-  // White King Position (6 bits)
-  // Black King Position (6 bits)
-  // Huffman Encoding of the board
-  // Castling availability (1 bit x 4)
-  // En passant square (1 or 1 + 6 bits)
-  // Rule 50 (6 bits)
-  // Game play (8 bits)
-  //
-  // TODO(someone): Rename SFEN to FEN.
-  //
-  struct SfenPacker
-  {
-    void pack(const Position& pos);
-
-    // sfen packed by pack() (256bit = 32bytes)
-    // Or sfen to decode with unpack()
-    uint8_t *data; // uint8_t[32];
-
-    BitStream stream;
-
-    // Output the board pieces to stream.
-    void write_board_piece_to_stream(Piece pc);
-
-    // Read one board piece from stream
-    Piece read_board_piece_from_stream();
-  };
-
-
-  // Huffman coding
-  // * is simplified from mini encoding to make conversion easier.
-  //
-  // 1 box on the board (other than NO_PIECE) = 2 to 6 bits (+ 1-bit flag + 1-bit forward and backward)
-  // 1 piece of hand piece = 1-5bit (+ 1-bit flag + 1bit ahead and behind)
-  //
-  // empty xxxxx0 + 0 (none)
-  // step xxxx01 + 2 xxxx0 + 2
-  // incense xx0011 + 2 xx001 + 2
-  // Katsura xx1011 + 2 xx101 + 2
-  // silver xx0111 + 2 xx011 + 2
-  // Gold x01111 + 1 x0111 + 1 // Gold is valid and has no flags.
-  // corner 011111 + 2 01111 + 2
-  // Fly 111111 + 2 11111 + 2
-  //
-  // Assuming all pieces are on the board,
-  // Sky 81-40 pieces = 41 boxes = 41bit
-  // Walk 4bit*18 pieces = 72bit
-  // Incense 6bit*4 pieces = 24bit
-  // Katsura 6bit*4 pieces = 24bit
-  // Silver 6bit*4 pieces = 24bit
-  // Gold 6bit* 4 pieces = 24bit
-  // corner 8bit* 2 pieces = 16bit
-  // Fly 8bit* 2 pieces = 16bit
-  // -------
-  // 241bit + 1bit (turn) + 7bit × 2 (King's position after) = 256bit
-  //
-  // When the piece on the board moves to the hand piece, the piece on the board becomes empty, so the box on the board can be expressed with 1 bit,
-  // Since the hand piece can be expressed by 1 bit less than the piece on the board, the total number of bits does not change in the end.
-  // Therefore, in this expression, any aspect can be expressed by this bit number.
-  // It is a hand piece and no flag is required, but if you include this, the bit number of the piece on the board will be -1
-  // Since the total number of bits can be fixed, we will include this as well.
-
-  // Huffman Encoding
-  //
-  // Empty  xxxxxxx0
-  // Pawn   xxxxx001 + 1 bit (Side to move)
-  // Knight xxxxx011 + 1 bit (Side to move)
-  // Bishop xxxxx101 + 1 bit (Side to move)
-  // Rook   xxxxx111 + 1 bit (Side to move)
-
-  struct HuffmanedPiece
-  {
-    int code; // how it will be coded
-    int bits; // How many bits do you have
-  };
-
-  constexpr HuffmanedPiece huffman_table[] =
-  {
-    {0b0000,1}, // NO_PIECE
-    {0b0001,4}, // PAWN
-    {0b0011,4}, // KNIGHT
-    {0b0101,4}, // BISHOP
-    {0b0111,4}, // ROOK
-    {0b1001,4}, // QUEEN
-  };
-
-  // Pack sfen and store in data[32].
-  void SfenPacker::pack(const Position& pos)
-  {
-  // cout << pos;
-
-    memset(data, 0, 32 /* 256bit */);
-    stream.set_data(data);
-
-    // turn
-    // Side to move.
-    stream.write_one_bit((int)(pos.side_to_move()));
-
-    // 7-bit positions for leading and trailing balls
-    // White king and black king, 6 bits for each.
-    for(auto c: Colors)
-      stream.write_n_bit(pos.king_square(c), 6);
-
-    // Write the pieces on the board other than the kings.
-    for (Rank r = RANK_8; r >= RANK_1; --r)
-    {
-      for (File f = FILE_A; f <= FILE_H; ++f)
-      {
-        Piece pc = pos.piece_on(make_square(f, r));
-        if (type_of(pc) == KING)
-          continue;
-        write_board_piece_to_stream(pc);
-      }
-    }
-
-    // TODO(someone): Support chess960.
-    stream.write_one_bit(pos.can_castle(WHITE_OO));
-    stream.write_one_bit(pos.can_castle(WHITE_OOO));
-    stream.write_one_bit(pos.can_castle(BLACK_OO));
-    stream.write_one_bit(pos.can_castle(BLACK_OOO));
-
-    if (pos.ep_square() == SQ_NONE) {
-      stream.write_one_bit(0);
-    }
-    else {
-      stream.write_one_bit(1);
-      stream.write_n_bit(static_cast<int>(pos.ep_square()), 6);
-    }
-
-    stream.write_n_bit(pos.state()->rule50, 6);
-
-    stream.write_n_bit(1 + (pos.game_ply()-(pos.side_to_move() == BLACK)) / 2, 8);
-
-    assert(stream.get_cursor() <= 256);
-  }
-
-  // Output the board pieces to stream.
-  void SfenPacker::write_board_piece_to_stream(Piece pc)
-  {
-    // piece type
-    PieceType pr = type_of(pc);
-    auto c = huffman_table[pr];
-    stream.write_n_bit(c.code, c.bits);
-
-    if (pc == NO_PIECE)
-      return;
-
-    // first and second flag
-    stream.write_one_bit(color_of(pc));
-  }
-
-  // Read one board piece from stream
-  Piece SfenPacker::read_board_piece_from_stream()
-  {
-    PieceType pr = NO_PIECE_TYPE;
-    int code = 0, bits = 0;
-    while (true)
-    {
-      code |= stream.read_one_bit() << bits;
-      ++bits;
-
-      assert(bits <= 6);
-
-      for (pr = NO_PIECE_TYPE; pr <KING; ++pr)
-        if (huffman_table[pr].code == code
-          && huffman_table[pr].bits == bits)
-          goto Found;
-    }
-  Found:;
-    if (pr == NO_PIECE_TYPE)
-      return NO_PIECE;
-
-    // first and second flag
-    Color c = (Color)stream.read_one_bit();
-
-    return make_piece(c, pr);
-  }
-
-  int set_from_packed_sfen(Position& pos, const PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror)
-  {
-    SfenPacker packer;
-    auto& stream = packer.stream;
-
-    // TODO: separate streams for writing and reading. Here we actually have to
-    // const_cast which is not safe in the long run.
-    stream.set_data(const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(&sfen)));
-
-    std::memset(&pos, 0, sizeof(Position));
-    std::memset(si, 0, sizeof(StateInfo));
-    std::fill_n(&pos.pieceList[0][0], sizeof(pos.pieceList) / sizeof(Square), SQ_NONE);
-    pos.st = si;
-
-    // Active color
-    pos.sideToMove = (Color)stream.read_one_bit();
-
-    pos.pieceList[W_KING][0] = SQUARE_NB;
-    pos.pieceList[B_KING][0] = SQUARE_NB;
-
-    // First the position of the ball
-    if (mirror)
-    {
-      for (auto c : Colors)
-        pos.board[flip_file((Square)stream.read_n_bit(6))] = make_piece(c, KING);
-    }
-    else
-    {
-      for (auto c : Colors)
-        pos.board[stream.read_n_bit(6)] = make_piece(c, KING);
-    }
-
-    // Piece placement
-    for (Rank r = RANK_8; r >= RANK_1; --r)
-    {
-      for (File f = FILE_A; f <= FILE_H; ++f)
-      {
-        auto sq = make_square(f, r);
-        if (mirror) {
-          sq = flip_file(sq);
-        }
-
-        // it seems there are already balls
-        Piece pc;
-        if (type_of(pos.board[sq]) != KING)
-        {
-          assert(pos.board[sq] == NO_PIECE);
-          pc = packer.read_board_piece_from_stream();
-        }
-        else
-        {
-          pc = pos.board[sq];
-          // put_piece() will catch ASSERT unless you remove it all.
-          pos.board[sq] = NO_PIECE;
-        }
-
-        // There may be no pieces, so skip in that case.
-        if (pc == NO_PIECE)
-          continue;
-
-        pos.put_piece(Piece(pc), sq);
-
-        if (stream.get_cursor()> 256)
-          return 1;
-
-        //assert(stream.get_cursor() <= 256);
-      }
-    }
-
-    // Castling availability.
-    // TODO(someone): Support chess960.
-    pos.st->castlingRights = 0;
-    if (stream.read_one_bit()) {
-      Square rsq;
-      for (rsq = relative_square(WHITE, SQ_H1); pos.piece_on(rsq) != W_ROOK; --rsq) {}
-      pos.set_castling_right(WHITE, rsq);
-    }
-    if (stream.read_one_bit()) {
-      Square rsq;
-      for (rsq = relative_square(WHITE, SQ_A1); pos.piece_on(rsq) != W_ROOK; ++rsq) {}
-      pos.set_castling_right(WHITE, rsq);
-    }
-    if (stream.read_one_bit()) {
-      Square rsq;
-      for (rsq = relative_square(BLACK, SQ_H1); pos.piece_on(rsq) != B_ROOK; --rsq) {}
-      pos.set_castling_right(BLACK, rsq);
-    }
-    if (stream.read_one_bit()) {
-      Square rsq;
-      for (rsq = relative_square(BLACK, SQ_A1); pos.piece_on(rsq) != B_ROOK; ++rsq) {}
-      pos.set_castling_right(BLACK, rsq);
-    }
-
-    // En passant square. Ignore if no pawn capture is possible
-    if (stream.read_one_bit()) {
-      Square ep_square = static_cast<Square>(stream.read_n_bit(6));
-      if (mirror) {
-        ep_square = flip_file(ep_square);
-      }
-      pos.st->epSquare = ep_square;
-
-      if (!(pos.attackers_to(pos.st->epSquare) & pos.pieces(pos.sideToMove, PAWN))
-        || !(pos.pieces(~pos.sideToMove, PAWN) & (pos.st->epSquare + pawn_push(~pos.sideToMove))))
-        pos.st->epSquare = SQ_NONE;
-    }
-    else {
-      pos.st->epSquare = SQ_NONE;
-    }
-
-    // Halfmove clock
-    pos.st->rule50 = static_cast<Square>(stream.read_n_bit(6));
-
-    // Fullmove number
-    pos.gamePly = static_cast<Square>(stream.read_n_bit(8));
-
-    // Convert from fullmove starting from 1 to gamePly starting from 0,
-    // handle also common incorrect FEN with fullmove = 0.
-    pos.gamePly = std::max(2 * (pos.gamePly - 1), 0) + (pos.sideToMove == BLACK);
-
-    assert(stream.get_cursor() <= 256);
-
-    pos.chess960 = false;
-    pos.thisThread = th;
-    pos.set_state(pos.st);
-
-    assert(pos_is_ok());
-
-    return 0;
-  }
-
-  PackedSfen sfen_pack(Position& pos)
-  {
-    PackedSfen sfen;
-
-    SfenPacker sp;
-    sp.data = (uint8_t*)&sfen;
-    sp.pack(pos);
-
-    return sfen;
-  }
-}
-
-
-#endif // USE_SFEN_PACKER
diff --git a/src/extra/sfen_packer.h b/src/extra/sfen_packer.h
deleted file mode 100644
index c3832db2..00000000
--- a/src/extra/sfen_packer.h
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef _SFEN_PACKER_H_
-#define _SFEN_PACKER_H_
-
-#if defined(EVAL_LEARN)
-
-#include <cstdint>
-
-#include "../types.h"
-
-#include "../learn/packed_sfen.h"
-class Position;
-struct StateInfo;
-class Thread;
-
-namespace Learner {
-
-    int set_from_packed_sfen(Position& pos, const PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror);
-    PackedSfen sfen_pack(Position& pos);
-}
-
-#endif
-
-#endif
\ No newline at end of file
diff --git a/src/position.cpp b/src/position.cpp
index a9fc8272..9465afbc 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -34,7 +34,7 @@
 
 #if defined(EVAL_LEARN)
 #include "learn/packed_sfen.h"
-#include "extra/sfen_packer.h"
+#include "learn/sfen_packer.h"
 #endif
 
 using std::string;
diff --git a/src/position.h b/src/position.h
index 382748af..10cf45ba 100644
--- a/src/position.h
+++ b/src/position.h
@@ -32,7 +32,7 @@
 
 #if defined(EVAL_LEARN)
 #include "learn/packed_sfen.h"
-#include "extra/sfen_packer.h"
+#include "learn/sfen_packer.h"
 #endif
 
 
From 96fa8fa8dce77a840190d7ec4bf61adc6ffd5cc7 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 11 Sep 2020 12:21:41 +0200
Subject: [PATCH 097/398] Add missing files.

---
 src/learn/sfen_packer.cpp | 407 ++++++++++++++++++++++++++++++++++++++
 src/learn/sfen_packer.h   |  24 +++
 2 files changed, 431 insertions(+)
 create mode 100644 src/learn/sfen_packer.cpp
 create mode 100644 src/learn/sfen_packer.h

diff --git a/src/learn/sfen_packer.cpp b/src/learn/sfen_packer.cpp
new file mode 100644
index 00000000..236c875f
--- /dev/null
+++ b/src/learn/sfen_packer.cpp
@@ -0,0 +1,407 @@
+﻿#if defined (EVAL_LEARN)
+
+#include "sfen_packer.h"
+
+#include "packed_sfen.h"
+
+#include "misc.h"
+#include "position.h"
+
+#include <sstream>
+#include <fstream>
+#include <cstring> // std::memset()
+
+using namespace std;
+
+namespace Learner {
+
+  // Class that handles bitstream
+  // useful when doing aspect encoding
+  struct BitStream
+  {
+    // Set the memory to store the data in advance.
+    // Assume that memory is cleared to 0.
+    void set_data(std::uint8_t* data_) { data = data_; reset(); }
+
+    // Get the pointer passed in set_data().
+    uint8_t* get_data() const { return data; }
+
+    // Get the cursor.
+    int get_cursor() const { return bit_cursor; }
+
+    // reset the cursor
+    void reset() { bit_cursor = 0; }
+
+    // Write 1bit to the stream.
+    // If b is non-zero, write out 1. If 0, write 0.
+    void write_one_bit(int b)
+    {
+      if (b)
+        data[bit_cursor / 8] |= 1 << (bit_cursor & 7);
+
+      ++bit_cursor;
+    }
+
+    // Get 1 bit from the stream.
+    int read_one_bit()
+    {
+      int b = (data[bit_cursor / 8] >> (bit_cursor & 7)) & 1;
+      ++bit_cursor;
+
+      return b;
+    }
+
+    // write n bits of data
+    // Data shall be written out from the lower order of d.
+    void write_n_bit(int d, int n)
+    {
+      for (int i = 0; i <n; ++i)
+        write_one_bit(d & (1 << i));
+    }
+
+    // read n bits of data
+    // Reverse conversion of write_n_bit().
+    int read_n_bit(int n)
+    {
+      int result = 0;
+      for (int i = 0; i < n; ++i)
+        result |= read_one_bit() ? (1 << i) : 0;
+
+      return result;
+    }
+
+  private:
+    // Next bit position to read/write.
+    int bit_cursor;
+
+    // data entity
+    std::uint8_t* data;
+  };
+
+  // Class for compressing/decompressing sfen
+  // sfen can be packed to 256bit (32bytes) by Huffman coding.
+  // This is proven by mini. The above is Huffman coding.
+  //
+  // Internal format = 1-bit turn + 7-bit king position *2 + piece on board (Huffman coding) + hand piece (Huffman coding)
+  // Side to move (White = 0, Black = 1) (1bit)
+  // White King Position (6 bits)
+  // Black King Position (6 bits)
+  // Huffman Encoding of the board
+  // Castling availability (1 bit x 4)
+  // En passant square (1 or 1 + 6 bits)
+  // Rule 50 (6 bits)
+  // Game play (8 bits)
+  //
+  // TODO(someone): Rename SFEN to FEN.
+  //
+  struct SfenPacker
+  {
+    void pack(const Position& pos);
+
+    // sfen packed by pack() (256bit = 32bytes)
+    // Or sfen to decode with unpack()
+    uint8_t *data; // uint8_t[32];
+
+    BitStream stream;
+
+    // Output the board pieces to stream.
+    void write_board_piece_to_stream(Piece pc);
+
+    // Read one board piece from stream
+    Piece read_board_piece_from_stream();
+  };
+
+
+  // Huffman coding
+  // * is simplified from mini encoding to make conversion easier.
+  //
+  // 1 box on the board (other than NO_PIECE) = 2 to 6 bits (+ 1-bit flag + 1-bit forward and backward)
+  // 1 piece of hand piece = 1-5bit (+ 1-bit flag + 1bit ahead and behind)
+  //
+  // empty xxxxx0 + 0 (none)
+  // step xxxx01 + 2 xxxx0 + 2
+  // incense xx0011 + 2 xx001 + 2
+  // Katsura xx1011 + 2 xx101 + 2
+  // silver xx0111 + 2 xx011 + 2
+  // Gold x01111 + 1 x0111 + 1 // Gold is valid and has no flags.
+  // corner 011111 + 2 01111 + 2
+  // Fly 111111 + 2 11111 + 2
+  //
+  // Assuming all pieces are on the board,
+  // Sky 81-40 pieces = 41 boxes = 41bit
+  // Walk 4bit*18 pieces = 72bit
+  // Incense 6bit*4 pieces = 24bit
+  // Katsura 6bit*4 pieces = 24bit
+  // Silver 6bit*4 pieces = 24bit
+  // Gold 6bit* 4 pieces = 24bit
+  // corner 8bit* 2 pieces = 16bit
+  // Fly 8bit* 2 pieces = 16bit
+  // -------
+  // 241bit + 1bit (turn) + 7bit × 2 (King's position after) = 256bit
+  //
+  // When the piece on the board moves to the hand piece, the piece on the board becomes empty, so the box on the board can be expressed with 1 bit,
+  // Since the hand piece can be expressed by 1 bit less than the piece on the board, the total number of bits does not change in the end.
+  // Therefore, in this expression, any aspect can be expressed by this bit number.
+  // It is a hand piece and no flag is required, but if you include this, the bit number of the piece on the board will be -1
+  // Since the total number of bits can be fixed, we will include this as well.
+
+  // Huffman Encoding
+  //
+  // Empty  xxxxxxx0
+  // Pawn   xxxxx001 + 1 bit (Side to move)
+  // Knight xxxxx011 + 1 bit (Side to move)
+  // Bishop xxxxx101 + 1 bit (Side to move)
+  // Rook   xxxxx111 + 1 bit (Side to move)
+
+  struct HuffmanedPiece
+  {
+    int code; // how it will be coded
+    int bits; // How many bits do you have
+  };
+
+  constexpr HuffmanedPiece huffman_table[] =
+  {
+    {0b0000,1}, // NO_PIECE
+    {0b0001,4}, // PAWN
+    {0b0011,4}, // KNIGHT
+    {0b0101,4}, // BISHOP
+    {0b0111,4}, // ROOK
+    {0b1001,4}, // QUEEN
+  };
+
+  // Pack sfen and store in data[32].
+  void SfenPacker::pack(const Position& pos)
+  {
+  // cout << pos;
+
+    memset(data, 0, 32 /* 256bit */);
+    stream.set_data(data);
+
+    // turn
+    // Side to move.
+    stream.write_one_bit((int)(pos.side_to_move()));
+
+    // 7-bit positions for leading and trailing balls
+    // White king and black king, 6 bits for each.
+    for(auto c: Colors)
+      stream.write_n_bit(pos.king_square(c), 6);
+
+    // Write the pieces on the board other than the kings.
+    for (Rank r = RANK_8; r >= RANK_1; --r)
+    {
+      for (File f = FILE_A; f <= FILE_H; ++f)
+      {
+        Piece pc = pos.piece_on(make_square(f, r));
+        if (type_of(pc) == KING)
+          continue;
+        write_board_piece_to_stream(pc);
+      }
+    }
+
+    // TODO(someone): Support chess960.
+    stream.write_one_bit(pos.can_castle(WHITE_OO));
+    stream.write_one_bit(pos.can_castle(WHITE_OOO));
+    stream.write_one_bit(pos.can_castle(BLACK_OO));
+    stream.write_one_bit(pos.can_castle(BLACK_OOO));
+
+    if (pos.ep_square() == SQ_NONE) {
+      stream.write_one_bit(0);
+    }
+    else {
+      stream.write_one_bit(1);
+      stream.write_n_bit(static_cast<int>(pos.ep_square()), 6);
+    }
+
+    stream.write_n_bit(pos.state()->rule50, 6);
+
+    stream.write_n_bit(1 + (pos.game_ply()-(pos.side_to_move() == BLACK)) / 2, 8);
+
+    assert(stream.get_cursor() <= 256);
+  }
+
+  // Output the board pieces to stream.
+  void SfenPacker::write_board_piece_to_stream(Piece pc)
+  {
+    // piece type
+    PieceType pr = type_of(pc);
+    auto c = huffman_table[pr];
+    stream.write_n_bit(c.code, c.bits);
+
+    if (pc == NO_PIECE)
+      return;
+
+    // first and second flag
+    stream.write_one_bit(color_of(pc));
+  }
+
+  // Read one board piece from stream
+  Piece SfenPacker::read_board_piece_from_stream()
+  {
+    PieceType pr = NO_PIECE_TYPE;
+    int code = 0, bits = 0;
+    while (true)
+    {
+      code |= stream.read_one_bit() << bits;
+      ++bits;
+
+      assert(bits <= 6);
+
+      for (pr = NO_PIECE_TYPE; pr <KING; ++pr)
+        if (huffman_table[pr].code == code
+          && huffman_table[pr].bits == bits)
+          goto Found;
+    }
+  Found:;
+    if (pr == NO_PIECE_TYPE)
+      return NO_PIECE;
+
+    // first and second flag
+    Color c = (Color)stream.read_one_bit();
+
+    return make_piece(c, pr);
+  }
+
+  int set_from_packed_sfen(Position& pos, const PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror)
+  {
+    SfenPacker packer;
+    auto& stream = packer.stream;
+
+    // TODO: separate streams for writing and reading. Here we actually have to
+    // const_cast which is not safe in the long run.
+    stream.set_data(const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(&sfen)));
+
+    pos.clear();
+    std::memset(si, 0, sizeof(StateInfo));
+    std::fill_n(&pos.pieceList[0][0], sizeof(pos.pieceList) / sizeof(Square), SQ_NONE);
+    pos.st = si;
+
+    // Active color
+    pos.sideToMove = (Color)stream.read_one_bit();
+
+    pos.pieceList[W_KING][0] = SQUARE_NB;
+    pos.pieceList[B_KING][0] = SQUARE_NB;
+
+    // First the position of the ball
+    if (mirror)
+    {
+      for (auto c : Colors)
+        pos.board[flip_file((Square)stream.read_n_bit(6))] = make_piece(c, KING);
+    }
+    else
+    {
+      for (auto c : Colors)
+        pos.board[stream.read_n_bit(6)] = make_piece(c, KING);
+    }
+
+    // Piece placement
+    for (Rank r = RANK_8; r >= RANK_1; --r)
+    {
+      for (File f = FILE_A; f <= FILE_H; ++f)
+      {
+        auto sq = make_square(f, r);
+        if (mirror) {
+          sq = flip_file(sq);
+        }
+
+        // it seems there are already balls
+        Piece pc;
+        if (type_of(pos.board[sq]) != KING)
+        {
+          assert(pos.board[sq] == NO_PIECE);
+          pc = packer.read_board_piece_from_stream();
+        }
+        else
+        {
+          pc = pos.board[sq];
+          // put_piece() will catch ASSERT unless you remove it all.
+          pos.board[sq] = NO_PIECE;
+        }
+
+        // There may be no pieces, so skip in that case.
+        if (pc == NO_PIECE)
+          continue;
+
+        pos.put_piece(Piece(pc), sq);
+
+        if (stream.get_cursor()> 256)
+          return 1;
+
+        //assert(stream.get_cursor() <= 256);
+      }
+    }
+
+    // Castling availability.
+    // TODO(someone): Support chess960.
+    pos.st->castlingRights = 0;
+    if (stream.read_one_bit()) {
+      Square rsq;
+      for (rsq = relative_square(WHITE, SQ_H1); pos.piece_on(rsq) != W_ROOK; --rsq) {}
+      pos.set_castling_right(WHITE, rsq);
+    }
+    if (stream.read_one_bit()) {
+      Square rsq;
+      for (rsq = relative_square(WHITE, SQ_A1); pos.piece_on(rsq) != W_ROOK; ++rsq) {}
+      pos.set_castling_right(WHITE, rsq);
+    }
+    if (stream.read_one_bit()) {
+      Square rsq;
+      for (rsq = relative_square(BLACK, SQ_H1); pos.piece_on(rsq) != B_ROOK; --rsq) {}
+      pos.set_castling_right(BLACK, rsq);
+    }
+    if (stream.read_one_bit()) {
+      Square rsq;
+      for (rsq = relative_square(BLACK, SQ_A1); pos.piece_on(rsq) != B_ROOK; ++rsq) {}
+      pos.set_castling_right(BLACK, rsq);
+    }
+
+    // En passant square. Ignore if no pawn capture is possible
+    if (stream.read_one_bit()) {
+      Square ep_square = static_cast<Square>(stream.read_n_bit(6));
+      if (mirror) {
+        ep_square = flip_file(ep_square);
+      }
+      pos.st->epSquare = ep_square;
+
+      if (!(pos.attackers_to(pos.st->epSquare) & pos.pieces(pos.sideToMove, PAWN))
+        || !(pos.pieces(~pos.sideToMove, PAWN) & (pos.st->epSquare + pawn_push(~pos.sideToMove))))
+        pos.st->epSquare = SQ_NONE;
+    }
+    else {
+      pos.st->epSquare = SQ_NONE;
+    }
+
+    // Halfmove clock
+    pos.st->rule50 = static_cast<Square>(stream.read_n_bit(6));
+
+    // Fullmove number
+    pos.gamePly = static_cast<Square>(stream.read_n_bit(8));
+
+    // Convert from fullmove starting from 1 to gamePly starting from 0,
+    // handle also common incorrect FEN with fullmove = 0.
+    pos.gamePly = std::max(2 * (pos.gamePly - 1), 0) + (pos.sideToMove == BLACK);
+
+    assert(stream.get_cursor() <= 256);
+
+    pos.chess960 = false;
+    pos.thisThread = th;
+    pos.set_state(pos.st);
+
+    assert(pos_is_ok());
+
+    return 0;
+  }
+
+  PackedSfen sfen_pack(Position& pos)
+  {
+    PackedSfen sfen;
+
+    SfenPacker sp;
+    sp.data = (uint8_t*)&sfen;
+    sp.pack(pos);
+
+    return sfen;
+  }
+}
+
+
+#endif // USE_SFEN_PACKER
diff --git a/src/learn/sfen_packer.h b/src/learn/sfen_packer.h
new file mode 100644
index 00000000..af900902
--- /dev/null
+++ b/src/learn/sfen_packer.h
@@ -0,0 +1,24 @@
+#ifndef _SFEN_PACKER_H_
+#define _SFEN_PACKER_H_
+
+#if defined(EVAL_LEARN)
+
+#include "types.h"
+
+#include "learn/packed_sfen.h"
+
+#include <cstdint>
+
+class Position;
+struct StateInfo;
+class Thread;
+
+namespace Learner {
+
+    int set_from_packed_sfen(Position& pos, const PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror);
+    PackedSfen sfen_pack(Position& pos);
+}
+
+#endif
+
+#endif
\ No newline at end of file

From 3c87d4fa9b8ec9951c69141ebf426ea4495a8bbd Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 11 Sep 2020 12:22:03 +0200
Subject: [PATCH 098/398] "Fix" warning when memsetting Position

---
 src/position.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/position.h b/src/position.h
index 10cf45ba..aa2d34e7 100644
--- a/src/position.h
+++ b/src/position.h
@@ -192,6 +192,8 @@ public:
   // PackedSfen does not include gamePly so it cannot be restored. If you want to set it, specify it with an argument.
   int set_from_packed_sfen(const Learner::PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror = false);
 
+  void clear() { std::memset(this, 0, sizeof(Position)); }
+
   // Give the board, hand piece, and turn, and return the sfen.
   //static std::string sfen_from_rawdata(Piece board[81], Hand hands[2], Color turn, int gamePly);
 

From 98f24570abe9605df21f786921a41f34fdfaf2fc Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 11 Sep 2020 12:23:29 +0200
Subject: [PATCH 099/398] Add src to include paths, remove non-standard ".." in
 includes in learn directory.

---
 src/Makefile                 |  2 +-
 src/learn/convert.cpp        | 22 ++++++++++++----------
 src/learn/gensfen.cpp        | 24 ++++++++++++++----------
 src/learn/gensfen.h          |  4 ++--
 src/learn/half_float.h       |  2 +-
 src/learn/learn.cpp          | 26 +++++++++++++++-----------
 src/learn/learn.h            |  5 ++---
 src/learn/learning_tools.cpp |  2 +-
 src/learn/learning_tools.h   |  6 +++---
 src/learn/multi_think.cpp    | 14 +++++++-------
 src/learn/multi_think.h      | 11 ++++++-----
 11 files changed, 64 insertions(+), 54 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index aa13603a..ac0b7338 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -321,7 +321,7 @@ endif
 ### ==========================================================================
 
 ### 3.1 Selecting compiler (default = gcc)
-CXXFLAGS += -g -Wall -Wcast-qual -fno-exceptions -std=c++17 $(EXTRACXXFLAGS) $(LEARNCXXFLAGS)
+CXXFLAGS += -g -Wall -Wcast-qual -fno-exceptions -std=c++17 -I. $(EXTRACXXFLAGS) $(LEARNCXXFLAGS)
 DEPENDFLAGS += -std=c++17
 LDFLAGS += $(EXTRALDFLAGS) $(LEARNLDFLAGS)
 
diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
index d50233eb..e9dcb10b 100644
--- a/src/learn/convert.cpp
+++ b/src/learn/convert.cpp
@@ -2,18 +2,20 @@
 
 #include "convert.h"
 
-// evaluate header for learning
-#include "../eval/evaluate_common.h"
-
 #include "multi_think.h"
-#include "../uci.h"
-#include "../syzygy/tbprobe.h"
-#include "../misc.h"
-#include "../thread.h"
-#include "../position.h"
-#include "../tt.h"
 
-#include "../extra/nnue_data_binpack_format.h"
+#include "uci.h"
+#include "misc.h"
+#include "thread.h"
+#include "position.h"
+#include "tt.h"
+
+// evaluate header for learning
+#include "eval/evaluate_common.h"
+
+#include "extra/nnue_data_binpack_format.h"
+
+#include "syzygy/tbprobe.h"
 
 #include <sstream>
 #include <fstream>
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 9f53e983..ebf47188 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -1,19 +1,23 @@
 ﻿#if defined(EVAL_LEARN)
 
 #include "gensfen.h"
-#include "packed_sfen.h"
 
-#include "../eval/evaluate_common.h"
-#include "../misc.h"
-#include "../nnue/evaluate_nnue_learner.h"
-#include "../position.h"
-#include "../syzygy/tbprobe.h"
-#include "../thread.h"
-#include "../tt.h"
-#include "../uci.h"
+#include "packed_sfen.h"
 #include "multi_think.h"
 
-#include "../extra/nnue_data_binpack_format.h"
+#include "misc.h"
+#include "position.h"
+#include "thread.h"
+#include "tt.h"
+#include "uci.h"
+
+#include "eval/evaluate_common.h"
+
+#include "extra/nnue_data_binpack_format.h"
+
+#include "nnue/evaluate_nnue_learner.h"
+
+#include "syzygy/tbprobe.h"
 
 #include <chrono>
 #include <climits>
diff --git a/src/learn/gensfen.h b/src/learn/gensfen.h
index dd0f71fb..45e4ca23 100644
--- a/src/learn/gensfen.h
+++ b/src/learn/gensfen.h
@@ -1,9 +1,9 @@
 #ifndef _GENSFEN_H_
 #define _GENSFEN_H_
 
-#include <sstream>
+#include "position.h"
 
-#include "../position.h"
+#include <sstream>
 
 #if defined(EVAL_LEARN)
 namespace Learner {
diff --git a/src/learn/half_float.h b/src/learn/half_float.h
index 30b3e482..ebe77526 100644
--- a/src/learn/half_float.h
+++ b/src/learn/half_float.h
@@ -7,7 +7,7 @@
 // Floating point operation by 16bit type
 // Assume that the float type code generated by the compiler is in IEEE 754 format and use it.
 
-#include "../types.h"
+#include "types.h"
 
 namespace HalfFloat
 {
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index f4f7b409..b5df2276 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -20,20 +20,24 @@
 #if defined(EVAL_LEARN)
 
 #include "learn.h"
-#include "convert.h"
 
-#include "../eval/evaluate_common.h"
-#include "../misc.h"
-#include "../nnue/evaluate_nnue_learner.h"
-#include "../position.h"
-#include "../syzygy/tbprobe.h"
-#include "../thread.h"
-#include "../tt.h"
-#include "../uci.h"
-#include "../search.h"
+#include "convert.h"
 #include "multi_think.h"
 
-#include "../extra/nnue_data_binpack_format.h"
+#include "misc.h"
+#include "position.h"
+#include "thread.h"
+#include "tt.h"
+#include "uci.h"
+#include "search.h"
+
+#include "eval/evaluate_common.h"
+
+#include "extra/nnue_data_binpack_format.h"
+
+#include "nnue/evaluate_nnue_learner.h"
+
+#include "syzygy/tbprobe.h"
 
 #include <chrono>
 #include <climits>
diff --git a/src/learn/learn.h b/src/learn/learn.h
index b8acc2df..7ee89009 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -3,8 +3,6 @@
 
 #if defined(EVAL_LEARN)
 
-#include <vector>
-
 // ----------------------
 // Floating point for learning
 // ----------------------
@@ -39,9 +37,10 @@ using LearnFloatType = float;
 
 #include "packed_sfen.h"
 
-#include "../position.h"
+#include "position.h"
 
 #include <sstream>
+#include <vector>
 
 namespace Learner
 {
diff --git a/src/learn/learning_tools.cpp b/src/learn/learning_tools.cpp
index eca11c47..285b3487 100644
--- a/src/learn/learning_tools.cpp
+++ b/src/learn/learning_tools.cpp
@@ -2,7 +2,7 @@
 
 #if defined (EVAL_LEARN)
 
-#include "../misc.h"
+#include "misc.h"
 
 using namespace Eval;
 
diff --git a/src/learn/learning_tools.h b/src/learn/learning_tools.h
index 1f9bdf96..194a9732 100644
--- a/src/learn/learning_tools.h
+++ b/src/learn/learning_tools.h
@@ -3,11 +3,11 @@
 
 // A set of machine learning tools related to the weight array used for machine learning of evaluation functions
 
-#include "learn.h"
-
 #if defined (EVAL_LEARN)
 
-#include "../misc.h"  // PRNG , my_insertion_sort
+#include "learn.h"
+
+#include "misc.h"  // PRNG , my_insertion_sort
 
 #include <array>
 #include <cmath>	// std::sqrt()
diff --git a/src/learn/multi_think.cpp b/src/learn/multi_think.cpp
index 82ebeabb..28b3e152 100644
--- a/src/learn/multi_think.cpp
+++ b/src/learn/multi_think.cpp
@@ -1,10 +1,10 @@
-﻿#include "../types.h"
-
-#if defined(EVAL_LEARN)
+﻿#if defined(EVAL_LEARN)
 
 #include "multi_think.h"
-#include "../tt.h"
-#include "../uci.h"
+
+#include "tt.h"
+#include "uci.h"
+#include "types.h"
 
 #include <thread>
 
@@ -35,13 +35,13 @@ void MultiThink::go_think()
 
 	// Secure end flag of worker thread
 	thread_finished.resize(thread_num);
-	
+
 	// start worker thread
 	for (size_t i = 0; i < thread_num; ++i)
 	{
 		thread_finished[i] = 0;
 		threads.push_back(std::thread([i, this]
-		{ 
+		{
 			// exhaust all processor threads.
 			WinProcGroup::bindThisThread(i);
 
diff --git a/src/learn/multi_think.h b/src/learn/multi_think.h
index 6225144c..4f423da0 100644
--- a/src/learn/multi_think.h
+++ b/src/learn/multi_think.h
@@ -3,15 +3,16 @@
 
 #if defined(EVAL_LEARN)
 
-#include <functional>
-#include <mutex>
+#include "learn.h"
 
-#include "../misc.h"
-#include "../learn/learn.h"
-#include "../thread_win32_osx.h"
+#include "misc.h"
+#include "thread_win32_osx.h"
 
 #include <atomic>
 #include <limits>
+#include <functional>
+#include <mutex>
+
 
 // Learning from a game record, when making yourself think and generating a fixed track, etc.
 // Helper class used when multiple threads want to call Search::think() individually.

From 3388c22d7165429c040e468c43b326364c19122b Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 11 Sep 2020 13:06:53 +0200
Subject: [PATCH 100/398] Fix incorrect use of UCI::Option of type "combo".

---
 src/evaluate.cpp  | 2 +-
 src/ucioption.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 94581998..3b0b0f88 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -37,7 +37,7 @@ namespace Eval {
   UseNNUEMode useNNUE;
   std::string eval_file_loaded="None";
 
-  static UseNNUEMode nnue_mode_from_option(const std::string& mode)
+  static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
   {
     if (mode == "false")
       return UseNNUEMode::False;
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 61e47539..91fa199b 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -86,7 +86,7 @@ void init(OptionsMap& o) {
   o["SyzygyProbeDepth"]      << Option(1, 1, 100);
   o["Syzygy50MoveRule"]      << Option(true);
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
-#ifdef EVAL_LEARN
+#if defined(EVAL_LEARN)
   o["Use NNUE"]              << Option("true var true var false var pure", "true", on_use_NNUE);
 #else
   o["Use NNUE"]              << Option("true var true var false", "true", on_use_NNUE);

From bcfe28b2ae468d045e2f96b659401422531bacba Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 11 Sep 2020 13:07:16 +0200
Subject: [PATCH 101/398] Fix compilation of sfen_packer.cpp in debug.

---
 src/learn/sfen_packer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/learn/sfen_packer.cpp b/src/learn/sfen_packer.cpp
index 236c875f..791870ca 100644
--- a/src/learn/sfen_packer.cpp
+++ b/src/learn/sfen_packer.cpp
@@ -386,7 +386,7 @@ namespace Learner {
     pos.thisThread = th;
     pos.set_state(pos.st);
 
-    assert(pos_is_ok());
+    assert(pos.pos_is_ok());
 
     return 0;
   }

From 580b09381b0fa42d3bce3e5eb7acc10a15675cf3 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sat, 12 Sep 2020 14:11:46 +0200
Subject: [PATCH 102/398] Add a learning command to CI

fixes a small issue, with ponder

Probably the learning command can be improved a bit, so that despite the limited data, the code coverage is better.
---
 src/learn/learn.cpp         |  2 ++
 tests/instrumented_learn.sh | 37 ++++++++++++++++++++++++++++++-------
 2 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index b5df2276..0459dd90 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -1981,6 +1981,8 @@ namespace Learner
         // Read evaluation function parameters
         Eval::init_NNUE();
 
+        Threads.main()->ponder = false;
+
         cout << "init_training.." << endl;
         Eval::NNUE::InitializeTraining(eta1, eta1_epoch, eta2, eta2_epoch, eta3);
         Eval::NNUE::SetBatchSize(nn_batch_size);
diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
index 147c0c97..71f9421c 100755
--- a/tests/instrumented_learn.sh
+++ b/tests/instrumented_learn.sh
@@ -64,8 +64,8 @@ EOF
   ;;
 esac
 
-mkdir -p training_data_01
-mkdir -p training_data_02
+mkdir -p training_data
+mkdir -p validation_data
 
 # gensfen testing 01
 cat << EOF > gensfen01.exp
@@ -78,9 +78,9 @@ cat << EOF > gensfen01.exp
  send "setoption name Threads value $threads\n"
  send "setoption name Use NNUE value false\n"
  send "isready\n"
- send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data_01/training_data.bin use_raw_nnue_eval 0 sfen_format bin\n"
+ send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.bin use_raw_nnue_eval 0 sfen_format bin\n"
  expect "gensfen finished."
- send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data_01/training_data.binpack use_raw_nnue_eval 0 sfen_format binpack\n"
+ send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.binpack use_raw_nnue_eval 0 sfen_format binpack\n"
  expect "gensfen finished."
 
  send "quit\n"
@@ -102,9 +102,9 @@ cat << EOF > gensfen02.exp
  send "setoption name Threads value $threads\n"
  send "setoption name Use NNUE value true\n"
  send "isready\n"
- send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data_01/training_data.bin use_raw_nnue_eval 0 sfen_format bin\n"
+ send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/valdidation_data.bin use_raw_nnue_eval 0 sfen_format bin\n"
  expect "gensfen finished."
- send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data_01/training_data.binpack use_raw_nnue_eval 0 sfen_format binpack\n"
+ send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/validation_data.binpack use_raw_nnue_eval 0 sfen_format binpack\n"
  expect "gensfen finished."
 
  send "quit\n"
@@ -115,7 +115,30 @@ cat << EOF > gensfen02.exp
  exit \$value
 EOF
 
-for exp in gensfen01.exp gensfen02.exp
+# simple learning
+cat << EOF > learn01.exp
+ set timeout 240
+ spawn $exeprefix ./stockfish
+
+ send "uci\n"
+ send "setoption name SkipLoadingEval value true\n"
+ send "setoption name Use NNUE value true\n"
+ send "setoption name Threads value $threads\n"
+ send "isready\n"
+ send "learn targetdir training_data loop 2 batchsize 100 use_draw_in_training 1 use_draw_in_validation 1 eta 1 lambda 1 eval_limit 32000 nn_batch_size 30 newbob_decay 0.5 eval_save_interval 30 loss_output_interval 10 mirror_percentage 50 validation_set_file_name validation_data/validation_data.bin\n"
+
+ expect "save_eval() finished."
+
+ send "quit\n"
+ expect eof
+
+ # return error code of the spawned program, useful for valgrind
+ lassign [wait] pid spawnid os_error_flag value
+ exit \$value
+
+EOF
+
+for exp in gensfen01.exp gensfen02.exp learn01.exp
 do
 
   echo "$prefix expect $exp $postfix"

From 8d499e6efa924c4214bfeef65a1368c3f8b025bf Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sat, 12 Sep 2020 14:36:43 +0200
Subject: [PATCH 103/398] Fix flags for dependency generation
 (98f24570abe9605df21f786921a41f34fdfaf2fc)

---
 src/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index ac0b7338..35030be7 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -322,7 +322,7 @@ endif
 
 ### 3.1 Selecting compiler (default = gcc)
 CXXFLAGS += -g -Wall -Wcast-qual -fno-exceptions -std=c++17 -I. $(EXTRACXXFLAGS) $(LEARNCXXFLAGS)
-DEPENDFLAGS += -std=c++17
+DEPENDFLAGS += -std=c++17 -I.
 LDFLAGS += $(EXTRALDFLAGS) $(LEARNLDFLAGS)
 
 ifeq ($(COMP),)
@@ -928,6 +928,6 @@ profile-learn: net config-sanity objclean profileclean
 	rm generated_kifu.bin
 
 .depend:
-	-@$(CXX) $(DEPENDFLAGS) -MM $(SRCS) > $@ 2> /dev/null
+	-@$(CXX) $(DEPENDFLAGS) -MM $(SRCS) > $@
 
 -include .depend

From d33e7a9b07d1aae2edf72f87ae0ba00db5a15cd9 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 12 Sep 2020 16:19:24 +0200
Subject: [PATCH 104/398] Remove conditional compilation on EVAL_LEARN

---
 src/eval/evaluate_common.h                     |  4 ----
 src/evaluate.cpp                               |  5 -----
 src/evaluate.h                                 |  7 ++-----
 src/learn/convert.cpp                          |  3 ---
 src/learn/convert.h                            |  2 --
 src/learn/gensfen.cpp                          |  5 +----
 src/learn/gensfen.h                            |  2 --
 src/learn/learn.cpp                            |  4 ----
 src/learn/learn.h                              |  4 ----
 src/learn/learning_tools.cpp                   |  4 ----
 src/learn/learning_tools.h                     |  3 ---
 src/learn/multi_think.cpp                      |  7 +------
 src/learn/multi_think.h                        |  4 ----
 src/learn/packed_sfen.h                        |  3 ---
 src/learn/sfen_packer.cpp                      |  7 +------
 src/learn/sfen_packer.h                        |  4 ----
 src/nnue/evaluate_nnue_learner.cpp             |  4 ----
 src/nnue/evaluate_nnue_learner.h               |  4 ----
 src/nnue/trainer/trainer.h                     |  4 ----
 src/nnue/trainer/trainer_affine_transform.h    |  4 ----
 src/nnue/trainer/trainer_clipped_relu.h        |  4 ----
 src/nnue/trainer/trainer_feature_transformer.h |  4 ----
 src/nnue/trainer/trainer_input_slice.h         |  4 ----
 src/nnue/trainer/trainer_sum.h                 |  4 ----
 src/position.cpp                               |  6 ------
 src/position.h                                 |  4 ----
 src/search.cpp                                 | 10 ----------
 src/search.h                                   |  9 ---------
 src/tt.cpp                                     |  4 ----
 src/tt.h                                       |  2 --
 src/uci.cpp                                    |  7 +------
 src/ucioption.cpp                              |  8 --------
 32 files changed, 6 insertions(+), 144 deletions(-)

diff --git a/src/eval/evaluate_common.h b/src/eval/evaluate_common.h
index 7799fe79..47e69a44 100644
--- a/src/eval/evaluate_common.h
+++ b/src/eval/evaluate_common.h
@@ -1,8 +1,6 @@
 ﻿#ifndef _EVALUATE_COMMON_H_
 #define _EVALUATE_COMMON_H_
 
-#if defined(EVAL_LEARN)
-
 // A common header-like function for modern evaluation functions.
 
 #include <string>
@@ -21,6 +19,4 @@ namespace Eval
 	double get_eta();
 }
 
-#endif // defined(EVAL_LEARN)
-
 #endif // _EVALUATE_KPPT_COMMON_H_
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 3b0b0f88..e619a747 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -43,11 +43,8 @@ namespace Eval {
       return UseNNUEMode::False;
     else if (mode == "true")
       return UseNNUEMode::True;
-
-#ifdef EVAL_LEARN
     else if (mode == "pure")
       return UseNNUEMode::Pure;
-#endif
 
     return UseNNUEMode::False;
   }
@@ -955,11 +952,9 @@ make_v:
 /// evaluation of the position from the point of view of the side to move.
 
 Value Eval::evaluate(const Position& pos) {
-#ifdef EVAL_LEARN
   if (useNNUE == UseNNUEMode::Pure) {
       return NNUE::evaluate(pos);
   }
-#endif
 
   bool classical = useNNUE == UseNNUEMode::False
                 || abs(eg_value(pos.psq_score())) * 16 > NNUEThreshold1 * (16 + pos.rule50_count());
diff --git a/src/evaluate.h b/src/evaluate.h
index 61052e90..900a77fc 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -29,11 +29,8 @@ namespace Eval {
   enum struct UseNNUEMode
   {
     False,
-    True
-
-#ifdef EVAL_LEARN
-    ,Pure
-#endif
+    True,
+    Pure
   };
 
   std::string trace(const Position& pos);
diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
index e9dcb10b..483296a1 100644
--- a/src/learn/convert.cpp
+++ b/src/learn/convert.cpp
@@ -1,5 +1,3 @@
-#if defined(EVAL_LEARN)
-
 #include "convert.h"
 
 #include "multi_think.h"
@@ -606,4 +604,3 @@ namespace Learner
         convert(args);
     }
 }
-#endif
diff --git a/src/learn/convert.h b/src/learn/convert.h
index a79820a3..a41885d9 100644
--- a/src/learn/convert.h
+++ b/src/learn/convert.h
@@ -5,7 +5,6 @@
 #include <string>
 #include <sstream>
 
-#if defined(EVAL_LEARN)
 namespace Learner {
     void convert_bin_from_pgn_extract(
         const std::vector<std::string>& filenames,
@@ -32,6 +31,5 @@ namespace Learner {
 
     void convert(std::istringstream& is);
 }
-#endif
 
 #endif
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index ebf47188..afbcce37 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -1,6 +1,4 @@
-﻿#if defined(EVAL_LEARN)
-
-#include "gensfen.h"
+﻿#include "gensfen.h"
 
 #include "packed_sfen.h"
 #include "multi_think.h"
@@ -1207,4 +1205,3 @@ namespace Learner
         std::cout << "gensfen finished." << endl;
     }
 }
-#endif
diff --git a/src/learn/gensfen.h b/src/learn/gensfen.h
index 45e4ca23..d39e44c9 100644
--- a/src/learn/gensfen.h
+++ b/src/learn/gensfen.h
@@ -5,12 +5,10 @@
 
 #include <sstream>
 
-#if defined(EVAL_LEARN)
 namespace Learner {
 
     // Automatic generation of teacher position
     void gen_sfen(Position& pos, std::istringstream& is);
 }
-#endif
 
 #endif
\ No newline at end of file
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 0459dd90..3f951888 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -17,8 +17,6 @@
 // → I will not be involved in the engine because it is a problem that the GUI should assist.
 // etc..
 
-#if defined(EVAL_LEARN)
-
 #include "learn.h"
 
 #include "convert.h"
@@ -2048,5 +2046,3 @@ namespace Learner
     }
 
 } // namespace Learner
-
-#endif // EVAL_LEARN
diff --git a/src/learn/learn.h b/src/learn/learn.h
index 7ee89009..4b09f825 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -1,8 +1,6 @@
 ﻿#ifndef _LEARN_H_
 #define _LEARN_H_
 
-#if defined(EVAL_LEARN)
-
 // ----------------------
 // Floating point for learning
 // ----------------------
@@ -78,6 +76,4 @@ namespace Learner
     void learn(Position& pos, std::istringstream& is);
 }
 
-#endif
-
 #endif // ifndef _LEARN_H_
diff --git a/src/learn/learning_tools.cpp b/src/learn/learning_tools.cpp
index 285b3487..925905c6 100644
--- a/src/learn/learning_tools.cpp
+++ b/src/learn/learning_tools.cpp
@@ -1,7 +1,5 @@
 ﻿#include "learning_tools.h"
 
-#if defined (EVAL_LEARN)
-
 #include "misc.h"
 
 using namespace Eval;
@@ -18,5 +16,3 @@ namespace EvalLearningTools
 	uint64_t Weight::eta1_epoch;
 	uint64_t Weight::eta2_epoch;
 }
-
-#endif
diff --git a/src/learn/learning_tools.h b/src/learn/learning_tools.h
index 194a9732..dcb2c4aa 100644
--- a/src/learn/learning_tools.h
+++ b/src/learn/learning_tools.h
@@ -3,8 +3,6 @@
 
 // A set of machine learning tools related to the weight array used for machine learning of evaluation functions
 
-#if defined (EVAL_LEARN)
-
 #include "learn.h"
 
 #include "misc.h"  // PRNG , my_insertion_sort
@@ -98,5 +96,4 @@ namespace EvalLearningTools
 	};
 }
 
-#endif // defined (EVAL_LEARN)
 #endif
diff --git a/src/learn/multi_think.cpp b/src/learn/multi_think.cpp
index 28b3e152..043238fa 100644
--- a/src/learn/multi_think.cpp
+++ b/src/learn/multi_think.cpp
@@ -1,6 +1,4 @@
-﻿#if defined(EVAL_LEARN)
-
-#include "multi_think.h"
+﻿#include "multi_think.h"
 
 #include "tt.h"
 #include "uci.h"
@@ -118,6 +116,3 @@ void MultiThink::go_think()
 		Options[s.first] = std::string(s.second);
 
 }
-
-
-#endif // defined(EVAL_LEARN)
diff --git a/src/learn/multi_think.h b/src/learn/multi_think.h
index 4f423da0..7de9d6b9 100644
--- a/src/learn/multi_think.h
+++ b/src/learn/multi_think.h
@@ -1,8 +1,6 @@
 ﻿#ifndef _MULTI_THINK_
 #define _MULTI_THINK_
 
-#if defined(EVAL_LEARN)
-
 #include "learn.h"
 
 #include "misc.h"
@@ -151,6 +149,4 @@ protected:
 	std::mutex task_mutex;
 };
 
-#endif // defined(EVAL_LEARN) && defined(YANEURAOU_2018_OTAFUKU_ENGINE)
-
 #endif
diff --git a/src/learn/packed_sfen.h b/src/learn/packed_sfen.h
index 101e5e34..3aa4fcac 100644
--- a/src/learn/packed_sfen.h
+++ b/src/learn/packed_sfen.h
@@ -4,7 +4,6 @@
 #include <vector>
 #include <cstdint>
 
-#if defined(EVAL_LEARN)
 namespace Learner {
 
     // packed sfen
@@ -45,5 +44,3 @@ namespace Learner {
     using PSVector = std::vector<PackedSfenValue>;
 }
 #endif
-
-#endif
diff --git a/src/learn/sfen_packer.cpp b/src/learn/sfen_packer.cpp
index 791870ca..734a477b 100644
--- a/src/learn/sfen_packer.cpp
+++ b/src/learn/sfen_packer.cpp
@@ -1,6 +1,4 @@
-﻿#if defined (EVAL_LEARN)
-
-#include "sfen_packer.h"
+﻿#include "sfen_packer.h"
 
 #include "packed_sfen.h"
 
@@ -402,6 +400,3 @@ namespace Learner {
     return sfen;
   }
 }
-
-
-#endif // USE_SFEN_PACKER
diff --git a/src/learn/sfen_packer.h b/src/learn/sfen_packer.h
index af900902..533d3fc9 100644
--- a/src/learn/sfen_packer.h
+++ b/src/learn/sfen_packer.h
@@ -1,8 +1,6 @@
 #ifndef _SFEN_PACKER_H_
 #define _SFEN_PACKER_H_
 
-#if defined(EVAL_LEARN)
-
 #include "types.h"
 
 #include "learn/packed_sfen.h"
@@ -19,6 +17,4 @@ namespace Learner {
     PackedSfen sfen_pack(Position& pos);
 }
 
-#endif
-
 #endif
\ No newline at end of file
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 8b0413e5..ea680e31 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -1,7 +1,5 @@
 ﻿// Code for learning NNUE evaluation function
 
-#if defined(EVAL_LEARN)
-
 #include <random>
 #include <fstream>
 #include <filesystem>
@@ -238,5 +236,3 @@ double get_eta() {
 }
 
 }  // namespace Eval
-
-#endif  // defined(EVAL_LEARN)
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 0e5fbcd2..e9bd2fd2 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -3,8 +3,6 @@
 #ifndef _EVALUATE_NNUE_LEARNER_H_
 #define _EVALUATE_NNUE_LEARNER_H_
 
-#if defined(EVAL_LEARN)
-
 #include "../learn/learn.h"
 
 namespace Eval {
@@ -41,6 +39,4 @@ void CheckHealth();
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN)
-
 #endif
diff --git a/src/nnue/trainer/trainer.h b/src/nnue/trainer/trainer.h
index 94553c07..659863ad 100644
--- a/src/nnue/trainer/trainer.h
+++ b/src/nnue/trainer/trainer.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_H_
 #define _NNUE_TRAINER_H_
 
-#if defined(EVAL_LEARN)
-
 #include "../nnue_common.h"
 #include "../features/index_list.h"
 
@@ -120,6 +118,4 @@ std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments) {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN)
-
 #endif
diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index 4b5ddee6..50751ffe 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_
 #define _NNUE_TRAINER_AFFINE_TRANSFORM_H_
 
-#if defined(EVAL_LEARN)
-
 #include "../../learn/learn.h"
 #include "../layers/affine_transform.h"
 #include "trainer.h"
@@ -296,6 +294,4 @@ class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN)
-
 #endif
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index 72575bf8..cf7a2447 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_CLIPPED_RELU_H_
 #define _NNUE_TRAINER_CLIPPED_RELU_H_
 
-#if defined(EVAL_LEARN)
-
 #include "../../learn/learn.h"
 #include "../layers/clipped_relu.h"
 #include "trainer.h"
@@ -137,6 +135,4 @@ class Trainer<Layers::ClippedReLU<PreviousLayer>> {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN)
-
 #endif
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 6b94d952..190e009a 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
 #define _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
 
-#if defined(EVAL_LEARN)
-
 #include "../../learn/learn.h"
 #include "../nnue_feature_transformer.h"
 #include "trainer.h"
@@ -372,6 +370,4 @@ class Trainer<FeatureTransformer> {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN)
-
 #endif
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index 6b0adc9f..e2cd0c25 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_INPUT_SLICE_H_
 #define _NNUE_TRAINER_INPUT_SLICE_H_
 
-#if defined(EVAL_LEARN)
-
 #include "../../learn/learn.h"
 #include "../layers/input_slice.h"
 #include "trainer.h"
@@ -246,6 +244,4 @@ class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN)
-
 #endif
diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h
index 0b7abe36..65a0b681 100644
--- a/src/nnue/trainer/trainer_sum.h
+++ b/src/nnue/trainer/trainer_sum.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_SUM_H_
 #define _NNUE_TRAINER_SUM_H_
 
-#if defined(EVAL_LEARN)
-
 #include "../../learn/learn.h"
 #include "../layers/sum.h"
 #include "trainer.h"
@@ -185,6 +183,4 @@ class Trainer<Layers::Sum<PreviousLayer>> {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN)
-
 #endif
diff --git a/src/position.cpp b/src/position.cpp
index 9465afbc..38ac7c5c 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -32,10 +32,8 @@
 #include "uci.h"
 #include "syzygy/tbprobe.h"
 
-#if defined(EVAL_LEARN)
 #include "learn/packed_sfen.h"
 #include "learn/sfen_packer.h"
-#endif
 
 using std::string;
 
@@ -1352,8 +1350,6 @@ bool Position::pos_is_ok() const {
   return true;
 }
 
-#if defined(EVAL_LEARN)
-
 // Add a function that directly unpacks for speed. It's pretty tough.
 // Write it by combining packer::unpack() and Position::set().
 // If there is a problem with the passed phase and there is an error, non-zero is returned.
@@ -1385,5 +1381,3 @@ void Position::sfen_pack(Learner::PackedSfen& sfen)
 {
   sfen = Learner::sfen_pack(*this);
 }
-
-#endif
\ No newline at end of file
diff --git a/src/position.h b/src/position.h
index aa2d34e7..2163dca3 100644
--- a/src/position.h
+++ b/src/position.h
@@ -30,10 +30,8 @@
 
 #include "nnue/nnue_accumulator.h"
 
-#if defined(EVAL_LEARN)
 #include "learn/packed_sfen.h"
 #include "learn/sfen_packer.h"
-#endif
 
 
 /// StateInfo struct stores information needed to restore a Position object to
@@ -177,7 +175,6 @@ public:
   // Used by NNUE
   StateInfo* state() const;
 
-#if defined(EVAL_LEARN)
   // --sfenization helper
 
   friend int Learner::set_from_packed_sfen(Position& pos, const Learner::PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror);
@@ -199,7 +196,6 @@ public:
 
   // Returns the position of the ball on the c side.
   Square king_square(Color c) const { return pieceList[make_piece(c, KING)][0]; }
-#endif // EVAL_LEARN
 
 private:
   // Initialization helpers (used while setting up a position)
diff --git a/src/search.cpp b/src/search.cpp
index b92ea7c8..f8cf3cbc 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -54,9 +54,7 @@ using std::string;
 using Eval::evaluate;
 using namespace Search;
 
-#if defined(EVAL_LEARN)
 bool Search::prune_at_shallow_depth_on_pv_node = false;
-#endif
 
 namespace {
 
@@ -991,9 +989,7 @@ moves_loop: // When in check, search starts from here
       ss->moveCount = ++moveCount;
 
       if (rootNode && thisThread == Threads.main() && Time.elapsed() > 3000
-#if defined(EVAL_LEARN)
           && !Limits.silent
-#endif
           )
           sync_cout << "info depth " << depth
                     << " currmove " << UCI::move(move, pos.is_chess960())
@@ -1011,9 +1007,7 @@ moves_loop: // When in check, search starts from here
 
       // Step 12. Pruning at shallow depth (~200 Elo)
       if (  !rootNode
-#ifdef EVAL_LEARN
           && (PvNode ? prune_at_shallow_depth_on_pv_node : true)
-#endif
           && pos.non_pawn_material(us)
           && bestValue > VALUE_TB_LOSS_IN_MAX_PLY)
       {
@@ -1564,10 +1558,8 @@ moves_loop: // When in check, search starts from here
 
       // Check for legality just before making the move
       if (
-#if defined(EVAL_LEARN)
         // HACK: pos.piece_on(from_sq(m)) sometimes will be NO_PIECE during machine learning.
         !pos.pseudo_legal(move) ||
-#endif // EVAL_LEARN
         !pos.legal(move)
         )
       {
@@ -1978,7 +1970,6 @@ void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
 
 // --- expose the functions such as fixed depth search used for learning to the outside
 
-#if defined (EVAL_LEARN)
 
 namespace Learner
 {
@@ -2278,4 +2269,3 @@ namespace Learner
   }
 
 }
-#endif
diff --git a/src/search.h b/src/search.h
index 5e092273..20dfe909 100644
--- a/src/search.h
+++ b/src/search.h
@@ -32,10 +32,7 @@ namespace Search {
 /// Threshold used for countermoves based pruning
 constexpr int CounterMovePruneThreshold = 0;
 
-
-#if defined(EVAL_LEARN)
 extern bool prune_at_shallow_depth_on_pv_node;
-#endif
 
 /// Stack struct keeps track of the information we need to remember from nodes
 /// shallower and deeper in the tree during the search. Each search thread has
@@ -90,9 +87,7 @@ struct LimitsType {
     time[WHITE] = time[BLACK] = inc[WHITE] = inc[BLACK] = npmsec = movetime = TimePoint(0);
     movestogo = depth = mate = perft = infinite = 0;
     nodes = 0;
-#if defined (EVAL_LEARN)
     silent = false;
-#endif
   }
 
   bool use_time_management() const {
@@ -103,11 +98,9 @@ struct LimitsType {
   TimePoint time[COLOR_NB], inc[COLOR_NB], npmsec, movetime, startTime;
   int movestogo, depth, mate, perft, infinite;
   int64_t nodes;
-#if defined (EVAL_LEARN)
   // Silent mode that does not output to the screen (for continuous self-play in process)
   // Do not output PV at this time.
   bool silent;
-#endif
 };
 
 extern LimitsType Limits;
@@ -117,7 +110,6 @@ void clear();
 
 } // namespace Search
 
-#if defined(EVAL_LEARN)
 namespace Learner {
 
   // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
@@ -126,6 +118,5 @@ namespace Learner {
   ValueAndPV qsearch(Position& pos);
   ValueAndPV search(Position& pos, int depth_, size_t multiPV = 1, uint64_t nodesLimit = 0);
 }
-#endif
 
 #endif // #ifndef SEARCH_H_INCLUDED
diff --git a/src/tt.cpp b/src/tt.cpp
index fc8ab3b1..c64670ac 100644
--- a/src/tt.cpp
+++ b/src/tt.cpp
@@ -28,9 +28,7 @@
 
 TranspositionTable TT; // Our global transposition table
 
-#ifdef EVAL_LEARN
 bool TranspositionTable::enable_transposition_table = true;
-#endif
 
 /// TTEntry::save() populates the TTEntry with a new node's data, possibly
 /// overwriting an old position. Update is not atomic and can be racy.
@@ -120,12 +118,10 @@ void TranspositionTable::clear() {
 
 TTEntry* TranspositionTable::probe(const Key key, bool& found) const {
 
-#ifdef EVAL_LEARN
   if (!enable_transposition_table) {
       found = false;
       return first_entry(0);
   }
-#endif
 
   TTEntry* const tte = first_entry(key);
   const uint16_t key16 = (uint16_t)key;  // Use the low 16 bits as key inside the cluster
diff --git a/src/tt.h b/src/tt.h
index e83b6f3c..29072bd8 100644
--- a/src/tt.h
+++ b/src/tt.h
@@ -84,9 +84,7 @@ public:
     return &table[mul_hi64(key, clusterCount)].entry[0];
   }
 
-#ifdef EVAL_LEARN
   static bool enable_transposition_table;
-#endif
 
 private:
   friend struct TTEntry;
diff --git a/src/uci.cpp b/src/uci.cpp
index 0a28fc1f..1128d4d9 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -245,7 +245,6 @@ double UCI::win_rate_model_double(double v, int ply) {
 // Call qsearch(),search() directly for testing
 // --------------------
 
-#if defined(EVAL_LEARN)
 void qsearch_cmd(Position& pos)
 {
   cout << "qsearch : ";
@@ -277,8 +276,6 @@ void search_cmd(Position& pos, istringstream& is)
   cout << endl;
 }
 
-#endif
-
 /// UCI::loop() waits for a command from stdin, parses it and calls the appropriate
 /// function. Also intercepts EOF from stdin to ensure gracefully exiting if the
 /// GUI dies unexpectedly. When called with some command line arguments, e.g. to
@@ -334,7 +331,7 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "d")        sync_cout << pos << sync_endl;
       else if (token == "eval")     trace_eval(pos);
       else if (token == "compiler") sync_cout << compiler_info() << sync_endl;
-#if defined (EVAL_LEARN)
+
       else if (token == "gensfen") Learner::gen_sfen(pos, is);
       else if (token == "learn") Learner::learn(pos, is);
       else if (token == "convert") Learner::convert(is);
@@ -343,8 +340,6 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "qsearch") qsearch_cmd(pos);
       else if (token == "search") search_cmd(pos, is);
 
-#endif
-
       // test command
       else if (token == "test") test_cmd(pos, is);
       else
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 91fa199b..aa85dc07 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -42,14 +42,12 @@ void on_threads(const Option& o) { Threads.set(size_t(o)); }
 void on_tb_path(const Option& o) { Tablebases::init(o); }
 void on_use_NNUE(const Option& ) { Eval::init_NNUE(); }
 void on_eval_file(const Option& ) { Eval::init_NNUE(); }
-#ifdef EVAL_LEARN
 void on_prune_at_shallow_depth_on_pv_node(const Option& o) {
     Search::prune_at_shallow_depth_on_pv_node = o;
 }
 void on_enable_transposition_table(const Option& o) {
     TranspositionTable::enable_transposition_table = o;
 }
-#endif
 
 /// Our case insensitive less() function as required by UCI protocol
 bool CaseInsensitiveLess::operator() (const string& s1, const string& s2) const {
@@ -86,11 +84,7 @@ void init(OptionsMap& o) {
   o["SyzygyProbeDepth"]      << Option(1, 1, 100);
   o["Syzygy50MoveRule"]      << Option(true);
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
-#if defined(EVAL_LEARN)
   o["Use NNUE"]              << Option("true var true var false var pure", "true", on_use_NNUE);
-#else
-  o["Use NNUE"]              << Option("true var true var false", "true", on_use_NNUE);
-#endif
   // The default must follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work.
   o["EvalFile"]              << Option("nn-82215d0fd0df.nnue", on_eval_file);
@@ -102,7 +96,6 @@ void init(OptionsMap& o) {
   o["SkipLoadingEval"]       << Option(false);
   // how many moves to use a fixed move
   // o["BookMoves"] << Option(16, 0, 10000);
-#if defined(EVAL_LEARN)
   // When learning the evaluation function, you can change the folder to save the evaluation function.
   // Evalsave by default. This folder shall be prepared in advance.
   // Automatically create a folder under this folder like "0/", "1/", ... and save the evaluation function file there.
@@ -111,7 +104,6 @@ void init(OptionsMap& o) {
   o["PruneAtShallowDepthOnPvNode"] << Option(false, on_prune_at_shallow_depth_on_pv_node);
   // Enable transposition table.
   o["EnableTranspositionTable"] << Option(true, on_enable_transposition_table);
-#endif
 }
 
 
From 1e2fca4040ef94c60c5318d1d707f395337fdb74 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 12 Sep 2020 16:23:49 +0200
Subject: [PATCH 105/398] Move learn target to build target and profile-learn
 to profile-build.

---
 src/Makefile | 41 ++++++++++++-----------------------------
 1 file changed, 12 insertions(+), 29 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 35030be7..b9ad8fbd 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -735,22 +735,31 @@ endif
         clang-profile-use clang-profile-make
 
 build: config-sanity
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) all
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
+	EXTRACXXFLAGS=' -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	EXTRALDFLAGS=' $(BLASLDFLAGS) -fopenmp  ' \
+	all
 
 profile-build: net config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make)
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
+	LEARNCXXFLAGS=' -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	LEARNLDFLAGS='  $(BLASLDFLAGS) -fopenmp '
 	@echo ""
 	@echo "Step 2/4. Running benchmark for pgo-build ..."
 	$(PGOBENCH) > /dev/null
+	$(PGOGENSFEN) > /dev/null
 	@echo ""
 	@echo "Step 3/4. Building optimized executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use)
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use) \
+	LEARNCXXFLAGS=' -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	LEARNLDFLAGS=' $(BLASLDFLAGS) -fopenmp '
 	@echo ""
 	@echo "Step 4/4. Deleting profile data ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) profileclean
+	rm generated_kifu.bin
 
 strip:
 	$(STRIP) $(EXE)
@@ -901,32 +910,6 @@ icc-profile-use:
 	EXTRACXXFLAGS='-prof_use -prof_dir ./profdir' \
 	all
 
-learn: config-sanity
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
-	EXTRACXXFLAGS=' -DEVAL_LEARN -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
-	EXTRALDFLAGS=' $(BLASLDFLAGS) -fopenmp  ' \
-	all
-
-profile-learn: net config-sanity objclean profileclean
-	@echo ""
-	@echo "Step 1/4. Building instrumented executable ..."
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
-	LEARNCXXFLAGS=' -DEVAL_LEARN -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
-	LEARNLDFLAGS='  $(BLASLDFLAGS) -fopenmp '
-	@echo ""
-	@echo "Step 2/4. Running benchmark for pgo-build ..."
-	$(PGOGENSFEN)
-	@echo ""
-	@echo "Step 3/4. Building optimized executable ..."
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use) \
-	LEARNCXXFLAGS=' -DEVAL_LEARN -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
-	LEARNLDFLAGS=' $(BLASLDFLAGS) -fopenmp '
-	@echo ""
-	@echo "Step 4/4. Deleting profile data ..."
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) profileclean
-	rm generated_kifu.bin
-
 .depend:
 	-@$(CXX) $(DEPENDFLAGS) -MM $(SRCS) > $@
 

From 1da452029b3180769e206efc5a696fc37f37d1e6 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 12 Sep 2020 16:27:35 +0200
Subject: [PATCH 106/398] Update travis to use build target instead of learn.

---
 .travis.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 608d22c1..418888f6 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -108,5 +108,5 @@ script:
 
   # NNUE testing
   - export CXXFLAGS="-O1 -fno-inline"
-  - make clean && make -j2 ARCH=x86-64-modern debug=no optimize=no learn > /dev/null && ../tests/instrumented_learn.sh --valgrind
-  - make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=no learn > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined
+  - make clean && make -j2 ARCH=x86-64-modern debug=no optimize=no build > /dev/null && ../tests/instrumented_learn.sh --valgrind
+  - make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=no build > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined

From 9d84af11fe0fd1cf97f64efb490cd4fd35544326 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 12 Sep 2020 18:20:21 +0200
Subject: [PATCH 107/398] Remove remaining learn builds from CI. No replacement
 needed.

---
 .travis.yml | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 418888f6..6ebfeeb2 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -75,11 +75,6 @@ script:
   # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-32 build && ../tests/signature.sh $benchref; fi
   - make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref
 
-  # start some basic learner CI
-  - make clean && make -j2 ARCH=x86-64-modern learn
-  - make clean && make -j2 ARCH=x86-64-modern profile-learn
-  - make clean && make -j2 ARCH=x86-64-modern debug=yes optimize=no learn
-
   # compile only for some more advanced architectures (might not run in travis)
   - make clean && make -j2 ARCH=x86-64-avx2 build
   - make clean && make -j2 ARCH=x86-64-bmi2 build

From a6b02a61b7da82611a2f2f4227eb2308185b1b8b Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 12 Sep 2020 18:22:09 +0200
Subject: [PATCH 108/398] Remove 32 bit builds.

---
 .travis.yml | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 6ebfeeb2..aa325412 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -67,12 +67,6 @@ script:
   - make clean && make -j2 ARCH=x86-64 build && ../tests/signature.sh $benchref
   # TODO avoid _mm_malloc
   # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-64 build && ../tests/signature.sh $benchref; fi
-  # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 optimize=no debug=yes build && ../tests/signature.sh $benchref; fi
-  - make clean && make -j2 ARCH=x86-32-sse41-popcnt build && ../tests/signature.sh $benchref
-  - make clean && make -j2 ARCH=x86-32-sse2 build && ../tests/signature.sh $benchref
-  # TODO avoid _mm_malloc
-  # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 build && ../tests/signature.sh $benchref; fi
-  # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-32 build && ../tests/signature.sh $benchref; fi
   - make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref
 
   # compile only for some more advanced architectures (might not run in travis)

From 8d1ad6fbf6795f2574dc954ee6fc255b25e68761 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 12 Sep 2020 21:16:27 +0200
Subject: [PATCH 109/398] Add a makefile option to enable use of BLAS. Default
 to "no"

---
 src/Makefile | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index b9ad8fbd..1c43d631 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -111,6 +111,7 @@ else
    SUPPORTED_ARCH=false
 endif
 
+blas = no
 optimize = yes
 debug = no
 sanitize = no
@@ -132,17 +133,25 @@ ARCH = x86-64-modern
 STRIP = strip
 
 ### BLAS libraries
-ifeq ($(KERNEL),Linux)
-	BLASCXXFLAGS =
-	BLASLDFLAGS = -lopenblas
-else
-	BLASCXXFLAGS = -I/mingw64/include/OpenBLAS
-
-	ifeq ($(debug),yes)
-		BLASLDFLAGS = -lopenblas -Wl,-static
+ifeq ($(blas), yes)
+	ifeq ($(KERNEL),Linux)
+		BLASCXXFLAGS =
+		BLASLDFLAGS = -lopenblas
 	else
-		BLASLDFLAGS = -lopenblas -Wl,-s -static
+		BLASCXXFLAGS = -I/mingw64/include/OpenBLAS
+
+		ifeq ($(debug),yes)
+			BLASLDFLAGS = -lopenblas -Wl,-static
+		else
+			BLASLDFLAGS = -lopenblas -Wl,-s -static
+		endif
 	endif
+
+	BLASDEFINE = -DUSE_BLAS
+else
+	BLASCXXFLAGS =
+	BLASLDFLAGS =
+	BLASDEFINE =
 endif
 
 ### 2.2 Architecture specific
@@ -736,7 +745,7 @@ endif
 
 build: config-sanity
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
-	EXTRACXXFLAGS=' -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	EXTRACXXFLAGS=' $(BLASDEFINE) $(BLASCXXFLAGS) -fopenmp ' \
 	EXTRALDFLAGS=' $(BLASLDFLAGS) -fopenmp  ' \
 	all
 
@@ -744,7 +753,7 @@ profile-build: net config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
-	LEARNCXXFLAGS=' -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	LEARNCXXFLAGS=' $(BLASDEFINE) $(BLASCXXFLAGS) -fopenmp ' \
 	LEARNLDFLAGS='  $(BLASLDFLAGS) -fopenmp '
 	@echo ""
 	@echo "Step 2/4. Running benchmark for pgo-build ..."
@@ -754,7 +763,7 @@ profile-build: net config-sanity objclean profileclean
 	@echo "Step 3/4. Building optimized executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use) \
-	LEARNCXXFLAGS=' -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	LEARNCXXFLAGS=' $(BLASDEFINE) $(BLASCXXFLAGS) -fopenmp ' \
 	LEARNLDFLAGS=' $(BLASLDFLAGS) -fopenmp '
 	@echo ""
 	@echo "Step 4/4. Deleting profile data ..."

From f049c4776a78ec3d3b44198c1972c0a6768815d7 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 12 Sep 2020 21:19:15 +0200
Subject: [PATCH 110/398] Add tests in CI to cover compilation of both blas=no
 and blas=yes.

---
 .travis.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.travis.yml b/.travis.yml
index aa325412..204f2657 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -70,6 +70,8 @@ script:
   - make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref
 
   # compile only for some more advanced architectures (might not run in travis)
+  - make clean && make -j2 ARCH=x86-64-avx2 blas=yes build
+
   - make clean && make -j2 ARCH=x86-64-avx2 build
   - make clean && make -j2 ARCH=x86-64-bmi2 build
   - make clean && make -j2 ARCH=x86-64-avx512 build

From fbae6604b1332c64cef74e2f81c83b1ab8ba147b Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 13 Sep 2020 00:18:01 +0200
Subject: [PATCH 111/398] Remove LEARNCXXFLAGS, LEARNLDFLAGS, BLASDEFINE,
 BLASCXXFLAGS, BLASLDFLAGS in favor of directly modifying CXXFLAGS and
 LDFLAGS.

---
 src/Makefile | 63 ++++++++++++++++++++++------------------------------
 1 file changed, 26 insertions(+), 37 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 1c43d631..9b59c5bb 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -132,28 +132,6 @@ neon = no
 ARCH = x86-64-modern
 STRIP = strip
 
-### BLAS libraries
-ifeq ($(blas), yes)
-	ifeq ($(KERNEL),Linux)
-		BLASCXXFLAGS =
-		BLASLDFLAGS = -lopenblas
-	else
-		BLASCXXFLAGS = -I/mingw64/include/OpenBLAS
-
-		ifeq ($(debug),yes)
-			BLASLDFLAGS = -lopenblas -Wl,-static
-		else
-			BLASLDFLAGS = -lopenblas -Wl,-s -static
-		endif
-	endif
-
-	BLASDEFINE = -DUSE_BLAS
-else
-	BLASCXXFLAGS =
-	BLASLDFLAGS =
-	BLASDEFINE =
-endif
-
 ### 2.2 Architecture specific
 
 ifeq ($(findstring x86,$(ARCH)),x86)
@@ -330,9 +308,8 @@ endif
 ### ==========================================================================
 
 ### 3.1 Selecting compiler (default = gcc)
-CXXFLAGS += -g -Wall -Wcast-qual -fno-exceptions -std=c++17 -I. $(EXTRACXXFLAGS) $(LEARNCXXFLAGS)
-DEPENDFLAGS += -std=c++17 -I.
-LDFLAGS += $(EXTRALDFLAGS) $(LEARNLDFLAGS)
+CXXFLAGS += -g -Wall -Wcast-qual -fno-exceptions -std=c++17 -fopenmp -I. $(EXTRACXXFLAGS)
+DEPENDFLAGS += -std=c++17 -I. $(EXTRALDFLAGS)
 
 ifeq ($(COMP),)
 	COMP=gcc
@@ -487,14 +464,33 @@ ifneq ($(comp),mingw)
 endif
 endif
 
-### 3.2.1 Debugging
+### 3.2.1. BLAS libraries
+ifeq ($(blas), yes)
+	LDFLAGS += -lopenblas
+
+	ifeq ($(KERNEL),Linux)
+		LDFLAGS +=
+	else
+		CXXFLAGS += -I/mingw64/include/OpenBLAS
+
+		ifeq ($(debug),yes)
+			LDFLAGS += -Wl,-static
+		else
+			LDFLAGS += -Wl,-s -static
+		endif
+	endif
+
+	CXXFLAGS += -DUSE_BLAS
+endif
+
+### 3.2.2 Debugging
 ifeq ($(debug),no)
 	CXXFLAGS += -DNDEBUG
 else
 	CXXFLAGS += -g
 endif
 
-### 3.2.2 Debugging with undefined behavior sanitizers
+### 3.2.3 Debugging with undefined behavior sanitizers
 ifneq ($(sanitize),no)
         CXXFLAGS += -g3 -fsanitize=$(sanitize)
         LDFLAGS += -fsanitize=$(sanitize)
@@ -744,17 +740,12 @@ endif
         clang-profile-use clang-profile-make
 
 build: config-sanity
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
-	EXTRACXXFLAGS=' $(BLASDEFINE) $(BLASCXXFLAGS) -fopenmp ' \
-	EXTRALDFLAGS=' $(BLASLDFLAGS) -fopenmp  ' \
-	all
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) all
 
 profile-build: net config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
-	LEARNCXXFLAGS=' $(BLASDEFINE) $(BLASCXXFLAGS) -fopenmp ' \
-	LEARNLDFLAGS='  $(BLASLDFLAGS) -fopenmp '
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make)
 	@echo ""
 	@echo "Step 2/4. Running benchmark for pgo-build ..."
 	$(PGOBENCH) > /dev/null
@@ -762,9 +753,7 @@ profile-build: net config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 3/4. Building optimized executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use) \
-	LEARNCXXFLAGS=' $(BLASDEFINE) $(BLASCXXFLAGS) -fopenmp ' \
-	LEARNLDFLAGS=' $(BLASLDFLAGS) -fopenmp '
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use)
 	@echo ""
 	@echo "Step 4/4. Deleting profile data ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) profileclean

From 72164ba59ca4f0143b170e4721ba9aa38c591cc6 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 13 Sep 2020 02:06:33 +0200
Subject: [PATCH 112/398] Add missing -fopenmp LDFLAG

---
 src/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Makefile b/src/Makefile
index 9b59c5bb..81e2ff17 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -309,6 +309,7 @@ endif
 
 ### 3.1 Selecting compiler (default = gcc)
 CXXFLAGS += -g -Wall -Wcast-qual -fno-exceptions -std=c++17 -fopenmp -I. $(EXTRACXXFLAGS)
+LDFLAGS += -fopenmp
 DEPENDFLAGS += -std=c++17 -I. $(EXTRALDFLAGS)
 
 ifeq ($(COMP),)

From 4b70f4bf23305ea6cb1e24e7fd9311cd20c6f46e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 13 Sep 2020 02:07:29 +0200
Subject: [PATCH 113/398] Add extra ld flags to the proper variable.

---
 src/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 81e2ff17..5477d68e 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -309,8 +309,8 @@ endif
 
 ### 3.1 Selecting compiler (default = gcc)
 CXXFLAGS += -g -Wall -Wcast-qual -fno-exceptions -std=c++17 -fopenmp -I. $(EXTRACXXFLAGS)
-LDFLAGS += -fopenmp
-DEPENDFLAGS += -std=c++17 -I. $(EXTRALDFLAGS)
+LDFLAGS += -fopenmp $(EXTRALDFLAGS)
+DEPENDFLAGS += -std=c++17 -I.
 
 ifeq ($(COMP),)
 	COMP=gcc

From 50b4ff83548632fc9070d701754abf0360c41839 Mon Sep 17 00:00:00 2001
From: Matthies <a.matthies@online.de>
Date: Sat, 12 Sep 2020 17:59:36 +0200
Subject: [PATCH 114/398] Add missing include to make MSVC compile

---
 src/extra/nnue_data_binpack_format.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index 2c555939..7ceafbc0 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -41,6 +41,7 @@ THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include <array>
 #include <limits>
 #include <climits>
+#include <optional>
 
 #if (defined(_MSC_VER) || defined(__INTEL_COMPILER)) && !defined(__clang__)
 #include <intrin.h>
@@ -7196,4 +7197,4 @@ namespace binpack
 
         std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
     }
-}
\ No newline at end of file
+}

From 0a5893d337aac9a89cea1c4cddbd7a7d44a0ae81 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sun, 13 Sep 2020 14:05:52 +0900
Subject: [PATCH 115/398] Update README.md

Updated description according to recent option changes.
---
 README.md | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 6d28a998..081f75d5 100644
--- a/README.md
+++ b/README.md
@@ -17,12 +17,10 @@ setoption name Threads value x
 setoption name Hash value y
 setoption name SyzygyPath value path
 isready
-gensfen depth a loop b use_draw_in_training_data_generation 1 eval_limit 32000 use_raw_nnue_eval 0
+gensfen depth a loop b use_draw_in_training_data_generation 1 eval_limit 32000
 ```
 Specify how many threads and how much memory you would like to use with the x and y values. The option SyzygyPath is not necessary, but if you would like to use it, you must first have Syzygy endgame tablebases on your computer, which you can find [here](http://oics.olympuschess.com/tracker/index.php). You will need to have a torrent client to download these tablebases, as that is probably the fastest way to obtain them. The path is the path to the folder containing those tablebases. It does not have to be surrounded in quotes.
 
-use_raw_nnue_eval controls if the training data generator or trainer uses raw NNUE eval values.  Don't forget to set use_raw_nnue_eval 0 when initial training data are generated.  Otherwise, the gensfen command will crash.
-
 This will save a file named "generated_kifu.bin" in the same folder as the binary. Once generation is done, rename the file to something like "1billiondepth12.bin" to remember the depth and quantity of the positions and move it to a folder named "trainingdata" in the same directory as the binaries.
 #### Generation Parameters
 - Depth is the searched depth per move, or how far the engine looks forward. This value is an integer.
@@ -34,7 +32,7 @@ Use the "learn" binary. Create an empty folder named "evalsave" in the same dire
 ```
 uci
 setoption name SkipLoadingEval value true
-setoption name Use NNUE value true
+setoption name Use NNUE value pure
 setoption name Threads value x
 isready
 learn targetdir trainingdata loop 100 batchsize 1000000 use_draw_in_training 1 use_draw_in_validation 1 eta 1 lambda 1 eval_limit 32000 nn_batch_size 1000 newbob_decay 0.5 eval_save_interval 250000000 loss_output_interval 1000000 mirror_percentage 50 validation_set_file_name validationdata\val.bin
@@ -46,7 +44,7 @@ Nets get saved in the "evalsave" folder.
 - lambda is the amount of weight it puts to eval of learning data vs win/draw/loss results. 1 puts all weight on eval, lambda 0 puts all weight on WDL results.
 
 ### Reinforcement Learning
-If you would like to do some reinforcement learning on your original network, you must first generate training data using the learn binaries with the setting `Use NNUE` set to true. Make sure that your previously trained network is in the eval folder. Use the commands specified above. Make sure `SkipLoadingEval` is set to false so that the data generated is using the neural net's eval by typing the command `setoption name SkipLoadingEval value false` before typing the `isready` command. You should aim to generate less positions than the first run, around 1/10 of the number of positions generated in the first run. The depth should be higher as well. You should also do the same for validation data, with the depth being higher than the last run.
+If you would like to do some reinforcement learning on your original network, you must first generate training data using the learn binaries with the setting `Use NNUE` set to `pure`. Make sure that your previously trained network is in the eval folder. Use the commands specified above. Make sure `SkipLoadingEval` is set to false so that the data generated is using the neural net's eval by typing the command `setoption name SkipLoadingEval value false` before typing the `isready` command. You should aim to generate less positions than the first run, around 1/10 of the number of positions generated in the first run. The depth should be higher as well. You should also do the same for validation data, with the depth being higher than the last run.
 
 After you have generated the training data, you must move it into your training data folder and delete the older data so that the binary does not accidentally train on the same data again. Do the same for the validation data and name it to val-1.bin to make it less confusing. Make sure the evalsave folder is empty. Then, using the same binary, type in the training commands shown above. Do __NOT__ set `SkipLoadingEval` to true, it must be false or you will get a completely new network, instead of a network trained with reinforcement learning. You should also set eval_save_interval to a number that is lower than the amount of positions in your training data, perhaps also 1/10 of the original value. The validation file should be set to the new validation data, not the old data.
 

From 1c84da9caa08a142655bedf6def85e62e4736801 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sun, 13 Sep 2020 16:32:01 +0900
Subject: [PATCH 116/398] Fixed a bug that an assertion fails in the trainer.
 if the SkipLoading is false.

Fixes #128
---
 src/learn/learn.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 0459dd90..46c6a9dc 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -1988,7 +1988,13 @@ namespace Learner
         Eval::NNUE::SetBatchSize(nn_batch_size);
         Eval::NNUE::SetOptions(nn_options);
         if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
-            learn_think.best_nn_directory = std::string(Options["EvalDir"]);
+            // Save the current net to [EvalDir]\original.
+            Eval::save_eval("original");
+
+            // Set the folder above to best_nn_directory so that the trainer can
+            // resotre the network parameters from the original net file.
+            learn_think.best_nn_directory =
+                Path::Combine(Options["EvalSaveDir"], "original");
         }
 
         cout << "init done." << endl;

From a94a076e3925dcb47cc6a24182d35d01267642a4 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sun, 13 Sep 2020 16:35:52 +0900
Subject: [PATCH 117/398] Fixed a comment.

---
 src/learn/learn.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 46c6a9dc..eaabc524 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -1988,7 +1988,7 @@ namespace Learner
         Eval::NNUE::SetBatchSize(nn_batch_size);
         Eval::NNUE::SetOptions(nn_options);
         if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
-            // Save the current net to [EvalDir]\original.
+            // Save the current net to [EvalSaveDir]\original.
             Eval::save_eval("original");
 
             // Set the folder above to best_nn_directory so that the trainer can

From 3ea2d5ef6198ac43e9beaae60bad8fd6f4e071f2 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sun, 13 Sep 2020 08:34:22 +0200
Subject: [PATCH 118/398] Remove use of non-existent EvalDir option.

additionally allow all options to be converted to string.
Without this, restoring of the options (multi_think.cpp:117) can't work.

fixes https://github.com/nodchip/Stockfish/issues/128

Now gensfen/learn pass with debug=yes in CI
---
 .travis.yml         | 2 +-
 src/learn/learn.cpp | 3 ---
 src/ucioption.cpp   | 2 +-
 3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 608d22c1..fee1bed2 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -109,4 +109,4 @@ script:
   # NNUE testing
   - export CXXFLAGS="-O1 -fno-inline"
   - make clean && make -j2 ARCH=x86-64-modern debug=no optimize=no learn > /dev/null && ../tests/instrumented_learn.sh --valgrind
-  - make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=no learn > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined
+  - make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes learn > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 0459dd90..67b186b3 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -1987,9 +1987,6 @@ namespace Learner
         Eval::NNUE::InitializeTraining(eta1, eta1_epoch, eta2, eta2_epoch, eta3);
         Eval::NNUE::SetBatchSize(nn_batch_size);
         Eval::NNUE::SetOptions(nn_options);
-        if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
-            learn_think.best_nn_directory = std::string(Options["EvalDir"]);
-        }
 
         cout << "init done." << endl;
 
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 91fa199b..1a80efff 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -165,7 +165,7 @@ Option::operator double() const {
 }
 
 Option::operator std::string() const {
-  assert(type == "string");
+  assert(type == "check" || type == "spin" || type == "combo" || type == "button" || type == "string");
   return currentValue;
 }
 

From fb877c2c3ec28ca4bd4d8586f3028ebb6f2cd6ad Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 13 Sep 2020 12:14:35 +0200
Subject: [PATCH 119/398] Add some building instructions to readme.

---
 README.md | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/README.md b/README.md
index 6d28a998..cdcda0d4 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,31 @@
 ## Overview
 Stockfish NNUE is a port of a shogi neural network named NNUE (efficiently updateable neural network backwards) to Stockfish 11. To learn more about the Stockfish chess engine, look [here](stockfish.md) for an overview and [here](https://github.com/official-stockfish/Stockfish) for the official repository.
 
+## Building
+To compile:
+```
+make -jN ARCH=... build
+```
+
+To compile with Profile Guided Optimizations. Requires that the computer that is used for compilation supports the selected `ARCH`.
+```
+make -jN ARCH=... profile-build
+```
+
+`N` is the number of threads to use for compilation.
+
+`ARCH` is one of:
+`x86-64-vnni512`, `x86-64-vnni256`, `x86-64-avx512`, `x86-64-bmi2`, `x86-64-avx2`,
+`x86-64-sse41-popcnt`, `x86-64-modern`, `x86-64-ssse3`, `x86-64-sse3-popcnt`,
+`x86-64`, `x86-32-sse41-popcnt`, `x86-32-sse2`, `x86-32`, `ppc-64`, `ppc-32,
+armv7`, `armv7-neon`, `armv8`, `apple-silicon`, `general-64`, `general-32`.
+
+`ARCH` needs to be chosen based based on the instruction set of the CPU that will run stockfish. `x86-64-modern` will produce a binary that works on most common processors, but other options may increase performance for specific hardware.
+
+Additional options:
+
+- `blas=[yes/no]` - whether to use an external BLAS library. Default is `no`. Using an external BLAS library may have a significantly improve learning performance and by default expects openBLAS to be installed.
+
 ## Training Guide
 ### Generating Training Data
 To generate training data from the classic eval, use the gensfen command with the setting "Use NNUE" set to "false". The given example is generation in its simplest form. There are more commands. 

From bd434b80c677966865c2e343658aec98c2966415 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 13 Sep 2020 13:40:56 +0200
Subject: [PATCH 120/398] debug=yes for last CI test

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 204f2657..9dad6b1d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -100,4 +100,4 @@ script:
   # NNUE testing
   - export CXXFLAGS="-O1 -fno-inline"
   - make clean && make -j2 ARCH=x86-64-modern debug=no optimize=no build > /dev/null && ../tests/instrumented_learn.sh --valgrind
-  - make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=no build > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined
+  - make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined

From 9ee8ce67bf6b0fa681ca7c29b5c33e52105f087e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 13 Sep 2020 13:42:13 +0200
Subject: [PATCH 121/398] Move removal of generate training data file to
 profileclean.

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 5477d68e..3e10702f 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -758,7 +758,6 @@ profile-build: net config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 4/4. Deleting profile data ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) profileclean
-	rm generated_kifu.bin
 
 strip:
 	$(STRIP) $(EXE)
@@ -805,6 +804,7 @@ profileclean:
 	@rm -rf profdir
 	@rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda *.s ./learn/*.gcda ./extra/*.gcda ./eval/*.gcda
 	@rm -f stockfish.profdata *.profraw
+	@rm generated_kifu.bin
 
 default:
 	help

From e4a4f4001fe91604fed4ad01b1429d4674168aed Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 13 Sep 2020 13:44:19 +0200
Subject: [PATCH 122/398] parametrize the name of the training data file
 generated during pgo

---
 src/Makefile | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 3e10702f..982df26b 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -39,8 +39,9 @@ PREFIX = /usr/local
 BINDIR = $(PREFIX)/bin
 
 ### Built-in benchmark for pgo-builds
+PGO_TRAINING_DATA_FILE = pgo_training_data
 PGOBENCH = ./$(EXE) bench
-PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000
+PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 output_file_name $(PGO_TRAINING_DATA_FILE)
 
 ### Source and object files
 SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
@@ -804,7 +805,7 @@ profileclean:
 	@rm -rf profdir
 	@rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda *.s ./learn/*.gcda ./extra/*.gcda ./eval/*.gcda
 	@rm -f stockfish.profdata *.profraw
-	@rm generated_kifu.bin
+	@rm $(PGO_TRAINING_DATA_FILE)
 
 default:
 	help

From 2e2de7607bbb958e699bb2e76a60ad36b912f5b0 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 13 Sep 2020 13:47:19 +0200
Subject: [PATCH 123/398] Add extension to the PGO_TRAINING_DATA_FILE so that
 the generated file name matches the one we try to delete.

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 982df26b..499e8d78 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -39,7 +39,7 @@ PREFIX = /usr/local
 BINDIR = $(PREFIX)/bin
 
 ### Built-in benchmark for pgo-builds
-PGO_TRAINING_DATA_FILE = pgo_training_data
+PGO_TRAINING_DATA_FILE = pgo_training_data.bin
 PGOBENCH = ./$(EXE) bench
 PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 output_file_name $(PGO_TRAINING_DATA_FILE)
 

From 89f38c938bac12171abe5d778efd4857b478693b Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 13 Sep 2020 13:52:42 +0200
Subject: [PATCH 124/398] Don't prompt when the training data file doesn't
 exist when trying to delete it

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 499e8d78..69517c3c 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -805,7 +805,7 @@ profileclean:
 	@rm -rf profdir
 	@rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda *.s ./learn/*.gcda ./extra/*.gcda ./eval/*.gcda
 	@rm -f stockfish.profdata *.profraw
-	@rm $(PGO_TRAINING_DATA_FILE)
+	@rm -f $(PGO_TRAINING_DATA_FILE)
 
 default:
 	help

From 30a1bc4c64e0cf41269c34868b457ed6b4b5acb5 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 13 Sep 2020 14:19:30 +0200
Subject: [PATCH 125/398] Change default value of "PruneAtShallowDepthOnPvNode"
 so that the bench matches master.

---
 src/search.cpp    | 2 +-
 src/ucioption.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index f8cf3cbc..7c6f8ace 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -54,7 +54,7 @@ using std::string;
 using Eval::evaluate;
 using namespace Search;
 
-bool Search::prune_at_shallow_depth_on_pv_node = false;
+bool Search::prune_at_shallow_depth_on_pv_node = true;
 
 namespace {
 
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index e4a26098..06298596 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -101,7 +101,7 @@ void init(OptionsMap& o) {
   // Automatically create a folder under this folder like "0/", "1/", ... and save the evaluation function file there.
   o["EvalSaveDir"] << Option("evalsave");
   // Prune at shallow depth on PV nodes. Setting this value to true gains elo in shallow search.
-  o["PruneAtShallowDepthOnPvNode"] << Option(false, on_prune_at_shallow_depth_on_pv_node);
+  o["PruneAtShallowDepthOnPvNode"] << Option(true, on_prune_at_shallow_depth_on_pv_node);
   // Enable transposition table.
   o["EnableTranspositionTable"] << Option(true, on_enable_transposition_table);
 }

From 5d088e02c8046c04536f00ffa2298b5982d153c0 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sun, 13 Sep 2020 18:16:04 +0200
Subject: [PATCH 126/398] add convert_plain to CI

---
 tests/instrumented_learn.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
index 71f9421c..7f76fd76 100755
--- a/tests/instrumented_learn.sh
+++ b/tests/instrumented_learn.sh
@@ -80,6 +80,8 @@ cat << EOF > gensfen01.exp
  send "isready\n"
  send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.bin use_raw_nnue_eval 0 sfen_format bin\n"
  expect "gensfen finished."
+ send "learn training_data/training_data.bin convert_plain output_file_name training_data.txt\n"
+ expect "all done"
  send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.binpack use_raw_nnue_eval 0 sfen_format binpack\n"
  expect "gensfen finished."
 

From 35ab8254b70f62a4e0138c475fad0c77dcc0af2d Mon Sep 17 00:00:00 2001
From: mckx00 <mckx00@gmail.com>
Date: Sun, 13 Sep 2020 19:28:32 -0700
Subject: [PATCH 127/398] Simplify StatSCore Initialization

No need to initialize StatScore at rootNode. Current Logic is redundant because at subsequent levels the grandchildren statScore is initialized to zero.

closes https://github.com/official-stockfish/Stockfish/pull/3122

Non functional change.
---
 src/search.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 4aeadc28..07c491b6 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -654,9 +654,7 @@ namespace {
     // starts with statScore = 0. Later grandchildren start with the last calculated
     // statScore of the previous grandchild. This influences the reduction rules in
     // LMR which are based on the statScore of parent position.
-    if (rootNode)
-        (ss+4)->statScore = 0;
-    else
+    if (!rootNode)
         (ss+2)->statScore = 0;
 
     // Step 4. Transposition table lookup. We don't want the score of a partial

From 7135678f71b7f6ee32e92b8dbef2b16b403d8ea9 Mon Sep 17 00:00:00 2001
From: Sergio Vieri <sergio.vieri.hp@gmail.com>
Date: Mon, 14 Sep 2020 17:24:05 +0800
Subject: [PATCH 128/398] Update default net to nn-03744f8d56d8.nnue

Equivalent to 20200914-1520

closes https://github.com/official-stockfish/Stockfish/pull/3123

Bench: 4222126
---
 src/evaluate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/evaluate.h b/src/evaluate.h
index 3da6a9fe..c723bd8f 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -38,7 +38,7 @@ namespace Eval {
   // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work. Do not change the
   // name of the macro, as it is used in the Makefile.
-  #define EvalFileDefaultName   "nn-308d71810dff.nnue"
+  #define EvalFileDefaultName   "nn-03744f8d56d8.nnue"
 
   namespace NNUE {
 

From d160436921dec1675e18b8a2d2a1da1693002588 Mon Sep 17 00:00:00 2001
From: Joseph Ellis <jhellis3@gmail.com>
Date: Tue, 15 Sep 2020 15:02:44 -0500
Subject: [PATCH 129/398] Update description for PruneAtShallowDepthOnPvNode

---
 src/ucioption.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 06298596..dde3844a 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -100,7 +100,7 @@ void init(OptionsMap& o) {
   // Evalsave by default. This folder shall be prepared in advance.
   // Automatically create a folder under this folder like "0/", "1/", ... and save the evaluation function file there.
   o["EvalSaveDir"] << Option("evalsave");
-  // Prune at shallow depth on PV nodes. Setting this value to true gains elo in shallow search.
+  // Prune at shallow depth on PV nodes. False is recommended when using fixed depth search.
   o["PruneAtShallowDepthOnPvNode"] << Option(true, on_prune_at_shallow_depth_on_pv_node);
   // Enable transposition table.
   o["EnableTranspositionTable"] << Option(true, on_enable_transposition_table);

From 6ae09ba266021a61afe8f5a7b7a0d82f6609c8f6 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 14 Sep 2020 19:11:57 +0900
Subject: [PATCH 130/398] Fixed a bug that the root color is wrong.

---
 src/learn/learn.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 753efafa..70459963 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -842,6 +842,8 @@ namespace Learner
         // EvalHash has been disabled in advance. (If not, the same value will be returned every time)
         const auto [_, pv] = qsearch(task_pos);
 
+        const auto rootColor = task_pos.side_to_move();
+
         std::vector<StateInfo, AlignedAllocator<StateInfo>> states(pv.size());
         for (size_t i = 0; i < pv.size(); ++i)
         {
@@ -849,7 +851,6 @@ namespace Learner
             Eval::NNUE::update_eval(task_pos);
         }
 
-        const auto rootColor = task_pos.side_to_move();
         const Value shallow_value =
             (rootColor == task_pos.side_to_move())
             ? Eval::evaluate(task_pos)

From bc9be5a71fd9cc81f1761b5f0a827461bb15ffd3 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 16 Sep 2020 14:22:39 +0200
Subject: [PATCH 131/398] Allow setting PRNG seed

---
 src/misc.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/src/misc.h b/src/misc.h
index 4c04d3f0..7537624c 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -19,6 +19,7 @@
 #ifndef MISC_H_INCLUDED
 #define MISC_H_INCLUDED
 
+#include <algorithm>
 #include <cassert>
 #include <chrono>
 #include <functional>
@@ -28,6 +29,7 @@
 #include <vector>
 #include <utility>
 #include <cmath>
+#include <cctype>
 
 #include "types.h"
 
@@ -85,6 +87,19 @@ std::ostream& operator<<(std::ostream&, SyncCout);
 /// For further analysis see
 ///   <http://vigna.di.unimi.it/ftp/papers/xorshift.pdf>
 
+static uint64_t string_hash(const std::string& str)
+{
+  uint64_t h = 525201411107845655ull;
+
+  for (auto c : str) {
+    h ^= static_cast<uint64_t>(c);
+    h *= 0x5bd1e9955bd1e995ull;
+    h ^= h >> 47;
+  }
+
+  return h;
+}
+
 class PRNG {
 
   uint64_t s;
@@ -109,6 +124,19 @@ public:
 
   // Return the random seed used internally.
   uint64_t get_seed() const { return s; }
+
+  void set_seed(uint64_t seed) { s = seed; }
+
+  void set_seed(const std::string& str)
+  {
+    if (std::all_of(str.begin(), str.end(), std::isdigit)) {
+      set_seed(std::stoull(str));
+    }
+    else
+    {
+      set_seed(string_hash(str));
+    }
+  }
 };
 
 // Display a random seed. (For debugging)

From efca5d561fcb7f685962d6d32fd5be8aac7a7f8f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 16 Sep 2020 14:38:54 +0200
Subject: [PATCH 132/398] More PRNG seeding options

---
 src/misc.h | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/misc.h b/src/misc.h
index 7537624c..5b7c8870 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -111,7 +111,9 @@ class PRNG {
   }
 
 public:
+  PRNG() { set_seed_from_time(); }
   PRNG(uint64_t seed) : s(seed) { assert(seed); }
+  PRNG(const std::string& seed) { set_seed(seed); }
 
   template<typename T> T rand() { return T(rand64()); }
 
@@ -127,9 +129,18 @@ public:
 
   void set_seed(uint64_t seed) { s = seed; }
 
+  void set_seed_from_time()
+  {
+      set_seed(std::chrono::system_clock::now().time_since_epoch().count());
+  }
+
   void set_seed(const std::string& str)
   {
-    if (std::all_of(str.begin(), str.end(), std::isdigit)) {
+    if (str.empty())
+    {
+      set_seed_from_time();
+    }
+    else if (std::all_of(str.begin(), str.end(), [](char c) { return std::isdigit(c);} )) {
       set_seed(std::stoull(str));
     }
     else
@@ -196,7 +207,9 @@ int write_memory_to_file(std::string filename, void* ptr, uint64_t size);
 // async version of PRNG
 struct AsyncPRNG
 {
+  AsyncPRNG() : prng() { }
   AsyncPRNG(uint64_t seed) : prng(seed) { assert(seed); }
+  AsyncPRNG(const std::string& seed) : prng(seed) { }
   // [ASYNC] Extract one random number.
   template<typename T> T rand() {
     std::unique_lock<std::mutex> lk(mutex);

From 184bde47dc0b1703bc03177c467e735f156fb273 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 16 Sep 2020 14:43:21 +0200
Subject: [PATCH 133/398] Add "seed" option to gensfen and learn

---
 src/learn/gensfen.cpp   | 10 +++++++---
 src/learn/learn.cpp     | 33 ++++++++++++++++++---------------
 src/learn/multi_think.h | 11 +++++++----
 3 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index afbcce37..f7cc5669 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -355,7 +355,8 @@ namespace Learner
         // It must be 2**N because it will be used as the mask to calculate hash_index.
         static_assert((GENSFEN_HASH_SIZE& (GENSFEN_HASH_SIZE - 1)) == 0);
 
-        MultiThinkGenSfen(int search_depth_min_, int search_depth_max_, SfenWriter& sw_) :
+        MultiThinkGenSfen(int search_depth_min_, int search_depth_max_, SfenWriter& sw_, const std::string& seed) :
+            MultiThink(seed),
             search_depth_min(search_depth_min_),
             search_depth_max(search_depth_max_),
             sfen_writer(sw_)
@@ -1055,6 +1056,7 @@ namespace Learner
         bool random_file_name = false;
 
         std::string sfen_format;
+        std::string seed;
 
         while (true)
         {
@@ -1111,6 +1113,8 @@ namespace Learner
                 is >> detect_draw_by_insufficient_mating_material;
             else if (token == "sfen_format")
                 is >> sfen_format;
+            else if (token == "seed")
+                is >> seed;
             else
                 cout << "Error! : Illegal token " << token << endl;
         }
@@ -1137,7 +1141,7 @@ namespace Learner
         {
             // Give a random number to output_file_name at this point.
             // Do not use std::random_device().  Because it always the same integers on MinGW.
-            PRNG r(std::chrono::system_clock::now().time_since_epoch().count());
+            PRNG r(seed);
             // Just in case, reassign the random numbers.
             for (int i = 0; i < 10; ++i)
                 r.rand(1);
@@ -1182,7 +1186,7 @@ namespace Learner
             SfenWriter sfen_writer(output_file_name, thread_num);
             sfen_writer.set_save_interval(save_every);
 
-            MultiThinkGenSfen multi_think(search_depth_min, search_depth_max, sfen_writer);
+            MultiThinkGenSfen multi_think(search_depth_min, search_depth_max, sfen_writer, seed);
             multi_think.nodes = nodes;
             multi_think.set_loop_max(loop_max);
             multi_think.eval_limit = eval_limit;
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 70459963..6d0a777d 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -432,8 +432,8 @@ namespace Learner
 
         // Do not use std::random_device().
         // Because it always the same integers on MinGW.
-        SfenReader(int thread_num) :
-            prng(std::chrono::system_clock::now().time_since_epoch().count())
+        SfenReader(int thread_num, const std::string& seed) :
+            prng(seed)
         {
             packed_sfens.resize(thread_num);
             total_read = 0;
@@ -742,7 +742,8 @@ namespace Learner
     // Class to generate sfen with multiple threads
     struct LearnerThink : public MultiThink
     {
-        LearnerThink(SfenReader& sr_) :
+        LearnerThink(SfenReader& sr_, const std::string& seed) :
+            MultiThink(seed),
             sr(sr_),
             stop_flag(false),
             save_only_once(false)
@@ -1437,7 +1438,7 @@ namespace Learner
 
     // Subcontracting the teacher shuffle "learn shuffle" command.
     // output_file_name: name of the output file where the shuffled teacher positions will be written
-    void shuffle_files(const vector<string>& filenames, const string& output_file_name, uint64_t buffer_size)
+    void shuffle_files(const vector<string>& filenames, const string& output_file_name, uint64_t buffer_size, const std::string& seed)
     {
         // The destination folder is
         // tmp/ for temporary writing
@@ -1460,7 +1461,7 @@ namespace Learner
 
         // random number to shuffle
         // Do not use std::random_device().  Because it always the same integers on MinGW.
-        PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
+        PRNG prng(seed);
 
         // generate the name of the temporary file
         auto make_filename = [](uint64_t i)
@@ -1533,11 +1534,11 @@ namespace Learner
     // Subcontracting the teacher shuffle "learn shuffleq" command.
     // This is written in 1 pass.
     // output_file_name: name of the output file where the shuffled teacher positions will be written
-    void shuffle_files_quick(const vector<string>& filenames, const string& output_file_name)
+    void shuffle_files_quick(const vector<string>& filenames, const string& output_file_name, const std::string& seed)
     {
         // random number to shuffle
         // Do not use std::random_device().  Because it always the same integers on MinGW.
-        PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
+        PRNG prng(seed);
 
         // number of files
         const size_t file_count = filenames.size();
@@ -1573,7 +1574,7 @@ namespace Learner
 
     // Subcontracting the teacher shuffle "learn shufflem" command.
     // Read the whole memory and write it out with the specified file name.
-    void shuffle_files_on_memory(const vector<string>& filenames, const string output_file_name)
+    void shuffle_files_on_memory(const vector<string>& filenames, const string output_file_name, const std::string& seed)
     {
         PSVector buf;
 
@@ -1591,7 +1592,7 @@ namespace Learner
 
         // shuffle from buf[0] to buf[size-1]
         // Do not use std::random_device().  Because it always the same integers on MinGW.
-        PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
+        PRNG prng(seed);
         uint64_t size = (uint64_t)buf.size();
         std::cout << "shuffle buf.size() = " << size << std::endl;
 
@@ -1613,9 +1614,7 @@ namespace Learner
     void learn(Position&, istringstream& is)
     {
         const auto thread_num = (int)Options["Threads"];
-        SfenReader sr(thread_num);
 
-        LearnerThink learn_think(sr);
         vector<string> filenames;
 
         // mini_batch_size 1M aspect by default. This can be increased.
@@ -1704,6 +1703,7 @@ namespace Learner
         uint64_t mirror_percentage = 0;
 
         string validation_set_file_name;
+        string seed;
 
         // Assume the filenames are staggered.
         while (true)
@@ -1811,7 +1811,7 @@ namespace Learner
             else if (option == "dest_score_min_value") is >> dest_score_min_value;
             else if (option == "dest_score_max_value") is >> dest_score_max_value;
             else if (option == "convert_teacher_signal_to_winning_probability") is >> convert_teacher_signal_to_winning_probability;
-
+            else if (option == "seed") is >> seed;
             // Otherwise, it's a filename.
             else
                 filenames.push_back(option);
@@ -1829,6 +1829,9 @@ namespace Learner
         cout << "Warning! OpenMP disabled." << endl;
 #endif
 
+        SfenReader sr(thread_num, seed);
+        LearnerThink learn_think(sr, seed);
+
         // Display learning game file
         if (target_dir != "")
         {
@@ -1861,21 +1864,21 @@ namespace Learner
         {
             cout << "buffer_size     : " << buffer_size << endl;
             cout << "shuffle mode.." << endl;
-            shuffle_files(filenames, output_file_name, buffer_size);
+            shuffle_files(filenames, output_file_name, buffer_size, seed);
             return;
         }
 
         if (shuffle_quick)
         {
             cout << "quick shuffle mode.." << endl;
-            shuffle_files_quick(filenames, output_file_name);
+            shuffle_files_quick(filenames, output_file_name, seed);
             return;
         }
 
         if (shuffle_on_memory)
         {
             cout << "shuffle on memory.." << endl;
-            shuffle_files_on_memory(filenames, output_file_name);
+            shuffle_files_on_memory(filenames, output_file_name, seed);
             return;
         }
 
diff --git a/src/learn/multi_think.h b/src/learn/multi_think.h
index 7de9d6b9..4b5662aa 100644
--- a/src/learn/multi_think.h
+++ b/src/learn/multi_think.h
@@ -10,6 +10,8 @@
 #include <limits>
 #include <functional>
 #include <mutex>
+#include <string>
+#include <cstdint>
 
 
 // Learning from a game record, when making yourself think and generating a fixed track, etc.
@@ -19,10 +21,11 @@ struct MultiThink
 {
 	static constexpr std::uint64_t LOOP_COUNT_FINISHED = std::numeric_limits<std::uint64_t>::max();
 
-	MultiThink() : prng(std::chrono::system_clock::now().time_since_epoch().count())
-	{
-		loop_count = 0;
-	}
+	MultiThink() : prng{}, loop_count(0) { }
+
+	MultiThink(std::uint64_t seed) : prng(seed), loop_count(0) { }
+
+	MultiThink(const std::string& seed) : prng(seed), loop_count(0) { }
 
 	// Call this function from the master thread, each thread will think,
 	// Return control when the thought ending condition is satisfied.

From 5f426d8667feda65eaf1eca699f629d31e170d43 Mon Sep 17 00:00:00 2001
From: xoto10 <me@example.com>
Date: Thu, 10 Sep 2020 21:10:57 +0100
Subject: [PATCH 134/398] Use 2 * bestMoveChanges.

NNUE appears to provide a more stable eval than the classic eval,
so the time use dependencies on bestMoveChanges, fallingEval,
etc may need to change to make the best use of available time.
This change doubles the effect of totBestMoveChanges when giving
more time because the choice of best move is unstable.

STC:
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 101928 W: 11995 L: 11698 D: 78235 Elo +0.78
Ptnml(0-2): 592, 8707, 32103, 8936, 626
https://tests.stockfishchess.org/tests/view/5f538a462d02727c56b36cec

LTC:
LLR: 2.94 (-2.94,2.94) {0.25,1.25}
Total: 186392 W: 10383 L: 9877 D: 166132 Elo +0.81
Ptnml(0-2): 207, 8370, 75539, 8870, 210
https://tests.stockfishchess.org/tests/view/5f54a9712d02727c56b36d5a

closes https://github.com/official-stockfish/Stockfish/pull/3119

Bench 4222126
---
 src/search.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/search.cpp b/src/search.cpp
index 07c491b6..c7d2efd4 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -520,7 +520,7 @@ void Thread::search() {
               totBestMoveChanges += th->bestMoveChanges;
               th->bestMoveChanges = 0;
           }
-          double bestMoveInstability = 1 + totBestMoveChanges / Threads.size();
+          double bestMoveInstability = 1 + 2 * totBestMoveChanges / Threads.size();
 
           double totalTime = rootMoves.size() == 1 ? 0 :
                              Time.optimum() * fallingEval * reduction * bestMoveInstability;

From d86663af141f1256bfc32ab95891e944d84e8755 Mon Sep 17 00:00:00 2001
From: syzygy1 <3028851+syzygy1@users.noreply.github.com>
Date: Sun, 13 Sep 2020 20:16:52 +0200
Subject: [PATCH 135/398] Improve NDK section in Makefile

This PR sets the "comp" variable simply to "clang",
which seems to be more consistent and allows a small simplification.

The PR also moves the section that sets "profile_make" and "profile_use" to after the NDK section,
which ensures that these variables are now set correctly for NDK/clang.

closes https://github.com/official-stockfish/Stockfish/pull/3121

No functional change
---
 src/Makefile | 32 +++++++++++++-------------------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 340b3008..54868b39 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -381,19 +381,6 @@ ifeq ($(COMP),clang)
 	endif
 endif
 
-ifeq ($(comp),icc)
-	profile_make = icc-profile-make
-	profile_use = icc-profile-use
-else
-ifeq ($(comp),clang)
-	profile_make = clang-profile-make
-	profile_use = clang-profile-use
-else
-	profile_make = gcc-profile-make
-	profile_use = gcc-profile-use
-endif
-endif
-
 ifeq ($(KERNEL),Darwin)
 	CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.14
 	LDFLAGS += -arch $(arch) -mmacosx-version-min=10.14
@@ -405,20 +392,30 @@ endif
 # Currently we don't know how to make PGO builds with the NDK yet.
 ifeq ($(COMP),ndk)
 	CXXFLAGS += -stdlib=libc++ -fPIE
+	comp=clang
 	ifeq ($(arch),armv7)
-		comp=armv7a-linux-androideabi16-clang
 		CXX=armv7a-linux-androideabi16-clang++
 		CXXFLAGS += -mthumb -march=armv7-a -mfloat-abi=softfp -mfpu=neon
 		STRIP=arm-linux-androideabi-strip
 	endif
 	ifeq ($(arch),armv8)
-		comp=aarch64-linux-android21-clang
 		CXX=aarch64-linux-android21-clang++
 		STRIP=aarch64-linux-android-strip
 	endif
 	LDFLAGS += -static-libstdc++ -pie -lm -latomic
 endif
 
+ifeq ($(comp),icc)
+	profile_make = icc-profile-make
+	profile_use = icc-profile-use
+else ifeq ($(comp),clang)
+	profile_make = clang-profile-make
+	profile_use = clang-profile-use
+else
+	profile_make = gcc-profile-make
+	profile_use = gcc-profile-use
+endif
+
 ### Travis CI script uses COMPILER to overwrite CXX
 ifdef COMPILER
 	COMPCXX=$(COMPILER)
@@ -590,10 +587,7 @@ endif
 ### needs access to the optimization flags.
 ifeq ($(optimize),yes)
 ifeq ($(debug), no)
-	ifeq ($(COMP),ndk)
-		CXXFLAGS += -flto=thin
-		LDFLAGS += $(CXXFLAGS)
-	else ifeq ($(comp),clang)
+	ifeq ($(comp),clang)
 		CXXFLAGS += -flto=thin
 		ifneq ($(findstring MINGW,$(KERNEL)),)
 			CXXFLAGS += -fuse-ld=lld

From df43805953b241f95c246ff3e96aece76b518590 Mon Sep 17 00:00:00 2001
From: GoldenRare <deshawnmohan@hotmail.com>
Date: Thu, 10 Sep 2020 00:24:40 -0400
Subject: [PATCH 136/398] Added FEN string to bench output

fixes https://github.com/official-stockfish/Stockfish/pull/3117

closes https://github.com/official-stockfish/Stockfish/pull/3118

No functional change
---
 AUTHORS     | 1 +
 src/uci.cpp | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/AUTHORS b/AUTHORS
index c00ab657..198dfa5a 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -63,6 +63,7 @@ Gary Heckman (gheckman)
 George Sobala (gsobala)
 gguliash
 Gian-Carlo Pascutto (gcp)
+Deshawn Mohan-Smith (GoldenRare)
 Gontran Lemaire (gonlem)
 Goodkov Vasiliy Aleksandrovich (goodkov)
 Gregor Cramer
diff --git a/src/uci.cpp b/src/uci.cpp
index bc0ee0a0..3f3cc458 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -170,7 +170,7 @@ namespace {
 
         if (token == "go" || token == "eval")
         {
-            cerr << "\nPosition: " << cnt++ << '/' << num << endl;
+            cerr << "\nPosition: " << cnt++ << '/' << num << " (" << pos.fen() << ")" << endl;
             if (token == "go")
             {
                go(pos, is, states);

From 0ca93c5b94b820a41e2850ede084096120128a28 Mon Sep 17 00:00:00 2001
From: Unai Corzo <corzounai@gmail.com>
Date: Wed, 16 Sep 2020 19:14:32 +0200
Subject: [PATCH 137/398] Remove castling extension

STC https://tests.stockfishchess.org/tests/view/5f5fa5348fbc1c8a3f476eca
LLR: 2.94 (-2.94,2.94) {-1.25,0.25}
Total: 38520 W: 4713 L: 4610 D: 29197
Ptnml(0-2): 233, 3486, 11734, 3559, 248

LTC https://tests.stockfishchess.org/tests/view/5f62166a912c15f19854b806
LLR: 2.93 (-2.94,2.94) {-0.75,0.25}
Total: 48024 W: 2673 L: 2600 D: 42751
Ptnml(0-2): 64, 2247, 19316, 2322, 63

closes https://github.com/official-stockfish/Stockfish/pull/3128

bench: 3818400
---
 src/search.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index c7d2efd4..17cd0a73 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1127,11 +1127,6 @@ moves_loop: // When in check, search starts from here
                && pos.non_pawn_material() <= 2 * RookValueMg)
           extension = 1;
 
-      // Castling extension
-      if (   type_of(move) == CASTLING
-          && popcount(pos.pieces(us) & ~pos.pieces(PAWN) & (to_sq(move) & KingSide ? KingSide : QueenSide)) <= 2)
-          extension = 1;
-
       // Late irreversible move extension
       if (   move == ttMove
           && pos.rule50_count() > 80

From 64a63464d7bc72a3aac33aa680cd2b2b240ff903 Mon Sep 17 00:00:00 2001
From: Unai Corzo <corzounai@gmail.com>
Date: Wed, 16 Sep 2020 20:42:38 +0200
Subject: [PATCH 138/398] Simplify futility pruning for captures

STC https://tests.stockfishchess.org/tests/view/5f61f0e4b91f2ec371e429c2
LLR: 2.94 (-2.94,2.94) {-1.25,0.25}
Total: 75512 W: 8747 L: 8704 D: 58061
Ptnml(0-2): 440, 6589, 23683, 6576, 468

LTC https://tests.stockfishchess.org/tests/view/5f6215d3912c15f19854b801
LLR: 2.95 (-2.94,2.94) {-0.75,0.25}
Total: 92912 W: 5030 L: 4992 D: 82890
Ptnml(0-2): 88, 4363, 37532, 4369, 104

closes https://github.com/official-stockfish/Stockfish/pull/3129

bench: 3856086
---
 src/search.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/search.cpp b/src/search.cpp
index 17cd0a73..9c5fb58b 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1056,7 +1056,6 @@ moves_loop: // When in check, search starts from here
               if (   !givesCheck
                   && lmrDepth < 6
                   && !(PvNode && abs(bestValue) < 2)
-                  && PieceValue[MG][type_of(movedPiece)] >= PieceValue[MG][type_of(pos.piece_on(to_sq(move)))]
                   && !ss->inCheck
                   && ss->staticEval + 169 + 244 * lmrDepth
                      + PieceValue[MG][type_of(pos.piece_on(to_sq(move)))] <= alpha)

From 8b8a510fd6a1a17b39b2d4b166f60ac7be0dab23 Mon Sep 17 00:00:00 2001
From: syzygy1 <3028851+syzygy1@users.noreply.github.com>
Date: Wed, 16 Sep 2020 17:39:11 +0200
Subject: [PATCH 139/398] Use tiling to speed up accumulator refreshes and
 updates

Perform the update and refresh operations tile by tile in a local
array of vectors. By selecting the array size carefully, we
achieve that the compiler keeps the whole array in vector registers.

Idea and original implementation by @sf-x.

STC: https://tests.stockfishchess.org/tests/view/5f623eec912c15f19854b855
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 4872 W: 623 L: 477 D: 3772
Ptnml(0-2): 14, 350, 1585, 450, 37

LTC: https://tests.stockfishchess.org/tests/view/5f62434e912c15f19854b860
LLR: 2.94 (-2.94,2.94) {0.25,1.25}
Total: 25808 W: 1565 L: 1401 D: 22842
Ptnml(0-2): 23, 1186, 10332, 1330, 33

closes https://github.com/official-stockfish/Stockfish/pull/3130

No functional change
---
 src/nnue/nnue_feature_transformer.h | 233 +++++++++++++++-------------
 1 file changed, 125 insertions(+), 108 deletions(-)

diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 2b6259c3..e71ee60d 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -29,6 +29,56 @@
 
 namespace Eval::NNUE {
 
+  // If vector instructions are enabled, we update and refresh the
+  // accumulator tile by tile such that each tile fits in the CPU's
+  // vector registers.
+  #define TILING
+
+  #ifdef USE_AVX512
+  typedef __m512i vec_t;
+  #define vec_load(a) _mm512_loadA_si512(a)
+  #define vec_store(a,b) _mm512_storeA_si512(a,b)
+  #define vec_add_16(a,b) _mm512_add_epi16(a,b)
+  #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
+  static constexpr IndexType kNumRegs = 8; // only 8 are needed
+
+  #elif USE_AVX2
+  typedef __m256i vec_t;
+  #define vec_load(a) _mm256_loadA_si256(a)
+  #define vec_store(a,b) _mm256_storeA_si256(a,b)
+  #define vec_add_16(a,b) _mm256_add_epi16(a,b)
+  #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
+  static constexpr IndexType kNumRegs = 16;
+
+  #elif USE_SSE2
+  typedef __m128i vec_t;
+  #define vec_load(a) (*(a))
+  #define vec_store(a,b) *(a)=(b)
+  #define vec_add_16(a,b) _mm_add_epi16(a,b)
+  #define vec_sub_16(a,b) _mm_sub_epi16(a,b)
+  static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;
+
+  #elif USE_MMX
+  typedef __m64 vec_t;
+  #define vec_load(a) (*(a))
+  #define vec_store(a,b) *(a)=(b)
+  #define vec_add_16(a,b) _mm_add_pi16(a,b)
+  #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
+  static constexpr IndexType kNumRegs = 8;
+
+  #elif USE_NEON
+  typedef int16x8_t vec_t;
+  #define vec_load(a) (*(a))
+  #define vec_store(a,b) *(a)=(b)
+  #define vec_add_16(a,b) vaddq_s16(a,b)
+  #define vec_sub_16(a,b) vsubq_s16(a,b)
+  static constexpr IndexType kNumRegs = 16;
+
+  #else
+  #undef TILING
+
+  #endif
+
   // Input feature converter
   class FeatureTransformer {
 
@@ -36,6 +86,11 @@ namespace Eval::NNUE {
     // Number of output dimensions for one side
     static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
 
+    #ifdef TILING
+    static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;
+    static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
+    #endif
+
    public:
     // Output type
     using OutputType = TransformedFeatureType;
@@ -189,57 +244,41 @@ namespace Eval::NNUE {
       RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
                                        active_indices);
       for (Color perspective : { WHITE, BLACK }) {
+  #ifdef TILING
+        for (unsigned j = 0; j < kHalfDimensions / kTileHeight; ++j) {
+          auto biasesTile = reinterpret_cast<const vec_t*>(
+              &biases_[j * kTileHeight]);
+          auto accTile = reinterpret_cast<vec_t*>(
+              &accumulator.accumulation[perspective][i][j * kTileHeight]);
+          vec_t acc[kNumRegs];
+
+          for (unsigned k = 0; k < kNumRegs; ++k)
+            acc[k] = biasesTile[k];
+
+          for (const auto index : active_indices[perspective]) {
+            const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+            auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+            for (unsigned k = 0; k < kNumRegs; ++k)
+              acc[k] = vec_add_16(acc[k], column[k]);
+          }
+
+          for (unsigned k = 0; k < kNumRegs; k++)
+            vec_store(&accTile[k], acc[k]);
+        }
+  #else
         std::memcpy(accumulator.accumulation[perspective][i], biases_,
-                   kHalfDimensions * sizeof(BiasType));
+            kHalfDimensions * sizeof(BiasType));
+
         for (const auto index : active_indices[perspective]) {
           const IndexType offset = kHalfDimensions * index;
-  #if defined(USE_AVX512)
-          auto accumulation = reinterpret_cast<__m512i*>(
-              &accumulator.accumulation[perspective][i][0]);
-          auto column = reinterpret_cast<const __m512i*>(&weights_[offset]);
-          constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
-          for (IndexType j = 0; j < kNumChunks; ++j)
-            _mm512_storeA_si512(&accumulation[j], _mm512_add_epi16(_mm512_loadA_si512(&accumulation[j]), column[j]));
 
-  #elif defined(USE_AVX2)
-          auto accumulation = reinterpret_cast<__m256i*>(
-              &accumulator.accumulation[perspective][i][0]);
-          auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j)
-            _mm256_storeA_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadA_si256(&accumulation[j]), column[j]));
-
-  #elif defined(USE_SSE2)
-          auto accumulation = reinterpret_cast<__m128i*>(
-              &accumulator.accumulation[perspective][i][0]);
-          auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j)
-            accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
-
-  #elif defined(USE_MMX)
-          auto accumulation = reinterpret_cast<__m64*>(
-              &accumulator.accumulation[perspective][i][0]);
-          auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j)
-            accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
-
-  #elif defined(USE_NEON)
-          auto accumulation = reinterpret_cast<int16x8_t*>(
-              &accumulator.accumulation[perspective][i][0]);
-          auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
-          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j)
-            accumulation[j] = vaddq_s16(accumulation[j], column[j]);
-
-  #else
           for (IndexType j = 0; j < kHalfDimensions; ++j)
             accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-  #endif
-
         }
+  #endif
       }
+
   #if defined(USE_MMX)
       _mm_empty();
   #endif
@@ -257,29 +296,55 @@ namespace Eval::NNUE {
       bool reset[2];
       RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
                                         removed_indices, added_indices, reset);
-      for (Color perspective : { WHITE, BLACK }) {
 
-  #if defined(USE_AVX2)
-        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-        auto accumulation = reinterpret_cast<__m256i*>(
-            &accumulator.accumulation[perspective][i][0]);
+  #ifdef TILING
+      for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
+        for (Color perspective : { WHITE, BLACK }) {
+          auto accTile = reinterpret_cast<vec_t*>(
+              &accumulator.accumulation[perspective][i][j * kTileHeight]);
+          vec_t acc[kNumRegs];
 
-  #elif defined(USE_SSE2)
-        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-        auto accumulation = reinterpret_cast<__m128i*>(
-            &accumulator.accumulation[perspective][i][0]);
+          if (reset[perspective]) {
+            auto biasesTile = reinterpret_cast<const vec_t*>(
+                &biases_[j * kTileHeight]);
+            for (unsigned k = 0; k < kNumRegs; ++k)
+              acc[k] = biasesTile[k];
+          } else {
+            auto prevAccTile = reinterpret_cast<const vec_t*>(
+                &prev_accumulator.accumulation[perspective][i][j * kTileHeight]);
+            for (IndexType k = 0; k < kNumRegs; ++k)
+              acc[k] = vec_load(&prevAccTile[k]);
 
-  #elif defined(USE_MMX)
-        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-        auto accumulation = reinterpret_cast<__m64*>(
-            &accumulator.accumulation[perspective][i][0]);
+            // Difference calculation for the deactivated features
+            for (const auto index : removed_indices[perspective]) {
+              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
 
-  #elif defined(USE_NEON)
-        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-        auto accumulation = reinterpret_cast<int16x8_t*>(
-            &accumulator.accumulation[perspective][i][0]);
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = vec_sub_16(acc[k], column[k]);
+            }
+          }
+          { // Difference calculation for the activated features
+            for (const auto index : added_indices[perspective]) {
+              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = vec_add_16(acc[k], column[k]);
+            }
+          }
+
+          for (IndexType k = 0; k < kNumRegs; ++k)
+            vec_store(&accTile[k], acc[k]);
+        }
+      }
+  #if defined(USE_MMX)
+      _mm_empty();
   #endif
 
+  #else
+      for (Color perspective : { WHITE, BLACK }) {
+
         if (reset[perspective]) {
           std::memcpy(accumulator.accumulation[perspective][i], biases_,
                       kHalfDimensions * sizeof(BiasType));
@@ -291,67 +356,19 @@ namespace Eval::NNUE {
           for (const auto index : removed_indices[perspective]) {
             const IndexType offset = kHalfDimensions * index;
 
-  #if defined(USE_AVX2)
-            auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j)
-              accumulation[j] = _mm256_sub_epi16(accumulation[j], column[j]);
-
-  #elif defined(USE_SSE2)
-            auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j)
-              accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
-
-  #elif defined(USE_MMX)
-            auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j)
-              accumulation[j] = _mm_sub_pi16(accumulation[j], column[j]);
-
-  #elif defined(USE_NEON)
-            auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j)
-              accumulation[j] = vsubq_s16(accumulation[j], column[j]);
-
-  #else
             for (IndexType j = 0; j < kHalfDimensions; ++j)
               accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
-  #endif
-
           }
         }
         { // Difference calculation for the activated features
           for (const auto index : added_indices[perspective]) {
             const IndexType offset = kHalfDimensions * index;
 
-  #if defined(USE_AVX2)
-            auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j)
-              accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
-
-  #elif defined(USE_SSE2)
-            auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j)
-              accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
-
-  #elif defined(USE_MMX)
-            auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j)
-              accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
-
-  #elif defined(USE_NEON)
-            auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
-            for (IndexType j = 0; j < kNumChunks; ++j)
-              accumulation[j] = vaddq_s16(accumulation[j], column[j]);
-
-  #else
             for (IndexType j = 0; j < kHalfDimensions; ++j)
               accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-  #endif
-
           }
         }
       }
-  #if defined(USE_MMX)
-      _mm_empty();
   #endif
 
       accumulator.computed_accumulation = true;

From e8472b5fbe1eed1cbcdfe06eb8ae9206bac773e0 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Fri, 18 Sep 2020 20:22:01 +0200
Subject: [PATCH 140/398] Fix races in gensfen as detected with thread
 sanitizer.

RootInTB was an incorrectly shared global, probably leading to wrong scoreing

Minor:
 setting TB global state from input by all threads (all threads write same values)
 setting Limits global state by all threads (idem)
 thread counting for finalization

CI can be enabled once races are fixed in the learner, manually goes like:
```
make clean && make -j2 ARCH=x86-64-modern sanitize=thread    optimize=no debug=yes build
../tests/instrumented_learn.sh --sanitizer-thread
```

Needs some review.
---
 src/learn/multi_think.cpp | 33 ++++++++++++++-----
 src/learn/multi_think.h   |  5 +--
 src/search.cpp            | 69 ++++++++++++++-------------------------
 src/search.h              | 12 +++++++
 src/syzygy/tbprobe.h      |  2 --
 src/thread.cpp            |  2 ++
 src/thread.h              |  1 +
 7 files changed, 66 insertions(+), 58 deletions(-)

diff --git a/src/learn/multi_think.cpp b/src/learn/multi_think.cpp
index 043238fa..22e49e81 100644
--- a/src/learn/multi_think.cpp
+++ b/src/learn/multi_think.cpp
@@ -3,6 +3,7 @@
 #include "tt.h"
 #include "uci.h"
 #include "types.h"
+#include "search.h"
 
 #include <thread>
 
@@ -23,6 +24,27 @@ void MultiThink::go_think()
 	// Call the derived class's init().
 	init();
 
+        // init global vars
+        Tablebases::init();
+
+        // About Search::Limits
+        // Be careful because this member variable is global and affects other threads.
+        {
+          auto& limits = Search::Limits;
+
+          // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
+          limits.infinite = true;
+
+          // Since PV is an obstacle when displayed, erase it.
+          limits.silent = true;
+
+          // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
+          limits.nodes = 0;
+
+          // depth is also processed by the one passed as an argument of Learner::search().
+          limits.depth = 0;
+        }
+
 	// The loop upper limit is set with set_loop_max().
 	loop_count = 0;
 	done_count = 0;
@@ -32,12 +54,11 @@ void MultiThink::go_think()
 	auto thread_num = (size_t)Options["Threads"];
 
 	// Secure end flag of worker thread
-	thread_finished.resize(thread_num);
+        threads_finished=0;
 
 	// start worker thread
 	for (size_t i = 0; i < thread_num; ++i)
 	{
-		thread_finished[i] = 0;
 		threads.push_back(std::thread([i, this]
 		{
 			// exhaust all processor threads.
@@ -47,7 +68,7 @@ void MultiThink::go_think()
 			this->thread_worker(i);
 
 			// Set the end flag because the thread has ended
-			this->thread_finished[i] = 1;
+			this->threads_finished++;
 		}));
 	}
 
@@ -61,11 +82,7 @@ void MultiThink::go_think()
 	// function to determine if all threads have finished
 	auto threads_done = [&]()
 	{
-		// returns false if no one is finished
-		for (auto& f : thread_finished)
-			if (!f)
-				return false;
-		return true;
+		return threads_finished == thread_num;
 	};
 
 	// Call back if the callback function is set.
diff --git a/src/learn/multi_think.h b/src/learn/multi_think.h
index 4b5662aa..e6c436f8 100644
--- a/src/learn/multi_think.h
+++ b/src/learn/multi_think.h
@@ -96,10 +96,7 @@ private:
 	std::mutex loop_mutex;
 
 	// Thread end flag.
-	// vector<bool> may not be reflected properly when trying to rewrite from multiple threads...
-	typedef uint8_t Flag;
-	std::vector<Flag> thread_finished;
-
+        std::atomic<uint64_t> threads_finished;
 };
 
 // Mechanism to process task during idle time.
diff --git a/src/search.cpp b/src/search.cpp
index 7c6f8ace..9f5119a2 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -43,9 +43,24 @@ namespace Search {
 namespace Tablebases {
 
   int Cardinality;
-  bool RootInTB;
   bool UseRule50;
   Depth ProbeDepth;
+
+  void init() {
+
+      UseRule50 = bool(Options["Syzygy50MoveRule"]);
+      ProbeDepth = int(Options["SyzygyProbeDepth"]);
+      Cardinality = int(Options["SyzygyProbeLimit"]);
+
+      // Tables with fewer pieces than SyzygyProbeLimit are searched with
+      // ProbeDepth == DEPTH_ZERO
+      if (Cardinality > MaxCardinality)
+      {
+          Cardinality = MaxCardinality;
+          ProbeDepth = 0;
+      }
+  }
+
 }
 
 namespace TB = Tablebases;
@@ -1844,7 +1859,7 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) {
   size_t pvIdx = pos.this_thread()->pvIdx;
   size_t multiPV = std::min((size_t)Options["MultiPV"], rootMoves.size());
   uint64_t nodesSearched = Threads.nodes_searched();
-  uint64_t tbHits = Threads.tb_hits() + (TB::RootInTB ? rootMoves.size() : 0);
+  uint64_t tbHits = Threads.tb_hits() + (pos.this_thread()->rootInTB ? rootMoves.size() : 0);
 
   for (size_t i = 0; i < multiPV; ++i)
   {
@@ -1856,7 +1871,7 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) {
       Depth d = updated ? depth : depth - 1;
       Value v = updated ? rootMoves[i].score : rootMoves[i].previousScore;
 
-      bool tb = TB::RootInTB && abs(v) < VALUE_MATE_IN_MAX_PLY;
+      bool tb = pos.this_thread()->rootInTB && abs(v) < VALUE_MATE_IN_MAX_PLY;
       v = tb ? rootMoves[i].tbScore : v;
 
       if (ss.rdbuf()->in_avail()) // Not at first line
@@ -1923,10 +1938,8 @@ bool RootMove::extract_ponder_from_tt(Position& pos) {
 
 void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
 
-    RootInTB = false;
-    UseRule50 = bool(Options["Syzygy50MoveRule"]);
-    ProbeDepth = int(Options["SyzygyProbeDepth"]);
-    Cardinality = int(Options["SyzygyProbeLimit"]);
+    auto& rootInTB = pos.this_thread()->rootInTB;
+    rootInTB = false;
     bool dtz_available = true;
 
     // Tables with fewer pieces than SyzygyProbeLimit are searched with
@@ -1940,17 +1953,17 @@ void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
     if (Cardinality >= popcount(pos.pieces()) && !pos.can_castle(ANY_CASTLING))
     {
         // Rank moves using DTZ tables
-        RootInTB = root_probe(pos, rootMoves);
+        rootInTB = root_probe(pos, rootMoves);
 
-        if (!RootInTB)
+        if (!rootInTB)
         {
             // DTZ tables are missing; try to rank moves using WDL tables
             dtz_available = false;
-            RootInTB = root_probe_wdl(pos, rootMoves);
+            rootInTB = root_probe_wdl(pos, rootMoves);
         }
     }
 
-    if (RootInTB)
+    if (rootInTB)
     {
         // Sort moves according to TB rank
         std::sort(rootMoves.begin(), rootMoves.end(),
@@ -1966,6 +1979,7 @@ void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
         for (auto& m : rootMoves)
             m.tbRank = 0;
     }
+
 }
 
 // --- expose the functions such as fixed depth search used for learning to the outside
@@ -1987,39 +2001,6 @@ namespace Learner
 
     std::memset(ss - 7, 0, 10 * sizeof(Stack));
 
-    // About Search::Limits
-    // Be careful because this member variable is global and affects other threads.
-    {
-      auto& limits = Search::Limits;
-
-      // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
-      limits.infinite = true;
-
-      // Since PV is an obstacle when displayed, erase it.
-      limits.silent = true;
-
-      // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
-      limits.nodes = 0;
-
-      // depth is also processed by the one passed as an argument of Learner::search().
-      limits.depth = 0;
-
-      // Set a large value to prevent the draw value from being returned due to the number of moves near the draw.
-      //limits.max_game_ply = 1 << 16;
-
-      // If you do not include the ball entry rule, it will be a draw and it will be difficult to settle.
-      //limits.enteringKingRule = EnteringKingRule::EKR_27_POINT;
-    }
-
-    // Set DrawValue
-    {
-      // Because it is not prepared for each thread
-      // May be overwritten by another thread. There is no help for it.
-      // If that happens, I think it should be 0.
-      //drawValueTable[REPETITION_DRAW][BLACK] = VALUE_ZERO;
-      //drawValueTable[REPETITION_DRAW][WHITE] = VALUE_ZERO;
-    }
-
     // Regarding this_thread.
 
     {
diff --git a/src/search.h b/src/search.h
index 20dfe909..fd5814ef 100644
--- a/src/search.h
+++ b/src/search.h
@@ -24,6 +24,7 @@
 #include "misc.h"
 #include "movepick.h"
 #include "types.h"
+#include "uci.h"
 
 class Position;
 
@@ -110,6 +111,17 @@ void clear();
 
 } // namespace Search
 
+namespace Tablebases {
+
+extern int MaxCardinality;
+extern int Cardinality;
+extern bool UseRule50;
+extern Depth ProbeDepth;
+
+void init();
+
+}
+
 namespace Learner {
 
   // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
diff --git a/src/syzygy/tbprobe.h b/src/syzygy/tbprobe.h
index b998989b..6af5d278 100644
--- a/src/syzygy/tbprobe.h
+++ b/src/syzygy/tbprobe.h
@@ -43,8 +43,6 @@ enum ProbeState {
     ZEROING_BEST_MOVE =  2  // Best move zeroes DTZ (capture or pawn move)
 };
 
-extern int MaxCardinality;
-
 void init(const std::string& paths);
 WDLScore probe_wdl(Position& pos, ProbeState* result);
 int probe_dtz(Position& pos, ProbeState* result);
diff --git a/src/thread.cpp b/src/thread.cpp
index 1aa66a81..ef4cb398 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -192,6 +192,8 @@ void ThreadPool::start_thinking(Position& pos, StateListPtr& states,
           || std::count(limits.searchmoves.begin(), limits.searchmoves.end(), m))
           rootMoves.emplace_back(m);
 
+  Tablebases::init();
+
   if (!rootMoves.empty())
       Tablebases::rank_root_moves(pos, rootMoves);
 
diff --git a/src/thread.h b/src/thread.h
index 042bc2e9..e0c838c8 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -74,6 +74,7 @@ public:
   CapturePieceToHistory captureHistory;
   ContinuationHistory continuationHistory[2][2];
   Score contempt;
+  bool rootInTB;
 };
 
 
From 61bc8d12d39cb31303ec9162b1ca8a015d896192 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Fri, 18 Sep 2020 23:06:45 +0200
Subject: [PATCH 141/398] Fix some races in learning

declare a few variables atomic.

Other races remain...
---
 src/learn/learn.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 6d0a777d..6142ce6b 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -695,14 +695,14 @@ namespace Learner
         uint64_t last_done;
 
         // If total_read exceeds this value, update_weights() and calculate mse.
-        uint64_t next_update_weights;
+        std::atomic<uint64_t> next_update_weights;
 
         uint64_t save_count;
 
         // Do not shuffle when reading the phase.
         bool no_shuffle;
 
-        bool stop_flag;
+        std::atomic<bool> stop_flag;
 
         vector<Key> hash;
 
@@ -785,7 +785,7 @@ namespace Learner
         // Mini batch size size. Be sure to set it on the side that uses this class.
         uint64_t mini_batch_size = LEARN_MINI_BATCH_SIZE;
 
-        bool stop_flag;
+        std::atomic<bool> stop_flag;
 
         // Discount rate
         double discount_rate;

From da28ce3339bd19356ec59d50a897fde3d5e213c1 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sat, 19 Sep 2020 19:27:21 +0200
Subject: [PATCH 142/398] Add initialization also to learning patch

fixes https://github.com/nodchip/Stockfish/issues/160
---
 src/learn/learn.cpp | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 6142ce6b..c1900af3 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -1985,6 +1985,27 @@ namespace Learner
 
         Threads.main()->ponder = false;
 
+        // init global vars
+        Tablebases::init();
+
+        // About Search::Limits
+        // Be careful because this member variable is global and affects other threads.
+        {
+          auto& limits = Search::Limits;
+
+          // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
+          limits.infinite = true;
+
+          // Since PV is an obstacle when displayed, erase it.
+          limits.silent = true;
+
+          // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
+          limits.nodes = 0;
+
+          // depth is also processed by the one passed as an argument of Learner::search().
+          limits.depth = 0;
+        }
+
         cout << "init_training.." << endl;
         Eval::NNUE::InitializeTraining(eta1, eta1_epoch, eta2, eta2_epoch, eta3);
         Eval::NNUE::SetBatchSize(nn_batch_size);

From d4737819cd7aea0e7744df9973dd5c1db228000e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 20 Sep 2020 10:39:21 +0200
Subject: [PATCH 143/398] Fix castling rights feature encoding.

---
 src/nnue/features/castling_right.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/nnue/features/castling_right.cpp b/src/nnue/features/castling_right.cpp
index 86fe06fe..ee2c88cf 100644
--- a/src/nnue/features/castling_right.cpp
+++ b/src/nnue/features/castling_right.cpp
@@ -27,7 +27,7 @@ namespace Eval {
         }
 
         for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
-          if (relative_castling_rights & (i << 1)) {
+          if (relative_castling_rights & (1 << i)) {
             active->push_back(i);
           }
         }
@@ -55,8 +55,8 @@ namespace Eval {
         }
 
         for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
-          if ((relative_previous_castling_rights & (i << 1)) &&
-            (relative_current_castling_rights & (i << 1)) == 0) {
+          if ((relative_previous_castling_rights & (1 << i)) &&
+            (relative_current_castling_rights & (1 << i)) == 0) {
             removed->push_back(i);
           }
         }

From 2931463d3a8b2ea86ac223842dc775fb0ab68de6 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sun, 20 Sep 2020 19:43:38 +0200
Subject: [PATCH 144/398] Revert earlier TB changes.

they were not correct. Unfortunately, also restores the race on RootInTB
---
 src/learn/learn.cpp       |  3 ---
 src/learn/multi_think.cpp |  3 ---
 src/search.cpp            | 36 +++++++++++-------------------------
 src/search.h              | 12 ------------
 src/syzygy/tbprobe.h      |  2 ++
 src/thread.cpp            |  2 --
 src/thread.h              |  1 -
 7 files changed, 13 insertions(+), 46 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index c1900af3..ba904e9d 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -1985,9 +1985,6 @@ namespace Learner
 
         Threads.main()->ponder = false;
 
-        // init global vars
-        Tablebases::init();
-
         // About Search::Limits
         // Be careful because this member variable is global and affects other threads.
         {
diff --git a/src/learn/multi_think.cpp b/src/learn/multi_think.cpp
index 22e49e81..7c389d40 100644
--- a/src/learn/multi_think.cpp
+++ b/src/learn/multi_think.cpp
@@ -24,9 +24,6 @@ void MultiThink::go_think()
 	// Call the derived class's init().
 	init();
 
-        // init global vars
-        Tablebases::init();
-
         // About Search::Limits
         // Be careful because this member variable is global and affects other threads.
         {
diff --git a/src/search.cpp b/src/search.cpp
index 9f5119a2..e1616c5c 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -43,24 +43,9 @@ namespace Search {
 namespace Tablebases {
 
   int Cardinality;
+  bool RootInTB;
   bool UseRule50;
   Depth ProbeDepth;
-
-  void init() {
-
-      UseRule50 = bool(Options["Syzygy50MoveRule"]);
-      ProbeDepth = int(Options["SyzygyProbeDepth"]);
-      Cardinality = int(Options["SyzygyProbeLimit"]);
-
-      // Tables with fewer pieces than SyzygyProbeLimit are searched with
-      // ProbeDepth == DEPTH_ZERO
-      if (Cardinality > MaxCardinality)
-      {
-          Cardinality = MaxCardinality;
-          ProbeDepth = 0;
-      }
-  }
-
 }
 
 namespace TB = Tablebases;
@@ -1859,7 +1844,7 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) {
   size_t pvIdx = pos.this_thread()->pvIdx;
   size_t multiPV = std::min((size_t)Options["MultiPV"], rootMoves.size());
   uint64_t nodesSearched = Threads.nodes_searched();
-  uint64_t tbHits = Threads.tb_hits() + (pos.this_thread()->rootInTB ? rootMoves.size() : 0);
+  uint64_t tbHits = Threads.tb_hits() + (TB::RootInTB ? rootMoves.size() : 0);
 
   for (size_t i = 0; i < multiPV; ++i)
   {
@@ -1871,7 +1856,7 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) {
       Depth d = updated ? depth : depth - 1;
       Value v = updated ? rootMoves[i].score : rootMoves[i].previousScore;
 
-      bool tb = pos.this_thread()->rootInTB && abs(v) < VALUE_MATE_IN_MAX_PLY;
+      bool tb = TB::RootInTB && abs(v) < VALUE_MATE_IN_MAX_PLY;
       v = tb ? rootMoves[i].tbScore : v;
 
       if (ss.rdbuf()->in_avail()) // Not at first line
@@ -1938,8 +1923,10 @@ bool RootMove::extract_ponder_from_tt(Position& pos) {
 
 void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
 
-    auto& rootInTB = pos.this_thread()->rootInTB;
-    rootInTB = false;
+    RootInTB = false;
+    UseRule50 = bool(Options["Syzygy50MoveRule"]);
+    ProbeDepth = int(Options["SyzygyProbeDepth"]);
+    Cardinality = int(Options["SyzygyProbeLimit"]);
     bool dtz_available = true;
 
     // Tables with fewer pieces than SyzygyProbeLimit are searched with
@@ -1953,17 +1940,17 @@ void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
     if (Cardinality >= popcount(pos.pieces()) && !pos.can_castle(ANY_CASTLING))
     {
         // Rank moves using DTZ tables
-        rootInTB = root_probe(pos, rootMoves);
+        RootInTB = root_probe(pos, rootMoves);
 
-        if (!rootInTB)
+        if (!RootInTB)
         {
             // DTZ tables are missing; try to rank moves using WDL tables
             dtz_available = false;
-            rootInTB = root_probe_wdl(pos, rootMoves);
+            RootInTB = root_probe_wdl(pos, rootMoves);
         }
     }
 
-    if (rootInTB)
+    if (RootInTB)
     {
         // Sort moves according to TB rank
         std::sort(rootMoves.begin(), rootMoves.end(),
@@ -1979,7 +1966,6 @@ void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
         for (auto& m : rootMoves)
             m.tbRank = 0;
     }
-
 }
 
 // --- expose the functions such as fixed depth search used for learning to the outside
diff --git a/src/search.h b/src/search.h
index fd5814ef..20dfe909 100644
--- a/src/search.h
+++ b/src/search.h
@@ -24,7 +24,6 @@
 #include "misc.h"
 #include "movepick.h"
 #include "types.h"
-#include "uci.h"
 
 class Position;
 
@@ -111,17 +110,6 @@ void clear();
 
 } // namespace Search
 
-namespace Tablebases {
-
-extern int MaxCardinality;
-extern int Cardinality;
-extern bool UseRule50;
-extern Depth ProbeDepth;
-
-void init();
-
-}
-
 namespace Learner {
 
   // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
diff --git a/src/syzygy/tbprobe.h b/src/syzygy/tbprobe.h
index 6af5d278..b998989b 100644
--- a/src/syzygy/tbprobe.h
+++ b/src/syzygy/tbprobe.h
@@ -43,6 +43,8 @@ enum ProbeState {
     ZEROING_BEST_MOVE =  2  // Best move zeroes DTZ (capture or pawn move)
 };
 
+extern int MaxCardinality;
+
 void init(const std::string& paths);
 WDLScore probe_wdl(Position& pos, ProbeState* result);
 int probe_dtz(Position& pos, ProbeState* result);
diff --git a/src/thread.cpp b/src/thread.cpp
index ef4cb398..1aa66a81 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -192,8 +192,6 @@ void ThreadPool::start_thinking(Position& pos, StateListPtr& states,
           || std::count(limits.searchmoves.begin(), limits.searchmoves.end(), m))
           rootMoves.emplace_back(m);
 
-  Tablebases::init();
-
   if (!rootMoves.empty())
       Tablebases::rank_root_moves(pos, rootMoves);
 
diff --git a/src/thread.h b/src/thread.h
index e0c838c8..042bc2e9 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -74,7 +74,6 @@ public:
   CapturePieceToHistory captureHistory;
   ContinuationHistory continuationHistory[2][2];
   Score contempt;
-  bool rootInTB;
 };
 
 
From 8559c439148d0f183a5d67375c12abe92d63975e Mon Sep 17 00:00:00 2001
From: Unai Corzo <corzounai@gmail.com>
Date: Sun, 20 Sep 2020 09:03:37 +0200
Subject: [PATCH 145/398] Simplify reduced depth search

Simplification in reduced depth search.

STC https://tests.stockfishchess.org/tests/view/5f64c72fbb0cae038ca8f531
LLR: 2.94 (-2.94,2.94) {-1.25,0.25}
Total: 28320 W: 3475 L: 3359 D: 21486
Ptnml(0-2): 170, 2485, 8773, 2523, 209

LTC https://tests.stockfishchess.org/tests/view/5f650cfabb0cae038ca8f585
LLR: 2.95 (-2.94,2.94) {-0.75,0.25}
Total: 58392 W: 3354 L: 3285 D: 51753
Ptnml(0-2): 74, 2826, 23336, 2877, 83

closes https://github.com/official-stockfish/Stockfish/pull/3139

bench: 4201295
---
 src/search.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/search.cpp b/src/search.cpp
index 9c5fb58b..22cb8577 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1151,7 +1151,7 @@ moves_loop: // When in check, search starts from here
       // Step 16. Reduced depth search (LMR, ~200 Elo). If the move fails high it will be
       // re-searched at full depth.
       if (    depth >= 3
-          &&  moveCount > 1 + 2 * rootNode + 2 * (PvNode && abs(bestValue) < 2)
+          &&  moveCount > 1 + 2 * rootNode
           && (  !captureOrPromotion
               || moveCountPruning
               || ss->staticEval + PieceValue[EG][pos.captured_piece()] <= alpha

From 16b4578cc1bb0cc0dead19e7d9248553c977f8ca Mon Sep 17 00:00:00 2001
From: Stefan Geschwentner <stgeschwentner@gmail.com>
Date: Sun, 20 Sep 2020 22:25:19 +0200
Subject: [PATCH 146/398] Tweak hybrid treshold.

Increase the first hybrid threshold with more material.
Rewrite the hybrid rules for clarity.

STC:
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 24416 W: 3039 L: 2848 D: 18529
Ptnml(0-2): 135, 2136, 7503, 2271, 163
https://tests.stockfishchess.org/tests/view/5f6451efbb0cae038ca8f4dc

LTC;
LLR: 2.95 (-2.94,2.94) {0.25,1.25}
Total: 65016 W: 3702 L: 3455 D: 57859
Ptnml(0-2): 66, 2991, 26157, 3218, 76
https://tests.stockfishchess.org/tests/view/5f64b143bb0cae038ca8f51f

closes https://github.com/official-stockfish/Stockfish/pull/3140

Bench: 3973739
---
 src/evaluate.cpp | 34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index faf71d27..a9159477 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -1015,20 +1015,28 @@ make_v:
 
 Value Eval::evaluate(const Position& pos) {
 
-  // Use classical eval if there is a large imbalance
-  // If there is a moderate imbalance, use classical eval with probability (1/8),
-  // as derived from the node counter.
-  bool useClassical = abs(eg_value(pos.psq_score())) * 16 > NNUEThreshold1 * (16 + pos.rule50_count());
-  bool classical = !Eval::useNNUE
-                ||  useClassical
-                || (abs(eg_value(pos.psq_score())) > PawnValueMg / 4 && !(pos.this_thread()->nodes & 0xB));
-  Value v = classical ? Evaluation<NO_TRACE>(pos).value()
-                      : NNUE::evaluate(pos) * 5 / 4 + Tempo;
+  Value v;
 
-  if (   useClassical 
-      && Eval::useNNUE 
-      && abs(v) * 16 < NNUEThreshold2 * (16 + pos.rule50_count()))
-      v = NNUE::evaluate(pos) * 5 / 4 + Tempo;
+  if (!Eval::useNNUE)
+      v = Evaluation<NO_TRACE>(pos).value();
+  else
+  {
+      // scale and shift NNUE for compatibility with search and classical evaluation
+      auto  adjusted_NNUE = [&](){ return NNUE::evaluate(pos) * 5 / 4 + Tempo; };
+
+      // if there is PSQ imbalance use classical eval, with small probability if it is small
+      Value psq = Value(abs(eg_value(pos.psq_score())));
+      int   r50 = 16 + pos.rule50_count();
+      bool  largePsq = psq * 16 > (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50;
+      bool  classical = largePsq || (psq > PawnValueMg / 4 && !(pos.this_thread()->nodes & 0xB));
+
+      v = classical ? Evaluation<NO_TRACE>(pos).value() : adjusted_NNUE();
+
+      // if the classical eval is small and imbalance large, use NNUE nevertheless.
+      if (   largePsq
+          && abs(v) * 16 < NNUEThreshold2 * r50)
+          v = adjusted_NNUE();
+  }
 
   // Damp down the evaluation linearly when shuffling
   v = v * (100 - pos.rule50_count()) / 100;

From 485d517c687a2d3cb0b88cc8c198483759eaf2c7 Mon Sep 17 00:00:00 2001
From: Sami Kiminki <skiminki@users.noreply.github.com>
Date: Sun, 30 Aug 2020 19:41:30 +0300
Subject: [PATCH 147/398] Add large page support for NNUE weights and simplify
 TT mem management

Use TT memory functions to allocate memory for the NNUE weights. This
should provide a small speed-up on systems where large pages are not
automatically used, including Windows and some Linux distributions.

Further, since we now have a wrapper for std::aligned_alloc(), we can
simplify the TT memory management a bit:

- We no longer need to store separate pointers to the hash table and
  its underlying memory allocation.
- We also get to merge the Linux-specific and default implementations
  of aligned_ttmem_alloc().

Finally, we'll enable the VirtualAlloc code path with large page
support also for Win32.

STC: https://tests.stockfishchess.org/tests/view/5f66595823a84a47b9036fba
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 14896 W: 1854 L: 1686 D: 11356
Ptnml(0-2): 65, 1224, 4742, 1312, 105

closes https://github.com/official-stockfish/Stockfish/pull/3081

No functional change.
---
 README.md                  |  2 +-
 src/misc.cpp               | 57 +++++++++++++++++---------------------
 src/misc.h                 |  4 +--
 src/nnue/evaluate_nnue.cpp | 18 ++++++++----
 src/nnue/evaluate_nnue.h   | 11 ++++++++
 src/tt.cpp                 |  7 +++--
 src/tt.h                   |  3 +-
 7 files changed, 57 insertions(+), 45 deletions(-)

diff --git a/README.md b/README.md
index 96a495ae..255ebce2 100644
--- a/README.md
+++ b/README.md
@@ -152,7 +152,7 @@ to find the best move. The classical evaluation computes this value as a functio
 of various chess concepts, handcrafted by experts, tested and tuned using fishtest.
 The NNUE evaluation computes this value with a neural network based on basic
 inputs (e.g. piece positions only). The network is optimized and trained
-on the evalutions of millions of positions at moderate search depth.
+on the evaluations of millions of positions at moderate search depth.
 
 The NNUE evaluation was first introduced in shogi, and ported to Stockfish afterward.
 It can be evaluated efficiently on CPUs, and exploits the fact that only parts
diff --git a/src/misc.cpp b/src/misc.cpp
index 3fbdea35..d9bc47e3 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -357,27 +357,11 @@ void std_aligned_free(void* ptr) {
 #endif
 }
 
-/// aligned_ttmem_alloc() will return suitably aligned memory, if possible using large pages.
-/// The returned pointer is the aligned one, while the mem argument is the one that needs
-/// to be passed to free. With c++17 some of this functionality could be simplified.
+/// aligned_large_pages_alloc() will return suitably aligned memory, if possible using large pages.
 
-#if defined(__linux__) && !defined(__ANDROID__)
+#if defined(_WIN32)
 
-void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
-
-  constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page sizes
-  size_t size = ((allocSize + alignment - 1) / alignment) * alignment; // multiple of alignment
-  if (posix_memalign(&mem, alignment, size))
-     mem = nullptr;
-#if defined(MADV_HUGEPAGE)
-  madvise(mem, allocSize, MADV_HUGEPAGE);
-#endif
-  return mem;
-}
-
-#elif defined(_WIN64)
-
-static void* aligned_ttmem_alloc_large_pages(size_t allocSize) {
+static void* aligned_large_pages_alloc_win(size_t allocSize) {
 
   HANDLE hProcessToken { };
   LUID luid { };
@@ -422,12 +406,13 @@ static void* aligned_ttmem_alloc_large_pages(size_t allocSize) {
   return mem;
 }
 
-void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
+void* aligned_large_pages_alloc(size_t allocSize) {
 
   static bool firstCall = true;
+  void* mem;
 
   // Try to allocate large pages
-  mem = aligned_ttmem_alloc_large_pages(allocSize);
+  mem = aligned_large_pages_alloc_win(allocSize);
 
   // Suppress info strings on the first call. The first call occurs before 'uci'
   // is received and in that case this output confuses some GUIs.
@@ -449,23 +434,31 @@ void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
 
 #else
 
-void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
+void* aligned_large_pages_alloc(size_t allocSize) {
 
-  constexpr size_t alignment = 64; // assumed cache line size
-  size_t size = allocSize + alignment - 1; // allocate some extra space
-  mem = malloc(size);
-  void* ret = reinterpret_cast<void*>((uintptr_t(mem) + alignment - 1) & ~uintptr_t(alignment - 1));
-  return ret;
+#if defined(__linux__)
+  constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page size
+#else
+  constexpr size_t alignment = 4096; // assumed small page size
+#endif
+
+  // round up to multiples of alignment
+  size_t size = ((allocSize + alignment - 1) / alignment) * alignment;
+  void *mem = std_aligned_alloc(alignment, size);
+#if defined(MADV_HUGEPAGE)
+  madvise(mem, size, MADV_HUGEPAGE);
+#endif
+  return mem;
 }
 
 #endif
 
 
-/// aligned_ttmem_free() will free the previously allocated ttmem
+/// aligned_large_pages_free() will free the previously allocated ttmem
 
-#if defined(_WIN64)
+#if defined(_WIN32)
 
-void aligned_ttmem_free(void* mem) {
+void aligned_large_pages_free(void* mem) {
 
   if (mem && !VirtualFree(mem, 0, MEM_RELEASE))
   {
@@ -478,8 +471,8 @@ void aligned_ttmem_free(void* mem) {
 
 #else
 
-void aligned_ttmem_free(void *mem) {
-  free(mem);
+void aligned_large_pages_free(void *mem) {
+  std_aligned_free(mem);
 }
 
 #endif
diff --git a/src/misc.h b/src/misc.h
index 68b9c884..bc48f303 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -33,8 +33,8 @@ void prefetch(void* addr);
 void start_logger(const std::string& fname);
 void* std_aligned_alloc(size_t alignment, size_t size);
 void std_aligned_free(void* ptr);
-void* aligned_ttmem_alloc(size_t size, void*& mem);
-void aligned_ttmem_free(void* mem); // nop if mem == nullptr
+void* aligned_large_pages_alloc(size_t size); // memory aligned by page size, min alignment: 4096 bytes
+void aligned_large_pages_free(void* mem); // nop if mem == nullptr
 
 void dbg_hit_on(bool b);
 void dbg_hit_on(bool c, bool b);
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index ed138881..72d18200 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -52,7 +52,7 @@ namespace Eval::NNUE {
   };
 
   // Input feature converter
-  AlignedPtr<FeatureTransformer> feature_transformer;
+  LargePagePtr<FeatureTransformer> feature_transformer;
 
   // Evaluation function
   AlignedPtr<Network> network;
@@ -70,14 +70,22 @@ namespace Eval::NNUE {
     std::memset(pointer.get(), 0, sizeof(T));
   }
 
+  template <typename T>
+  void Initialize(LargePagePtr<T>& pointer) {
+
+    static_assert(alignof(T) <= 4096, "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
+    pointer.reset(reinterpret_cast<T*>(aligned_large_pages_alloc(sizeof(T))));
+    std::memset(pointer.get(), 0, sizeof(T));
+  }
+
   // Read evaluation function parameters
   template <typename T>
-  bool ReadParameters(std::istream& stream, const AlignedPtr<T>& pointer) {
+  bool ReadParameters(std::istream& stream, T& reference) {
 
     std::uint32_t header;
     header = read_little_endian<std::uint32_t>(stream);
     if (!stream || header != T::GetHashValue()) return false;
-    return pointer->ReadParameters(stream);
+    return reference.ReadParameters(stream);
   }
 
   }  // namespace Detail
@@ -110,8 +118,8 @@ namespace Eval::NNUE {
     std::string architecture;
     if (!ReadHeader(stream, &hash_value, &architecture)) return false;
     if (hash_value != kHashValue) return false;
-    if (!Detail::ReadParameters(stream, feature_transformer)) return false;
-    if (!Detail::ReadParameters(stream, network)) return false;
+    if (!Detail::ReadParameters(stream, *feature_transformer)) return false;
+    if (!Detail::ReadParameters(stream, *network)) return false;
     return stream && stream.peek() == std::ios::traits_type::eof();
   }
 
diff --git a/src/nnue/evaluate_nnue.h b/src/nnue/evaluate_nnue.h
index 5f0d1855..459a93de 100644
--- a/src/nnue/evaluate_nnue.h
+++ b/src/nnue/evaluate_nnue.h
@@ -40,9 +40,20 @@ namespace Eval::NNUE {
     }
   };
 
+  template <typename T>
+  struct TtmemDeleter {
+    void operator()(T* ptr) const {
+      ptr->~T();
+      aligned_large_pages_free(ptr);
+    }
+  };
+
   template <typename T>
   using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;
 
+  template <typename T>
+  using LargePagePtr = std::unique_ptr<T, TtmemDeleter<T>>;
+
 }  // namespace Eval::NNUE
 
 #endif // #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
diff --git a/src/tt.cpp b/src/tt.cpp
index 60a3a5f1..dea7c712 100644
--- a/src/tt.cpp
+++ b/src/tt.cpp
@@ -62,11 +62,12 @@ void TranspositionTable::resize(size_t mbSize) {
 
   Threads.main()->wait_for_search_finished();
 
-  aligned_ttmem_free(mem);
+  aligned_large_pages_free(table);
 
   clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
-  table = static_cast<Cluster*>(aligned_ttmem_alloc(clusterCount * sizeof(Cluster), mem));
-  if (!mem)
+
+  table = static_cast<Cluster*>(aligned_large_pages_alloc(clusterCount * sizeof(Cluster)));
+  if (!table)
   {
       std::cerr << "Failed to allocate " << mbSize
                 << "MB for transposition table." << std::endl;
diff --git a/src/tt.h b/src/tt.h
index fdfd6769..6aa066c5 100644
--- a/src/tt.h
+++ b/src/tt.h
@@ -73,7 +73,7 @@ class TranspositionTable {
   static_assert(sizeof(Cluster) == 32, "Unexpected Cluster size");
 
 public:
- ~TranspositionTable() { aligned_ttmem_free(mem); }
+ ~TranspositionTable() { aligned_large_pages_free(table); }
   void new_search() { generation8 += 8; } // Lower 3 bits are used by PV flag and Bound
   TTEntry* probe(const Key key, bool& found) const;
   int hashfull() const;
@@ -89,7 +89,6 @@ private:
 
   size_t clusterCount;
   Cluster* table;
-  void* mem;
   uint8_t generation8; // Size must be not bigger than TTEntry::genBound8
 };
 

From 9a64e737cfef639f202787161498ba94466ad730 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ste=CC=81phane=20Nicolet?= <cassio@free.fr>
Date: Wed, 9 Sep 2020 10:49:31 +0200
Subject: [PATCH 148/398] Small cleanups 12

- Clean signature of functions in namespace NNUE
- Add comment for countermove based pruning
- Remove bestMoveCount variable
- Add const qualifier to kpp_board_index array
- Fix spaces in get_best_thread()
- Fix indention in capture LMR code in search.cpp
- Rename TtmemDeleter to LargePageDeleter

Closes https://github.com/official-stockfish/Stockfish/pull/3063

No functional change
---
 src/evaluate.cpp           |  8 ++++----
 src/evaluate.h             |  8 +++-----
 src/main.cpp               |  2 +-
 src/nnue/evaluate_nnue.cpp |  6 +++---
 src/nnue/evaluate_nnue.h   |  4 ++--
 src/nnue/nnue_common.h     |  2 +-
 src/search.cpp             | 20 +++++++++-----------
 src/search.h               |  1 -
 src/thread.cpp             | 20 ++++++++++----------
 src/uci.cpp                |  2 +-
 src/ucioption.cpp          |  4 ++--
 11 files changed, 36 insertions(+), 41 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index a9159477..d3937823 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -60,7 +60,7 @@ namespace Eval {
   bool useNNUE;
   string eval_file_loaded = "None";
 
-  /// init_NNUE() tries to load a nnue network at startup time, or when the engine
+  /// NNUE::init() tries to load a nnue network at startup time, or when the engine
   /// receives a UCI command "setoption name EvalFile value nn-[a-z0-9]{12}.nnue"
   /// The name of the nnue network is always retrieved from the EvalFile option.
   /// We search the given network in three locations: internally (the default
@@ -68,7 +68,7 @@ namespace Eval {
   /// in the engine directory. Distro packagers may define the DEFAULT_NNUE_DIRECTORY
   /// variable to have the engine search in a special directory in their distro.
 
-  void init_NNUE() {
+  void NNUE::init() {
 
     useNNUE = Options["Use NNUE"];
     if (!useNNUE)
@@ -111,8 +111,8 @@ namespace Eval {
         }
   }
 
-  /// verify_NNUE() verifies that the last net used was loaded successfully
-  void verify_NNUE() {
+  /// NNUE::verify() verifies that the last net used was loaded successfully
+  void NNUE::verify() {
 
     string eval_file = string(Options["EvalFile"]);
 
diff --git a/src/evaluate.h b/src/evaluate.h
index c723bd8f..56354cf5 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -32,8 +32,6 @@ namespace Eval {
 
   extern bool useNNUE;
   extern std::string eval_file_loaded;
-  void init_NNUE();
-  void verify_NNUE();
 
   // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work. Do not change the
@@ -43,9 +41,9 @@ namespace Eval {
   namespace NNUE {
 
     Value evaluate(const Position& pos);
-    Value compute_eval(const Position& pos);
-    void  update_eval(const Position& pos);
-    bool  load_eval(std::string streamName, std::istream& stream);
+    bool load_eval(std::string name, std::istream& stream);
+    void init();
+    void verify();
 
   } // namespace NNUE
 
diff --git a/src/main.cpp b/src/main.cpp
index f95db1c2..e6dff918 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -45,7 +45,7 @@ int main(int argc, char* argv[]) {
   Endgames::init();
   Threads.set(size_t(Options["Threads"]));
   Search::clear(); // After threads are up
-  Eval::init_NNUE();
+  Eval::NNUE::init();
 
   UCI::loop(argc, argv);
 
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index 72d18200..b5dcd992 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -30,7 +30,7 @@
 
 namespace Eval::NNUE {
 
-  uint32_t kpp_board_index[PIECE_NB][COLOR_NB] = {
+  const uint32_t kpp_board_index[PIECE_NB][COLOR_NB] = {
    // convention: W - us, B - them
    // viewed from other side, W and B are reversed
       { PS_NONE,     PS_NONE     },
@@ -136,10 +136,10 @@ namespace Eval::NNUE {
   }
 
   // Load eval, from a file stream or a memory stream
-  bool load_eval(std::string streamName, std::istream& stream) {
+  bool load_eval(std::string name, std::istream& stream) {
 
     Initialize();
-    fileName = streamName;
+    fileName = name;
     return ReadParameters(stream);
   }
 
diff --git a/src/nnue/evaluate_nnue.h b/src/nnue/evaluate_nnue.h
index 459a93de..6cacf37e 100644
--- a/src/nnue/evaluate_nnue.h
+++ b/src/nnue/evaluate_nnue.h
@@ -41,7 +41,7 @@ namespace Eval::NNUE {
   };
 
   template <typename T>
-  struct TtmemDeleter {
+  struct LargePageDeleter {
     void operator()(T* ptr) const {
       ptr->~T();
       aligned_large_pages_free(ptr);
@@ -52,7 +52,7 @@ namespace Eval::NNUE {
   using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;
 
   template <typename T>
-  using LargePagePtr = std::unique_ptr<T, TtmemDeleter<T>>;
+  using LargePagePtr = std::unique_ptr<T, LargePageDeleter<T>>;
 
 }  // namespace Eval::NNUE
 
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index 7bc905dc..8afea186 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -113,7 +113,7 @@ namespace Eval::NNUE {
     PS_END2     = 12 * SQUARE_NB + 1
   };
 
-  extern uint32_t kpp_board_index[PIECE_NB][COLOR_NB];
+  extern const uint32_t kpp_board_index[PIECE_NB][COLOR_NB];
 
   // Type of input feature after conversion
   using TransformedFeatureType = std::uint8_t;
diff --git a/src/search.cpp b/src/search.cpp
index 22cb8577..edc020fd 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -225,7 +225,7 @@ void MainThread::search() {
   Time.init(Limits, us, rootPos.game_ply());
   TT.new_search();
 
-  Eval::verify_NNUE();
+  Eval::NNUE::verify();
 
   if (rootMoves.empty())
   {
@@ -462,10 +462,7 @@ void Thread::search() {
                   ++failedHighCnt;
               }
               else
-              {
-                  ++rootMoves[pvIdx].bestMoveCount;
                   break;
-              }
 
               delta += delta / 4 + 5;
 
@@ -1218,14 +1215,14 @@ moves_loop: // When in check, search starts from here
           }
           else
           {
-            // Increase reduction for captures/promotions if late move and at low depth
-            if (depth < 8 && moveCount > 2)
-                r++;
+              // Increase reduction for captures/promotions if late move and at low depth
+              if (depth < 8 && moveCount > 2)
+                  r++;
 
-            // Unless giving check, this capture is likely bad
-            if (   !givesCheck
-                && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 213 * depth <= alpha)
-                r++;
+              // Unless giving check, this capture is likely bad
+              if (   !givesCheck
+                  && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 213 * depth <= alpha)
+                  r++;
           }
 
           Depth d = std::clamp(newDepth - r, 1, newDepth);
@@ -1570,6 +1567,7 @@ moves_loop: // When in check, search starts from here
                                                                 [pos.moved_piece(move)]
                                                                 [to_sq(move)];
 
+      // CounterMove based pruning
       if (  !captureOrPromotion
           && moveCount
           && (*contHist[0])[pos.moved_piece(move)][to_sq(move)] < CounterMovePruneThreshold
diff --git a/src/search.h b/src/search.h
index f60da4a5..72d43c31 100644
--- a/src/search.h
+++ b/src/search.h
@@ -71,7 +71,6 @@ struct RootMove {
   Value previousScore = -VALUE_INFINITE;
   int selDepth = 0;
   int tbRank = 0;
-  int bestMoveCount = 0;
   Value tbScore;
   std::vector<Move> pv;
 };
diff --git a/src/thread.cpp b/src/thread.cpp
index b46fce5e..2fbf745d 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -224,16 +224,16 @@ Thread* ThreadPool::get_best_thread() const {
         votes[th->rootMoves[0].pv[0]] +=
             (th->rootMoves[0].score - minScore + 14) * int(th->completedDepth);
 
-          if (abs(bestThread->rootMoves[0].score) >= VALUE_TB_WIN_IN_MAX_PLY)
-          {
-              // Make sure we pick the shortest mate / TB conversion or stave off mate the longest
-              if (th->rootMoves[0].score > bestThread->rootMoves[0].score)
-                  bestThread = th;
-          }
-          else if (   th->rootMoves[0].score >= VALUE_TB_WIN_IN_MAX_PLY
-                   || (   th->rootMoves[0].score > VALUE_TB_LOSS_IN_MAX_PLY
-                       && votes[th->rootMoves[0].pv[0]] > votes[bestThread->rootMoves[0].pv[0]]))
-              bestThread = th;
+        if (abs(bestThread->rootMoves[0].score) >= VALUE_TB_WIN_IN_MAX_PLY)
+        {
+            // Make sure we pick the shortest mate / TB conversion or stave off mate the longest
+            if (th->rootMoves[0].score > bestThread->rootMoves[0].score)
+                bestThread = th;
+        }
+        else if (   th->rootMoves[0].score >= VALUE_TB_WIN_IN_MAX_PLY
+                 || (   th->rootMoves[0].score > VALUE_TB_LOSS_IN_MAX_PLY
+                     && votes[th->rootMoves[0].pv[0]] > votes[bestThread->rootMoves[0].pv[0]]))
+            bestThread = th;
     }
 
     return bestThread;
diff --git a/src/uci.cpp b/src/uci.cpp
index 3f3cc458..b63e55ad 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -85,7 +85,7 @@ namespace {
     Position p;
     p.set(pos.fen(), Options["UCI_Chess960"], &states->back(), Threads.main());
 
-    Eval::verify_NNUE();
+    Eval::NNUE::verify();
 
     sync_cout << "\n" << Eval::trace(p) << sync_endl;
   }
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 5e747a7f..bb0b8311 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -41,8 +41,8 @@ void on_hash_size(const Option& o) { TT.resize(size_t(o)); }
 void on_logger(const Option& o) { start_logger(o); }
 void on_threads(const Option& o) { Threads.set(size_t(o)); }
 void on_tb_path(const Option& o) { Tablebases::init(o); }
-void on_use_NNUE(const Option& ) { Eval::init_NNUE(); }
-void on_eval_file(const Option& ) { Eval::init_NNUE(); }
+void on_use_NNUE(const Option& ) { Eval::NNUE::init(); }
+void on_eval_file(const Option& ) { Eval::NNUE::init(); }
 
 /// Our case insensitive less() function as required by UCI protocol
 bool CaseInsensitiveLess::operator() (const string& s1, const string& s2) const {

From 3d5b2c8a5104888ec4d1ec44c171e29809e836a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ste=CC=81phane=20Nicolet?= <cassio@free.fr>
Date: Tue, 22 Sep 2020 22:43:41 +0200
Subject: [PATCH 149/398] Increase reductions with the number of threads

Passed STC with 8 threads:
LLR: 2.92 (-2.94,2.94) {-0.25,1.25}
Total: 13520 W: 1135 L: 1012 D: 11373
Ptnml(0-2): 39, 815, 4929, 938, 39
https://tests.stockfishchess.org/tests/view/5f68e274ded68c240be73f41

Passed LTC with 8 threads:
LLR: 2.96 (-2.94,2.94) {0.25,1.25}
Total: 48384 W: 2183 L: 1994 D: 44207
Ptnml(0-2): 28, 1777, 20402, 1948, 37
https://tests.stockfishchess.org/tests/view/5f68f068ded68c240be747e9

closes https://github.com/official-stockfish/Stockfish/pull/3142

No functional change (for one thread)
---
 src/search.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/search.cpp b/src/search.cpp
index edc020fd..4650b157 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -192,7 +192,7 @@ namespace {
 void Search::init() {
 
   for (int i = 1; i < MAX_MOVES; ++i)
-      Reductions[i] = int((22.0 + std::log(Threads.size())) * std::log(i));
+      Reductions[i] = int((22.0 + 2 * std::log(Threads.size())) * std::log(i));
 }
 
 
From 9f3de8b40eda71b04e6b88f5deaf45a7d1efb402 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 24 Sep 2020 21:10:10 +0200
Subject: [PATCH 150/398] Revert some unwanted changes from merge conflict
 resolution.

---
 src/evaluate.h        |  2 +-
 src/learn/gensfen.cpp | 21 ++-------------------
 src/misc.cpp          | 12 ++++++++++++
 3 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/src/evaluate.h b/src/evaluate.h
index 74a490f6..e6ac7e1c 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -42,7 +42,7 @@ namespace Eval {
   // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work. Do not change the
   // name of the macro, as it is used in the Makefile.
-  #define EvalFileDefaultName   "nn.bin"
+  #define EvalFileDefaultName   "nn-03744f8d56d8.nnue"
 
   namespace NNUE {
 
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index ba0c3be8..24d05c96 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -47,7 +47,6 @@ namespace Learner
     static bool detect_draw_by_consecutive_low_score = false;
     static bool detect_draw_by_insufficient_mating_material = false;
 
-    static std::vector<std::string> bookStart;
     static SfenOutputType sfen_output_type = SfenOutputType::Bin;
 
     static bool ends_with(const std::string& lhs, const std::string& end)
@@ -817,7 +816,7 @@ namespace Learner
             auto th = Threads[thread_id];
 
             auto& pos = th->rootPos;
-            pos.set(bookStart[prng.rand(bookStart.size())], false, &si, th);
+            pos.set(StartFEN, false, &si, th);
 
             int resign_counter = 0;
             bool should_resign = prng.rand(10) > 1;
@@ -1127,28 +1126,12 @@ namespace Learner
             output_file_name = output_file_name + "_" + to_hex(r.rand<uint64_t>()) + to_hex(r.rand<uint64_t>());
         }
 
-        bookStart.clear();
-        {
-          std::string line;
-          std::ifstream myfile ("3moves_v2.epd");
-          if (myfile.is_open())
-          {
-            while (getline(myfile,line))
-            {
-                bookStart.push_back(line);
-            }
-            myfile.close();
-          } else {
-            bookStart.push_back(StartFEN);
-          }
-        }
         std::cout << "gensfen : " << endl
             << "  search_depth_min = " << search_depth_min << " to " << search_depth_max << endl
             << "  nodes = " << nodes << endl
             << "  loop_max = " << loop_max << endl
             << "  eval_limit = " << eval_limit << endl
-            << "  thread_num             = " << thread_num << endl
-            << "  bookStart              = " << bookStart.size() << endl
+            << "  thread_num (set by USI setoption) = " << thread_num << endl
             << "  random_move_minply     = " << random_move_minply << endl
             << "  random_move_maxply     = " << random_move_maxply << endl
             << "  random_move_count      = " << random_move_count << endl
diff --git a/src/misc.cpp b/src/misc.cpp
index a0e01820..d31538fa 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -408,11 +408,23 @@ static void* aligned_large_pages_alloc_win(size_t allocSize) {
 
 void* aligned_large_pages_alloc(size_t allocSize) {
 
+  static bool firstCall = true;
   void* mem;
 
   // Try to allocate large pages
   mem = aligned_large_pages_alloc_win(allocSize);
 
+  // Suppress info strings on the first call. The first call occurs before 'uci'
+  // is received and in that case this output confuses some GUIs.
+  if (!firstCall)
+  {
+      if (mem)
+          sync_cout << "info string Hash table allocation: Windows large pages used." << sync_endl;
+      else
+          sync_cout << "info string Hash table allocation: Windows large pages not used." << sync_endl;
+  }
+  firstCall = false;
+
   // Fall back to regular, page aligned, allocation if necessary
   if (!mem)
       mem = VirtualAlloc(NULL, allocSize, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);

From 9f87282c6d2e9c81c1ca8997778ae996c40fbe62 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 24 Sep 2020 21:59:25 +0200
Subject: [PATCH 151/398] Fix net not being downloaded on build. Make PGO build
 faster by reverting gensfen command change.

---
 src/Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index d069dee6..0b2f99ed 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -41,7 +41,7 @@ BINDIR = $(PREFIX)/bin
 ### Built-in benchmark for pgo-builds
 PGO_TRAINING_DATA_FILE = pgo_training_data.bin
 PGOBENCH = ./$(EXE) bench
-PGOGENSFEN = ./$(EXE) gensfen depth 6 loop 10000 output_file_name $(PGO_TRAINING_DATA_FILE)
+PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 output_file_name $(PGO_TRAINING_DATA_FILE)
 
 ### Source and object files
 SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
@@ -746,10 +746,10 @@ endif
         config-sanity icc-profile-use icc-profile-make gcc-profile-use gcc-profile-make \
         clang-profile-use clang-profile-make
 
-build: config-sanity
+build: config-sanity net
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) all
 
-profile-build: config-sanity objclean profileclean
+profile-build: net config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make)

From d4a5f917663fb1bdb2c085eb93d7791be9aef929 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 24 Sep 2020 22:57:55 +0200
Subject: [PATCH 152/398] Add info string when loading/failing to load an eval
 file.

---
 src/evaluate.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 6996e7ae..aa9bbd67 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -77,7 +77,14 @@ namespace Eval {
         {
             ifstream stream(directory + eval_file, ios::binary);
             if (load_eval(eval_file, stream))
+            {
+                sync_cout << "info string Loaded eval file " << directory + eval_file << sync_endl;
                 eval_file_loaded = eval_file;
+            }
+            else
+            {
+                sync_cout << "info string ERROR: failed to load eval file " << directory + eval_file << sync_endl;
+            }
         }
   }
 

From baf8b5beaf5dff1b335100801a8b88da4ede5813 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 24 Sep 2020 22:58:21 +0200
Subject: [PATCH 153/398] Change default net so that the architecture matches
 the architecture expected by the binary.

---
 src/evaluate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/evaluate.h b/src/evaluate.h
index e6ac7e1c..ac67494d 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -42,7 +42,7 @@ namespace Eval {
   // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work. Do not change the
   // name of the macro, as it is used in the Makefile.
-  #define EvalFileDefaultName   "nn-03744f8d56d8.nnue"
+  #define EvalFileDefaultName   "nn-28e08a9fe2ad.nnue"
 
   namespace NNUE {
 

From 9955f51215d51c35b63c5c88d5dcadcb314fe2b7 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 24 Sep 2020 23:23:37 +0200
Subject: [PATCH 154/398] Update bench signature. Bench: 4698761


From 0a3e070ffb8e47df46533c65bec638630e049300 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 25 Sep 2020 00:11:24 +0200
Subject: [PATCH 155/398] Adjust instrumented learn test for parameter changes.

---
 tests/instrumented_learn.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
index 7f76fd76..edbce5fe 100755
--- a/tests/instrumented_learn.sh
+++ b/tests/instrumented_learn.sh
@@ -127,7 +127,7 @@ cat << EOF > learn01.exp
  send "setoption name Use NNUE value true\n"
  send "setoption name Threads value $threads\n"
  send "isready\n"
- send "learn targetdir training_data loop 2 batchsize 100 use_draw_in_training 1 use_draw_in_validation 1 eta 1 lambda 1 eval_limit 32000 nn_batch_size 30 newbob_decay 0.5 eval_save_interval 30 loss_output_interval 10 mirror_percentage 50 validation_set_file_name validation_data/validation_data.bin\n"
+ send "learn targetdir training_data loop 2 batchsize 100 use_draw_in_training 1 use_draw_in_validation 1 lr 1 eval_limit 32000 nn_batch_size 30 newbob_decay 0.5 eval_save_interval 30 loss_output_interval 10 validation_set_file_name validation_data/validation_data.bin\n"
 
  expect "save_eval() finished."
 

From 654b94f0a7a8384d88b5f46cbbf250cceaa66417 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 25 Sep 2020 10:41:40 +0200
Subject: [PATCH 156/398] Remove old unused `use_raw_nnue_eval` option from
 gensfen tests

---
 tests/instrumented_learn.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
index edbce5fe..ce1fc429 100755
--- a/tests/instrumented_learn.sh
+++ b/tests/instrumented_learn.sh
@@ -78,11 +78,11 @@ cat << EOF > gensfen01.exp
  send "setoption name Threads value $threads\n"
  send "setoption name Use NNUE value false\n"
  send "isready\n"
- send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.bin use_raw_nnue_eval 0 sfen_format bin\n"
+ send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.bin sfen_format bin\n"
  expect "gensfen finished."
  send "learn training_data/training_data.bin convert_plain output_file_name training_data.txt\n"
  expect "all done"
- send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.binpack use_raw_nnue_eval 0 sfen_format binpack\n"
+ send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.binpack sfen_format binpack\n"
  expect "gensfen finished."
 
  send "quit\n"
@@ -104,9 +104,9 @@ cat << EOF > gensfen02.exp
  send "setoption name Threads value $threads\n"
  send "setoption name Use NNUE value true\n"
  send "isready\n"
- send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/valdidation_data.bin use_raw_nnue_eval 0 sfen_format bin\n"
+ send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/valdidation_data.bin sfen_format bin\n"
  expect "gensfen finished."
- send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/validation_data.binpack use_raw_nnue_eval 0 sfen_format binpack\n"
+ send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/validation_data.binpack sfen_format binpack\n"
  expect "gensfen finished."
 
  send "quit\n"

From 89eeb36835fe9987283cad1660bbacc6ff1e8fab Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 25 Sep 2020 13:42:27 +0200
Subject: [PATCH 157/398] Initialize Tablebases::MaxCardinality to 0 to prevent
 uninitialized variable read in rank_root_moves

---
 src/search.h           | 5 -----
 src/syzygy/tbprobe.cpp | 2 +-
 src/syzygy/tbprobe.h   | 2 ++
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/search.h b/src/search.h
index 9e453d9a..ab832ee2 100644
--- a/src/search.h
+++ b/src/search.h
@@ -112,11 +112,6 @@ void clear();
 
 } // namespace Search
 
-namespace Tablebases {
-
-extern int MaxCardinality;
-
-}
 namespace Learner {
 
   // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
diff --git a/src/syzygy/tbprobe.cpp b/src/syzygy/tbprobe.cpp
index 4d682f1a..f4b9447f 100644
--- a/src/syzygy/tbprobe.cpp
+++ b/src/syzygy/tbprobe.cpp
@@ -52,7 +52,7 @@
 
 using namespace Tablebases;
 
-int Tablebases::MaxCardinality;
+int Tablebases::MaxCardinality = 0;
 
 namespace {
 
diff --git a/src/syzygy/tbprobe.h b/src/syzygy/tbprobe.h
index 6af5d278..5f97c746 100644
--- a/src/syzygy/tbprobe.h
+++ b/src/syzygy/tbprobe.h
@@ -25,6 +25,8 @@
 
 namespace Tablebases {
 
+extern int MaxCardinality;
+
 enum WDLScore {
     WDLLoss        = -2, // Loss
     WDLBlessedLoss = -1, // Loss, but draw under 50-move rule

From b6e7733b4c047682f467414ba9f2959d67249705 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 25 Sep 2020 15:04:21 +0200
Subject: [PATCH 158/398] In gensfen call search before get_current_game_result
 so that rootMoves is initialized by Learner::init_for_search. Don't call
 Tablebases::rank_root_moves in get_current_game_result because it's called in
 Learner::init_for_search. This fixes accessing uninitialized variables
 related to tablebases.

---
 src/learn/gensfen.cpp | 182 ++++++++++++++++++++----------------------
 1 file changed, 86 insertions(+), 96 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 24d05c96..67d898ba 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -465,18 +465,7 @@ namespace Learner
             return 0;
         }
 
-        // Initialize the Syzygy Ending Tablebase and sort the moves.
-        Search::RootMoves rootMoves;
-        for (const auto& m : MoveList<LEGAL>(pos))
-        {
-            rootMoves.emplace_back(m);
-        }
-
-        if (!rootMoves.empty())
-        {
-            Tablebases::rank_root_moves(pos, rootMoves);
-        }
-        else
+        if(pos.this_thread()->rootMoves.empty())
         {
             // If there is no legal move
             return pos.checkers()
@@ -847,6 +836,11 @@ namespace Learner
                 // Current search depth
                 const int depth = search_depth_min + (int)prng.rand(search_depth_max - search_depth_min + 1);
 
+                // Starting search calls init_for_search
+                auto [search_value, search_pv] = search(pos, depth, 1, nodes);
+
+                // This has to be performed after search because it needs to know
+                // rootMoves which are filled in init_for_search.
                 const auto result = get_current_game_result(pos, move_hist_scores);
                 if (result.has_value())
                 {
@@ -854,102 +848,98 @@ namespace Learner
                     break;
                 }
 
+                // Always adjudivate by eval limit.
+                // Also because of this we don't have to check for TB/MATE scores
+                if (abs(search_value) >= eval_limit)
                 {
-                    auto [search_value, search_pv] = search(pos, depth, 1, nodes);
-
-                    // Always adjudivate by eval limit.
-                    // Also because of this we don't have to check for TB/MATE scores
-                    if (abs(search_value) >= eval_limit)
-                    {
-                        resign_counter++;
-                        if ((should_resign && resign_counter >= 4) || abs(search_value) >= 10000) {
-                            flush_psv((search_value >= eval_limit) ? 1 : -1);
-                            break;
-                        }
-                    } else {
-                        resign_counter = 0;
-                    }
-                    // Verification of a strange move
-                    if (search_pv.size() > 0
-                        && (search_pv[0] == MOVE_NONE || search_pv[0] == MOVE_NULL))
-                    {
-                        // (???)
-                        // MOVE_WIN is checking if it is the declaration victory stage before this
-                        // The declarative winning move should never come back here.
-                        // Also, when MOVE_RESIGN, search_value is a one-stop score, which should be the minimum value of eval_limit (-31998)...
-                        cout << "Error! : " << pos.fen() << next_move << search_value << endl;
+                    resign_counter++;
+                    if ((should_resign && resign_counter >= 4) || abs(search_value) >= 10000) {
+                        flush_psv((search_value >= eval_limit) ? 1 : -1);
                         break;
                     }
+                } else {
+                    resign_counter = 0;
+                }
+                // Verification of a strange move
+                if (search_pv.size() > 0
+                    && (search_pv[0] == MOVE_NONE || search_pv[0] == MOVE_NULL))
+                {
+                    // (???)
+                    // MOVE_WIN is checking if it is the declaration victory stage before this
+                    // The declarative winning move should never come back here.
+                    // Also, when MOVE_RESIGN, search_value is a one-stop score, which should be the minimum value of eval_limit (-31998)...
+                    cout << "Error! : " << pos.fen() << next_move << search_value << endl;
+                    break;
+                }
 
-                    // Save the move score for adjudication.
-                    move_hist_scores.push_back(search_value);
+                // Save the move score for adjudication.
+                move_hist_scores.push_back(search_value);
 
-                    // If depth 0, pv is not obtained, so search again at depth 2.
-                    if (search_depth_min <= 0)
+                // If depth 0, pv is not obtained, so search again at depth 2.
+                if (search_depth_min <= 0)
+                {
+                    auto [research_value, research_pv] = search(pos, 2);
+                    search_pv = research_pv;
+                }
+
+                // Discard stuff before write_minply is reached
+                // because it can harm training due to overfitting.
+                // Initial positions would be too common.
+                if (ply < write_minply - 1)
+                {
+                    a_psv.clear();
+                    goto SKIP_SAVE;
+                }
+
+                // Look into the position hashtable to see if the same
+                // position was seen before.
+                // This is a good heuristic to exlude already seen
+                // positions without many false positives.
+                {
+                    auto key = pos.key();
+                    auto hash_index = (size_t)(key & (GENSFEN_HASH_SIZE - 1));
+                    auto old_key = hash[hash_index];
+                    if (key == old_key)
                     {
-                        auto [research_value, research_pv] = search(pos, 2);
-                        search_pv = research_pv;
-                    }
-
-                    // Discard stuff before write_minply is reached
-                    // because it can harm training due to overfitting.
-                    // Initial positions would be too common.
-                    if (ply < write_minply - 1)
-                    {
-                        a_psv.clear();
                         goto SKIP_SAVE;
                     }
-
-                    // Look into the position hashtable to see if the same
-                    // position was seen before.
-                    // This is a good heuristic to exlude already seen
-                    // positions without many false positives.
+                    else
                     {
-                        auto key = pos.key();
-                        auto hash_index = (size_t)(key & (GENSFEN_HASH_SIZE - 1));
-                        auto old_key = hash[hash_index];
-                        if (key == old_key)
-                        {
-                            goto SKIP_SAVE;
-                        }
-                        else
-                        {
-                            // Replace with the current key.
-                            hash[hash_index] = key;
-                        }
+                        // Replace with the current key.
+                        hash[hash_index] = key;
                     }
-
-                    // Pack the current position into a packed sfen and save it into the buffer.
-                    {
-                        a_psv.emplace_back(PackedSfenValue());
-                        auto& psv = a_psv.back();
-
-                        // Here we only write the position data.
-                        // Result is added after the whole game is done.
-                        pos.sfen_pack(psv.sfen);
-
-                        psv.score = search_value;
-
-                        psv.gamePly = ply;
-
-                        // Take out the first PV move. This should be present unless depth 0.
-                        assert(search_pv.size() >= 1);
-                        psv.move = search_pv[0];
-                    }
-
-                SKIP_SAVE:;
-
-                    // For some reason, We could not get PV (hit the substitution table etc. and got stuck?)
-                    // so go to the next game. It's a rare case, so you can ignore it.
-                    if (search_pv.size() == 0)
-                    {
-                        break;
-                    }
-
-                    // Update the next move according to best search result.
-                    next_move = search_pv[0];
                 }
 
+                // Pack the current position into a packed sfen and save it into the buffer.
+                {
+                    a_psv.emplace_back(PackedSfenValue());
+                    auto& psv = a_psv.back();
+
+                    // Here we only write the position data.
+                    // Result is added after the whole game is done.
+                    pos.sfen_pack(psv.sfen);
+
+                    psv.score = search_value;
+
+                    psv.gamePly = ply;
+
+                    // Take out the first PV move. This should be present unless depth 0.
+                    assert(search_pv.size() >= 1);
+                    psv.move = search_pv[0];
+                }
+
+            SKIP_SAVE:;
+
+                // For some reason, We could not get PV (hit the substitution table etc. and got stuck?)
+                // so go to the next game. It's a rare case, so you can ignore it.
+                if (search_pv.size() == 0)
+                {
+                    break;
+                }
+
+                // Update the next move according to best search result.
+                next_move = search_pv[0];
+
                 // Random move.
                 auto random_move = choose_random_move(pos, random_move_flag, ply, actual_random_move_count);
                 if (random_move.has_value())

From c99541828fbf9bf529bdb1675bf67debe39ce48e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 25 Sep 2020 16:06:33 +0200
Subject: [PATCH 159/398] Remove the re-search on depth 0. It is correctly
 handled by search now.

---
 src/learn/gensfen.cpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 67d898ba..7e931726 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -875,13 +875,6 @@ namespace Learner
                 // Save the move score for adjudication.
                 move_hist_scores.push_back(search_value);
 
-                // If depth 0, pv is not obtained, so search again at depth 2.
-                if (search_depth_min <= 0)
-                {
-                    auto [research_value, research_pv] = search(pos, 2);
-                    search_pv = research_pv;
-                }
-
                 // Discard stuff before write_minply is reached
                 // because it can harm training due to overfitting.
                 // Initial positions would be too common.

From 5e6a5e48e636babe1c2ba1fc63422e84c0eee942 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ste=CC=81phane=20Nicolet?= <cassio@free.fr>
Date: Thu, 24 Sep 2020 11:38:35 +0200
Subject: [PATCH 160/398] Suppress info strings before 'uci'

On Windows, Stockfish wouldn't launch in some GUI because we output some
info strings (about the use of large pages) before sending the 'uci'
command. It seems more robust to suppress these info strings, and instead
to add a proper section section in the Readme about large pages use.

fixes https://github.com/official-stockfish/Stockfish/issues/3052
closes https://github.com/official-stockfish/Stockfish/pull/3147

No functional change
---
 README.md    | 16 ++++++++--------
 src/misc.cpp | 16 +---------------
 2 files changed, 9 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index 255ebce2..409d0a10 100644
--- a/README.md
+++ b/README.md
@@ -198,8 +198,8 @@ the 50-move rule.
 
 Stockfish supports large pages on Linux and Windows. Large pages make
 the hash access more efficient, improving the engine speed, especially
-on large hash sizes. Typical increases are 5..10% in terms of nps, but
-speed increases up to 30% have been measured. The support is
+on large hash sizes. Typical increases are 5..10% in terms of nodes per
+second, but speed increases up to 30% have been measured. The support is
 automatic. Stockfish attempts to use large pages when available and
 will fall back to regular memory allocation when this is not the case.
 
@@ -213,11 +213,11 @@ are already enabled and no configuration is needed.
 
 The use of large pages requires "Lock Pages in Memory" privilege. See
 [Enable the Lock Pages in Memory Option (Windows)](https://docs.microsoft.com/en-us/sql/database-engine/configure-windows/enable-the-lock-pages-in-memory-option-windows)
-on how to enable this privilege. Logout/login may be needed
-afterwards. Due to memory fragmentation, it may not always be
-possible to allocate large pages even when enabled. A reboot
-might alleviate this problem. To determine whether large pages
-are in use, see the engine log.
+on how to enable this privilege, then run [RAMMap](https://docs.microsoft.com/en-us/sysinternals/downloads/rammap)
+to double-check that large pages are used. We suggest that you reboot
+your computer after you have enabled large pages, because long Windows
+sessions suffer from memory fragmentation which may prevent Stockfish
+from getting large pages: a fresh session is better in this regard.
 
 ## Compiling Stockfish yourself from the sources
 
@@ -232,8 +232,8 @@ targets with corresponding descriptions.
 ```
     cd src
     make help
-    make build ARCH=x86-64-modern
     make net
+    make build ARCH=x86-64-modern
 ```
 
 When not using the Makefile to compile (for instance with Microsoft MSVC) you
diff --git a/src/misc.cpp b/src/misc.cpp
index d9bc47e3..a16a6e90 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -408,22 +408,8 @@ static void* aligned_large_pages_alloc_win(size_t allocSize) {
 
 void* aligned_large_pages_alloc(size_t allocSize) {
 
-  static bool firstCall = true;
-  void* mem;
-
   // Try to allocate large pages
-  mem = aligned_large_pages_alloc_win(allocSize);
-
-  // Suppress info strings on the first call. The first call occurs before 'uci'
-  // is received and in that case this output confuses some GUIs.
-  if (!firstCall)
-  {
-      if (mem)
-          sync_cout << "info string Hash table allocation: Windows large pages used." << sync_endl;
-      else
-          sync_cout << "info string Hash table allocation: Windows large pages not used." << sync_endl;
-  }
-  firstCall = false;
+  void* mem = aligned_large_pages_alloc_win(allocSize);
 
   // Fall back to regular, page aligned, allocation if necessary
   if (!mem)

From f66c381f11b8603e2449b200227c8cfd7382b3ba Mon Sep 17 00:00:00 2001
From: SFisGOD <jonathandumale@gmail.com>
Date: Wed, 23 Sep 2020 14:00:42 +0800
Subject: [PATCH 161/398] Switch to NNUE eval probabilistically for OCB

Introduce a small chance of switching to NNUE if PSQ imbalance is large but we have opposite colored bishops and the classical eval is struggling to win.

STC:
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 25304 W: 3179 L: 2983 D: 19142
Ptnml(0-2): 172, 2171, 7781, 2345, 183
https://tests.stockfishchess.org/tests/view/5f6b14dec7759d4ee307cfe3

LTC:
LLR: 2.94 (-2.94,2.94) {0.25,1.25}
Total: 84680 W: 4846 L: 4556 D: 75278
Ptnml(0-2): 89, 3933, 34011, 4213, 94
https://tests.stockfishchess.org/tests/view/5f6b3fb6c7759d4ee307cff9

closes https://github.com/official-stockfish/Stockfish/pull/3146

Bench: 3865413
---
 src/evaluate.cpp | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index d3937823..1503be2d 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -1021,10 +1021,10 @@ Value Eval::evaluate(const Position& pos) {
       v = Evaluation<NO_TRACE>(pos).value();
   else
   {
-      // scale and shift NNUE for compatibility with search and classical evaluation
+      // Scale and shift NNUE for compatibility with search and classical evaluation
       auto  adjusted_NNUE = [&](){ return NNUE::evaluate(pos) * 5 / 4 + Tempo; };
 
-      // if there is PSQ imbalance use classical eval, with small probability if it is small
+      // If there is PSQ imbalance use classical eval, with small probability if it is small
       Value psq = Value(abs(eg_value(pos.psq_score())));
       int   r50 = 16 + pos.rule50_count();
       bool  largePsq = psq * 16 > (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50;
@@ -1032,9 +1032,14 @@ Value Eval::evaluate(const Position& pos) {
 
       v = classical ? Evaluation<NO_TRACE>(pos).value() : adjusted_NNUE();
 
-      // if the classical eval is small and imbalance large, use NNUE nevertheless.
+      // If the classical eval is small and imbalance large, use NNUE nevertheless.
+      // For the case of opposite colored bishops, switch to NNUE eval with
+      // small probability if the classical eval is less than the threshold.
       if (   largePsq
-          && abs(v) * 16 < NNUEThreshold2 * r50)
+          && (abs(v) * 16 < NNUEThreshold2 * r50
+          || (   pos.opposite_bishops() 
+              && abs(v) * 16 < (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50
+              && !(pos.this_thread()->nodes & 0xB))))
           v = adjusted_NNUE();
   }
 

From 5e8a49f7f23489605435b0f359c3c70116bec5e3 Mon Sep 17 00:00:00 2001
From: noobpwnftw <guo.bojun@gmail.com>
Date: Sat, 26 Sep 2020 10:03:03 +0800
Subject: [PATCH 162/398] Restore lambda and gradient function post-merge and
 minor fixes.

bench: 3788313
---
 README.md                              |   5 +-
 src/evaluate.h                         |   2 +-
 src/learn/learn.cpp                    | 110 ++++++++++++++++++++++---
 src/learn/learn.h                      |   6 +-
 src/nnue/features/castling_right.cpp   |  28 ++++++-
 src/nnue/features/castling_right.h     |   2 +-
 src/nnue/features/enpassant.cpp        |  20 ++++-
 src/nnue/features/enpassant.h          |   4 +-
 src/nnue/features/feature_set.h        |   6 +-
 src/nnue/features/half_kp.cpp          |   5 +-
 src/nnue/features/half_kp.h            |   7 +-
 src/nnue/features/half_relative_kp.cpp |   4 +-
 src/nnue/nnue_architecture.h           |   2 +-
 src/position.cpp                       |   1 +
 src/syzygy/tbprobe.h                   |   4 +-
 15 files changed, 170 insertions(+), 36 deletions(-)

diff --git a/README.md b/README.md
index 1bad4b06..f84a544a 100644
--- a/README.md
+++ b/README.md
@@ -37,6 +37,7 @@ Additional options:
 To generate training data from the classic eval, use the gensfen command with the setting "Use NNUE" set to "false". The given example is generation in its simplest form. There are more commands. 
 ```
 uci
+setoption name PruneAtShallowDepth value false
 setoption name Use NNUE value false
 setoption name Threads value x
 setoption name Hash value y
@@ -56,11 +57,13 @@ The process is the same as the generation of training data, except for the fact
 Use the "learn" binary. Create an empty folder named "evalsave" in the same directory as the binaries.
 ```
 uci
+setoption name EnableTranspositionTable value false
+setoption name PruneAtShallowDepth value false
 setoption name SkipLoadingEval value true
 setoption name Use NNUE value pure
 setoption name Threads value x
 isready
-learn targetdir trainingdata loop 100 batchsize 1000000 use_draw_in_training 1 use_draw_in_validation 1 eta 1 lambda 1 eval_limit 32000 nn_batch_size 1000 newbob_decay 0.5 eval_save_interval 250000000 loss_output_interval 1000000 mirror_percentage 50 validation_set_file_name validationdata\val.bin
+learn targetdir trainingdata loop 100 batchsize 1000000 use_draw_in_training 1 use_draw_in_validation 1 lr 1 lambda 1 eval_limit 32000 nn_batch_size 1000 newbob_decay 0.5 eval_save_interval 250000000 loss_output_interval 1000000 validation_set_file_name validationdata\val.bin
 ```
 Nets get saved in the "evalsave" folder. 
 
diff --git a/src/evaluate.h b/src/evaluate.h
index ac67494d..0c99fb5b 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -42,7 +42,7 @@ namespace Eval {
   // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work. Do not change the
   // name of the macro, as it is used in the Makefile.
-  #define EvalFileDefaultName   "nn-28e08a9fe2ad.nnue"
+  #define EvalFileDefaultName   "nn-54f88d1580b4.nnue"
 
   namespace NNUE {
 
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index e2d9af1b..5320aaf8 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -157,6 +157,14 @@ namespace Learner
         return ((y2 - y1) / epsilon) / winning_probability_coefficient;
     }
 
+    // A constant used in elmo (WCSC27). Adjustment required.
+    // Since elmo does not internally divide the expression, the value is different.
+    // You can set this value with the learn command.
+    // 0.33 is equivalent to the constant (0.5) used in elmo (WCSC27)
+    double ELMO_LAMBDA = 0.33;
+    double ELMO_LAMBDA2 = 0.33;
+    double ELMO_LAMBDA_LIMIT = 32000;
+
     // Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
     double get_scaled_signal(double signal)
     {
@@ -182,6 +190,18 @@ namespace Learner
         return winning_percentage(scaled_teacher_signal, ply);
     }
 
+    double calculate_lambda(double teacher_signal)
+    {
+        // If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT
+        // then apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
+        const double lambda =
+            (std::abs(teacher_signal) >= ELMO_LAMBDA_LIMIT)
+            ? ELMO_LAMBDA2
+            : ELMO_LAMBDA;
+
+        return lambda;
+    }
+
     double calculate_t(int game_result)
     {
         // Use 1 as the correction term if the expected win rate is 1,
@@ -192,6 +212,32 @@ namespace Learner
         return t;
     }
 
+    double calc_grad(Value teacher_signal, Value shallow, const PackedSfenValue& psv)
+    {
+        // elmo (WCSC27) method
+        // Correct with the actual game wins and losses.
+        const double q = winning_percentage(shallow, psv.gamePly);
+        const double p = calculate_p(teacher_signal, psv.gamePly);
+        const double t = calculate_t(psv.game_result);
+        const double lambda = calculate_lambda(teacher_signal);
+
+        double grad;
+        if (use_wdl)
+        {
+            const double dce_p = calc_d_cross_entropy_of_winning_percentage(p, shallow, psv.gamePly);
+            const double dce_t = calc_d_cross_entropy_of_winning_percentage(t, shallow, psv.gamePly);
+            grad = lambda * dce_p + (1.0 - lambda) * dce_t;
+        }
+        else
+        {
+            // Use the actual win rate as a correction term.
+            // This is the idea of ​​elmo (WCSC27), modern O-parts.
+            grad = lambda * (q - p) + (1.0 - lambda) * (q - t);
+        }
+
+        return grad;
+    }
+
     // Calculate cross entropy during learning
     // The individual cross entropy of the win/loss term and win
     // rate term of the elmo expression is returned
@@ -202,16 +248,21 @@ namespace Learner
         const PackedSfenValue& psv,
         double& cross_entropy_eval,
         double& cross_entropy_win,
+        double& cross_entropy,
         double& entropy_eval,
-        double& entropy_win)
+        double& entropy_win,
+        double& entropy)
     {
         // Teacher winning probability.
         const double q = winning_percentage(shallow, psv.gamePly);
         const double p = calculate_p(teacher_signal, psv.gamePly);
         const double t = calculate_t(psv.game_result);
+        const double lambda = calculate_lambda(teacher_signal);
 
         constexpr double epsilon = 0.000001;
 
+        const double m = (1.0 - lambda) * t + lambda * p;
+
         cross_entropy_eval =
             (-p * std::log(q + epsilon) - (1.0 - p) * std::log(1.0 - q + epsilon));
         cross_entropy_win =
@@ -220,12 +271,17 @@ namespace Learner
             (-p * std::log(p + epsilon) - (1.0 - p) * std::log(1.0 - p + epsilon));
         entropy_win =
             (-t * std::log(t + epsilon) - (1.0 - t) * std::log(1.0 - t + epsilon));
+
+        cross_entropy =
+            (-m * std::log(q + epsilon) - (1.0 - m) * std::log(1.0 - q + epsilon));
+        entropy =
+            (-m * std::log(m + epsilon) - (1.0 - m) * std::log(1.0 - m + epsilon));
     }
 
     // Other objective functions may be considered in the future...
     double calc_grad(Value shallow, const PackedSfenValue& psv)
     {
-        return (double)(shallow - (Value)psv.score) / 2400.0;
+        return calc_grad((Value)psv.score, shallow, psv);
     }
 
     struct BasicSfenInputStream
@@ -798,12 +854,14 @@ namespace Learner
         cout << ", learning rate = " << global_learning_rate << ", ";
 
         // For calculation of verification data loss
-        atomic<double> test_sum_cross_entropy_eval, test_sum_cross_entropy_win;
-        atomic<double> test_sum_entropy_eval, test_sum_entropy_win;
+        atomic<double> test_sum_cross_entropy_eval, test_sum_cross_entropy_win, test_sum_cross_entropy;
+        atomic<double> test_sum_entropy_eval, test_sum_entropy_win, test_sum_entropy;
         test_sum_cross_entropy_eval = 0;
         test_sum_cross_entropy_win = 0;
+        test_sum_cross_entropy = 0;
         test_sum_entropy_eval = 0;
         test_sum_entropy_win = 0;
+        test_sum_entropy = 0;
 
         // norm for learning
         atomic<double> sum_norm;
@@ -843,8 +901,10 @@ namespace Learner
                     &ps,
                     &test_sum_cross_entropy_eval,
                     &test_sum_cross_entropy_win,
+                    &test_sum_cross_entropy,
                     &test_sum_entropy_eval,
                     &test_sum_entropy_win,
+                    &test_sum_entropy,
                     &sum_norm,
                     &task_count,
                     &move_accord_count
@@ -872,22 +932,26 @@ namespace Learner
                 // For the time being, regarding the win rate and loss terms only in the elmo method
                 // Calculate and display the cross entropy.
 
-                double test_cross_entropy_eval, test_cross_entropy_win;
-                double test_entropy_eval, test_entropy_win;
+                double test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy;
+                double test_entropy_eval, test_entropy_win, test_entropy;
                 calc_cross_entropy(
                     deep_value,
                     shallow_value,
                     ps,
                     test_cross_entropy_eval,
                     test_cross_entropy_win,
+                    test_cross_entropy,
                     test_entropy_eval,
-                    test_entropy_win);
+                    test_entropy_win,
+                    test_entropy);
 
                 // The total cross entropy need not be abs() by definition.
                 test_sum_cross_entropy_eval += test_cross_entropy_eval;
                 test_sum_cross_entropy_win += test_cross_entropy_win;
+                test_sum_cross_entropy += test_cross_entropy;
                 test_sum_entropy_eval += test_entropy_eval;
                 test_sum_entropy_win += test_entropy_win;
+                test_sum_entropy += test_entropy;
                 sum_norm += (double)abs(shallow_value);
 
                 // Determine if the teacher's move and the score of the shallow search match
@@ -912,7 +976,7 @@ namespace Learner
         while (task_count)
             sleep(1);
 
-        latest_loss_sum += test_sum_cross_entropy_eval - test_sum_entropy_eval;
+        latest_loss_sum += test_sum_cross_entropy - test_sum_entropy;
         latest_loss_count += sr.sfen_for_mse.size();
 
         // learn_cross_entropy may be called train cross
@@ -927,6 +991,8 @@ namespace Learner
                 << " , test_cross_entropy_win = " << test_sum_cross_entropy_win / sr.sfen_for_mse.size()
                 << " , test_entropy_eval = " << test_sum_entropy_eval / sr.sfen_for_mse.size()
                 << " , test_entropy_win = " << test_sum_entropy_win / sr.sfen_for_mse.size()
+                << " , test_cross_entropy = " << test_sum_cross_entropy / sr.sfen_for_mse.size()
+                << " , test_entropy = " << test_sum_entropy / sr.sfen_for_mse.size()
                 << " , norm = " << sum_norm
                 << " , move accuracy = " << (move_accord_count * 100.0 / sr.sfen_for_mse.size()) << "%"
                 << endl;
@@ -938,6 +1004,8 @@ namespace Learner
                     << " , learn_cross_entropy_win = " << learn_sum_cross_entropy_win / done
                     << " , learn_entropy_eval = " << learn_sum_entropy_eval / done
                     << " , learn_entropy_win = " << learn_sum_entropy_win / done
+                    << " , learn_cross_entropy = " << learn_sum_cross_entropy / done
+                    << " , learn_entropy = " << learn_sum_entropy / done
                     << endl;
             }
         }
@@ -949,8 +1017,10 @@ namespace Learner
         // Clear 0 for next time.
         learn_sum_cross_entropy_eval = 0.0;
         learn_sum_cross_entropy_win = 0.0;
+        learn_sum_cross_entropy = 0.0;
         learn_sum_entropy_eval = 0.0;
         learn_sum_entropy_win = 0.0;
+        learn_sum_entropy = 0.0;
     }
 
     void LearnerThink::thread_worker(size_t thread_id)
@@ -1142,21 +1212,25 @@ namespace Learner
                     : -Eval::evaluate(pos);
 
                 // Calculate loss for training data
-                double learn_cross_entropy_eval, learn_cross_entropy_win;
-                double learn_entropy_eval, learn_entropy_win;
+                double learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy;
+                double learn_entropy_eval, learn_entropy_win, learn_entropy;
                 calc_cross_entropy(
                     deep_value,
                     shallow_value,
                     ps,
                     learn_cross_entropy_eval,
                     learn_cross_entropy_win,
+                    learn_cross_entropy,
                     learn_entropy_eval,
-                    learn_entropy_win);
+                    learn_entropy_win,
+                    learn_entropy);
 
                 learn_sum_cross_entropy_eval += learn_cross_entropy_eval;
                 learn_sum_cross_entropy_win += learn_cross_entropy_win;
+                learn_sum_cross_entropy += learn_cross_entropy;
                 learn_sum_entropy_eval += learn_entropy_eval;
                 learn_sum_entropy_win += learn_entropy_win;
+                learn_sum_entropy += learn_entropy;
 
                 Eval::NNUE::AddExample(pos, rootColor, ps, 1.0);
 
@@ -1560,6 +1634,11 @@ namespace Learner
 
         global_learning_rate = 1.0;
 
+        // elmo lambda
+        ELMO_LAMBDA = 0.33;
+        ELMO_LAMBDA2 = 0.33;
+        ELMO_LAMBDA_LIMIT = 32000;
+
         // if (gamePly <rand(reduction_gameply)) continue;
         // An option to exclude the early stage from the learning target moderately like
         // If set to 1, rand(1)==0, so nothing is excluded.
@@ -1627,6 +1706,12 @@ namespace Learner
             // Using WDL with win rate model instead of sigmoid
             else if (option == "use_wdl") is >> use_wdl;
 
+
+            // LAMBDA
+            else if (option == "lambda")       is >> ELMO_LAMBDA;
+            else if (option == "lambda2")      is >> ELMO_LAMBDA2;
+            else if (option == "lambda_limit") is >> ELMO_LAMBDA_LIMIT;
+
             else if (option == "reduction_gameply") is >> reduction_gameply;
 
             // shuffle related
@@ -1814,6 +1899,9 @@ namespace Learner
         reduction_gameply = max(reduction_gameply, 1);
         cout << "reduction_gameply : " << reduction_gameply << endl;
 
+        cout << "LAMBDA            : " << ELMO_LAMBDA << endl;
+        cout << "LAMBDA2           : " << ELMO_LAMBDA2 << endl;
+        cout << "LAMBDA_LIMIT      : " << ELMO_LAMBDA_LIMIT << endl;
         cout << "eval_save_interval  : " << eval_save_interval << " sfens" << endl;
         cout << "loss_output_interval: " << loss_output_interval << " sfens" << endl;
 
diff --git a/src/learn/learn.h b/src/learn/learn.h
index c76d76c5..4b09f825 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -23,7 +23,11 @@ using LearnFloatType = float;
 // configure
 // ======================
 
-#define LOSS_FUNCTION "cross_entropy_eval"
+// ----------------------
+// Learning with the method of elmo (WCSC27)
+// ----------------------
+
+#define LOSS_FUNCTION "ELMO_METHOD(WCSC27)"
 
 // ----------------------
 // Definition of struct used in Learner
diff --git a/src/nnue/features/castling_right.cpp b/src/nnue/features/castling_right.cpp
index 2d7f563a..2b3f3209 100644
--- a/src/nnue/features/castling_right.cpp
+++ b/src/nnue/features/castling_right.cpp
@@ -31,10 +31,30 @@ namespace Eval::NNUE::Features {
 
   // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
   void CastlingRight::AppendChangedIndices(
-    const Position& /* pos */, Color /* perspective */,
-    IndexList* /* removed */, IndexList* /* added */) {
-    // Not implemented.
-    assert(false);
+      const Position& pos, Color perspective,
+      IndexList* removed, IndexList* /* added */) {
+    int previous_castling_rights = pos.state()->previous->castlingRights;
+    int current_castling_rights = pos.state()->castlingRights;
+    int relative_previous_castling_rights;
+    int relative_current_castling_rights;
+    if (perspective == WHITE) {
+      relative_previous_castling_rights = previous_castling_rights;
+      relative_current_castling_rights = current_castling_rights;
+    }
+    else {
+      // Invert the perspective.
+      relative_previous_castling_rights = ((previous_castling_rights & 3) << 2)
+        & ((previous_castling_rights >> 2) & 3);
+      relative_current_castling_rights = ((current_castling_rights & 3) << 2)
+        & ((current_castling_rights >> 2) & 3);
+    }
+
+    for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
+      if ((relative_previous_castling_rights & (1 << i)) &&
+        (relative_current_castling_rights & (1 << i)) == 0) {
+        removed->push_back(i);
+      }
+    }
   }
 
 }  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/castling_right.h b/src/nnue/features/castling_right.h
index 3a09e14b..2d8c5322 100644
--- a/src/nnue/features/castling_right.h
+++ b/src/nnue/features/castling_right.h
@@ -19,7 +19,7 @@ namespace Eval::NNUE::Features {
     // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
     static constexpr IndexType kMaxActiveDimensions = 4;
     // Timing of full calculation instead of difference calculation
-    static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kAnyPieceMoved;
+    static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
     // Get a list of indices with a value of 1 among the features
     static void AppendActiveIndices(const Position& pos, Color perspective,
diff --git a/src/nnue/features/enpassant.cpp b/src/nnue/features/enpassant.cpp
index d771a85c..e5ceed5c 100644
--- a/src/nnue/features/enpassant.cpp
+++ b/src/nnue/features/enpassant.cpp
@@ -21,10 +21,22 @@ namespace Eval::NNUE::Features {
 
   // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
   void EnPassant::AppendChangedIndices(
-    const Position& /* pos */, Color /* perspective */,
-    IndexList* /* removed */, IndexList* /* added */) {
-    // Not implemented.
-    assert(false);
+      const Position& pos, Color /* perspective */,
+      IndexList* removed, IndexList* added) {
+
+    auto previous_epSquare = pos.state()->previous->epSquare;
+    auto epSquare = pos.state()->epSquare;
+
+    if (previous_epSquare != SQ_NONE) {
+      if (epSquare != SQ_NONE && file_of(epSquare) == file_of(previous_epSquare))
+        return;
+      auto file = file_of(previous_epSquare);
+      removed->push_back(file);
+    }
+    if (epSquare != SQ_NONE) {
+      auto file = file_of(epSquare);
+      added->push_back(file);
+    }
   }
 
 }  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/enpassant.h b/src/nnue/features/enpassant.h
index efa5eae9..065e74a0 100644
--- a/src/nnue/features/enpassant.h
+++ b/src/nnue/features/enpassant.h
@@ -19,13 +19,13 @@ namespace Eval::NNUE::Features {
     // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
     static constexpr IndexType kMaxActiveDimensions = 1;
     // Timing of full calculation instead of difference calculation
-    static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kAnyPieceMoved;
+    static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
     // Get a list of indices with a value of 1 among the features
     static void AppendActiveIndices(const Position& pos, Color perspective,
       IndexList* active);
 
-    // Get a list of indices whose values ??have changed from the previous one in the feature quantity
+    // Get a list of indices whose values have changed from the previous one in the feature quantity
     static void AppendChangedIndices(const Position& pos, Color perspective,
       IndexList* removed, IndexList* added);
   };
diff --git a/src/nnue/features/feature_set.h b/src/nnue/features/feature_set.h
index 2ef92e8e..24cdeb66 100644
--- a/src/nnue/features/feature_set.h
+++ b/src/nnue/features/feature_set.h
@@ -100,7 +100,6 @@ namespace Eval::NNUE::Features {
         IndexListType removed[2], IndexListType added[2], bool reset[2]) {
 
       const auto& dp = pos.state()->dirtyPiece;
-      if (dp.dirty_num == 0) return;
 
       for (Color perspective : { WHITE, BLACK }) {
         reset[perspective] = false;
@@ -108,12 +107,15 @@ namespace Eval::NNUE::Features {
           case TriggerEvent::kNone:
             break;
           case TriggerEvent::kFriendKingMoved:
+            if (dp.dirty_num == 0) continue;
             reset[perspective] = dp.piece[0] == make_piece(perspective, KING);
             break;
           case TriggerEvent::kEnemyKingMoved:
-              reset[perspective] = dp.piece[0] == make_piece(~perspective, KING);
+            if (dp.dirty_num == 0) continue;
+            reset[perspective] = dp.piece[0] == make_piece(~perspective, KING);
             break;
           case TriggerEvent::kAnyKingMoved:
+            if (dp.dirty_num == 0) continue;
             reset[perspective] = type_of(dp.piece[0]) == KING;
             break;
           case TriggerEvent::kAnyPieceMoved:
diff --git a/src/nnue/features/half_kp.cpp b/src/nnue/features/half_kp.cpp
index ff20a00a..ae1d697f 100644
--- a/src/nnue/features/half_kp.cpp
+++ b/src/nnue/features/half_kp.cpp
@@ -41,7 +41,7 @@ namespace Eval::NNUE::Features {
   void HalfKP<AssociatedKing>::AppendActiveIndices(
       const Position& pos, Color perspective, IndexList* active) {
 
-    Square ksq = orient(perspective, pos.square<KING>(perspective));
+    Square ksq = orient(perspective, pos.square<KING>(AssociatedKing == Side::kFriend ? perspective : ~perspective));
     Bitboard bb = pos.pieces() & ~pos.pieces(KING);
     while (bb) {
       Square s = pop_lsb(&bb);
@@ -55,7 +55,7 @@ namespace Eval::NNUE::Features {
       const Position& pos, Color perspective,
       IndexList* removed, IndexList* added) {
 
-    Square ksq = orient(perspective, pos.square<KING>(perspective));
+    Square ksq = orient(perspective, pos.square<KING>(AssociatedKing == Side::kFriend ? perspective : ~perspective));
     const auto& dp = pos.state()->dirtyPiece;
     for (int i = 0; i < dp.dirty_num; ++i) {
       Piece pc = dp.piece[i];
@@ -68,5 +68,6 @@ namespace Eval::NNUE::Features {
   }
 
   template class HalfKP<Side::kFriend>;
+  template class HalfKP<Side::kEnemy>;
 
 }  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/half_kp.h b/src/nnue/features/half_kp.h
index ee6a8df3..23e8beb6 100644
--- a/src/nnue/features/half_kp.h
+++ b/src/nnue/features/half_kp.h
@@ -33,7 +33,8 @@ namespace Eval::NNUE::Features {
 
    public:
     // Feature name
-    static constexpr const char* kName = "HalfKP(Friend)";
+    static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
+        "HalfKP(Friend)" : "HalfKP(Enemy)";
     // Hash value embedded in the evaluation file
     static constexpr std::uint32_t kHashValue =
         0x5D69D5B9u ^ (AssociatedKing == Side::kFriend);
@@ -43,7 +44,9 @@ namespace Eval::NNUE::Features {
     // Maximum number of simultaneously active features
     static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
     // Trigger for full calculation instead of difference calculation
-    static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kFriendKingMoved;
+    static constexpr TriggerEvent kRefreshTrigger =
+        (AssociatedKing == Side::kFriend) ?
+        TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
 
     // Get a list of indices for active features
     static void AppendActiveIndices(const Position& pos, Color perspective,
diff --git a/src/nnue/features/half_relative_kp.cpp b/src/nnue/features/half_relative_kp.cpp
index 8a61bada..6b456a1f 100644
--- a/src/nnue/features/half_relative_kp.cpp
+++ b/src/nnue/features/half_relative_kp.cpp
@@ -39,7 +39,7 @@ inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
 template <Side AssociatedKing>
 void HalfRelativeKP<AssociatedKing>::AppendActiveIndices(
     const Position& pos, Color perspective, IndexList* active) {
-  Square ksq = orient(perspective, pos.square<KING>(perspective));
+  Square ksq = orient(perspective, pos.square<KING>(AssociatedKing == Side::kFriend ? perspective : ~perspective));
   Bitboard bb = pos.pieces() & ~pos.pieces(KING);
   while (bb) {
     Square s = pop_lsb(&bb);
@@ -52,7 +52,7 @@ template <Side AssociatedKing>
 void HalfRelativeKP<AssociatedKing>::AppendChangedIndices(
     const Position& pos, Color perspective,
     IndexList* removed, IndexList* added) {
-  Square ksq = orient(perspective, pos.square<KING>(perspective));
+  Square ksq = orient(perspective, pos.square<KING>(AssociatedKing == Side::kFriend ? perspective : ~perspective));
   const auto& dp = pos.state()->dirtyPiece;
   for (int i = 0; i < dp.dirty_num; ++i) {
     Piece pc = dp.piece[i];
diff --git a/src/nnue/nnue_architecture.h b/src/nnue/nnue_architecture.h
index c395d515..91cdc4bd 100644
--- a/src/nnue/nnue_architecture.h
+++ b/src/nnue/nnue_architecture.h
@@ -22,7 +22,7 @@
 #define NNUE_ARCHITECTURE_H_INCLUDED
 
 // Defines the network structure
-#include "architectures/halfkp-cr-ep_256x2-32-32.h"
+#include "architectures/halfkp_256x2-32-32.h"
 
 namespace Eval::NNUE {
 
diff --git a/src/position.cpp b/src/position.cpp
index 52c47f66..5be655be 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -1013,6 +1013,7 @@ void Position::do_null_move(StateInfo& newSt) {
   {
       st->key ^= Zobrist::enpassant[file_of(st->epSquare)];
       st->epSquare = SQ_NONE;
+      st->accumulator.computed_accumulation = false;
   }
 
   st->key ^= Zobrist::side;
diff --git a/src/syzygy/tbprobe.h b/src/syzygy/tbprobe.h
index 5f97c746..b998989b 100644
--- a/src/syzygy/tbprobe.h
+++ b/src/syzygy/tbprobe.h
@@ -25,8 +25,6 @@
 
 namespace Tablebases {
 
-extern int MaxCardinality;
-
 enum WDLScore {
     WDLLoss        = -2, // Loss
     WDLBlessedLoss = -1, // Loss, but draw under 50-move rule
@@ -45,6 +43,8 @@ enum ProbeState {
     ZEROING_BEST_MOVE =  2  // Best move zeroes DTZ (capture or pawn move)
 };
 
+extern int MaxCardinality;
+
 void init(const std::string& paths);
 WDLScore probe_wdl(Position& pos, ProbeState* result);
 int probe_dtz(Position& pos, ProbeState* result);

From 96a31807705eefe85b5be19322a7ff0ba5588f5f Mon Sep 17 00:00:00 2001
From: noobpwnftw <noobpwnftw@users.noreply.github.com>
Date: Sun, 27 Sep 2020 02:17:30 +0800
Subject: [PATCH 163/398] Update instrumented_learn.sh

Fix typo.
---
 tests/instrumented_learn.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
index ce1fc429..44c5d7fa 100755
--- a/tests/instrumented_learn.sh
+++ b/tests/instrumented_learn.sh
@@ -104,7 +104,7 @@ cat << EOF > gensfen02.exp
  send "setoption name Threads value $threads\n"
  send "setoption name Use NNUE value true\n"
  send "isready\n"
- send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/valdidation_data.bin sfen_format bin\n"
+ send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/validation_data.bin sfen_format bin\n"
  expect "gensfen finished."
  send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/validation_data.binpack sfen_format binpack\n"
  expect "gensfen finished."

From 9d4bf4fe0c7cb2b3e207bf50a28fed958c6ffa27 Mon Sep 17 00:00:00 2001
From: noobpwnftw <guo.bojun@gmail.com>
Date: Sun, 27 Sep 2020 02:28:28 +0800
Subject: [PATCH 164/398] Optimize accumulators for null move.

---
 src/position.cpp | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/position.cpp b/src/position.cpp
index 5be655be..4e47f772 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -999,21 +999,20 @@ void Position::do_null_move(StateInfo& newSt) {
   assert(!checkers());
   assert(&newSt != st);
 
-  if (Eval::useNNUE != Eval::UseNNUEMode::False)
-  {
-      std::memcpy(&newSt, st, sizeof(StateInfo));
-  }
-  else
-      std::memcpy(&newSt, st, offsetof(StateInfo, accumulator));
+  std::memcpy(&newSt, st, offsetof(StateInfo, accumulator));
 
   newSt.previous = st;
   st = &newSt;
 
+  // Used by NNUE
+  st->accumulator.computed_accumulation = false;
+  auto& dp = st->dirtyPiece;
+  dp.dirty_num = 0;
+
   if (st->epSquare != SQ_NONE)
   {
       st->key ^= Zobrist::enpassant[file_of(st->epSquare)];
       st->epSquare = SQ_NONE;
-      st->accumulator.computed_accumulation = false;
   }
 
   st->key ^= Zobrist::side;

From 1dbd2a1ad548b3ca676f7da949e1a998c64b836b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ste=CC=81phane=20Nicolet?= <cassio@free.fr>
Date: Sat, 26 Sep 2020 23:19:53 +0200
Subject: [PATCH 165/398] Tweak nnue scaling to keep more material

Current master uses a constant scale factor of 5/4 = 1.25 for the output
of the NNUE network, for compatibility with search and classical evaluation.
We modify this scale factor to make it dependent on the phase of the game,
going from about 1.5 in the opening to 1.0 for pure pawn endgames.

This helps Stockfish to avoid exchanges of pieces (heavy pieces in particular)
when she has the advantage, keeping more material on the board when attacking.

Passed STC:
LLR: 2.95 (-2.94,2.94) {-0.25,1.25}
Total: 14744 W: 1771 L: 1599 D: 11374
Ptnml(0-2): 87, 1184, 4664, 1344, 93
https://tests.stockfishchess.org/tests/view/5f6fb0a63b22d6afa506904f

Passed LTC:
LLR: 2.95 (-2.94,2.94) {0.25,1.25}
Total: 8912 W: 512 L: 393 D: 8007
Ptnml(0-2): 7, 344, 3637, 459, 9
https://tests.stockfishchess.org/tests/view/5f6fcf533b22d6afa5069066

closes https://github.com/official-stockfish/Stockfish/pull/3154

Bench: 3943952
---
 src/evaluate.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 1503be2d..710898bc 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -1022,7 +1022,10 @@ Value Eval::evaluate(const Position& pos) {
   else
   {
       // Scale and shift NNUE for compatibility with search and classical evaluation
-      auto  adjusted_NNUE = [&](){ return NNUE::evaluate(pos) * 5 / 4 + Tempo; };
+      auto  adjusted_NNUE = [&](){
+         int mat = pos.non_pawn_material();
+         return NNUE::evaluate(pos) * (1024 + mat / 32) / 1024 + Tempo;
+      };
 
       // If there is PSQ imbalance use classical eval, with small probability if it is small
       Value psq = Value(abs(eg_value(pos.psq_score())));
@@ -1037,7 +1040,7 @@ Value Eval::evaluate(const Position& pos) {
       // small probability if the classical eval is less than the threshold.
       if (   largePsq
           && (abs(v) * 16 < NNUEThreshold2 * r50
-          || (   pos.opposite_bishops() 
+          || (   pos.opposite_bishops()
               && abs(v) * 16 < (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50
               && !(pos.this_thread()->nodes & 0xB))))
           v = adjusted_NNUE();

From c065abdcafe0486cb5cfa7de12a4ac6a905a54c5 Mon Sep 17 00:00:00 2001
From: noobpwnftw <guo.bojun@gmail.com>
Date: Mon, 28 Sep 2020 02:29:21 +0800
Subject: [PATCH 166/398] Use incremental updates more often

Use incremental updates for accumulators for up to 2 plies.
Do not copy accumulator. About 2% speedup.

Passed STC:
LLR: 2.95 (-2.94,2.94) {-0.25,1.25}
Total: 21752 W: 2583 L: 2403 D: 16766
Ptnml(0-2): 128, 1761, 6923, 1931, 133
https://tests.stockfishchess.org/tests/view/5f7150cf3b22d6afa5069412

closes https://github.com/official-stockfish/Stockfish/pull/3157

No functional change
---
 src/nnue/features/feature_set.h     | 83 ++++++++++++++++++++++-------
 src/nnue/features/half_kp.cpp       |  3 +-
 src/nnue/features/half_kp.h         |  2 +-
 src/nnue/nnue_feature_transformer.h | 29 +++++++---
 4 files changed, 87 insertions(+), 30 deletions(-)

diff --git a/src/nnue/features/feature_set.h b/src/nnue/features/feature_set.h
index 558a6b22..26198114 100644
--- a/src/nnue/features/feature_set.h
+++ b/src/nnue/features/feature_set.h
@@ -61,26 +61,69 @@ namespace Eval::NNUE::Features {
         const PositionType& pos, TriggerEvent trigger,
         IndexListType removed[2], IndexListType added[2], bool reset[2]) {
 
-      const auto& dp = pos.state()->dirtyPiece;
-      if (dp.dirty_num == 0) return;
-
-      for (Color perspective : { WHITE, BLACK }) {
-        reset[perspective] = false;
-        switch (trigger) {
-          case TriggerEvent::kFriendKingMoved:
-            reset[perspective] = dp.piece[0] == make_piece(perspective, KING);
-            break;
-          default:
-            assert(false);
-            break;
+      auto collect_for_one = [&](const DirtyPiece& dp) {
+        for (Color perspective : { WHITE, BLACK }) {
+          switch (trigger) {
+            case TriggerEvent::kFriendKingMoved:
+              reset[perspective] = dp.piece[0] == make_piece(perspective, KING);
+              break;
+            default:
+              assert(false);
+              break;
+          }
+          if (reset[perspective]) {
+            Derived::CollectActiveIndices(
+                pos, trigger, perspective, &added[perspective]);
+          } else {
+            Derived::CollectChangedIndices(
+                pos, dp, trigger, perspective,
+                &removed[perspective], &added[perspective]);
+          }
         }
-        if (reset[perspective]) {
-          Derived::CollectActiveIndices(
-              pos, trigger, perspective, &added[perspective]);
+      };
+
+      auto collect_for_two = [&](const DirtyPiece& dp1, const DirtyPiece& dp2) {
+        for (Color perspective : { WHITE, BLACK }) {
+          switch (trigger) {
+            case TriggerEvent::kFriendKingMoved:
+              reset[perspective] = dp1.piece[0] == make_piece(perspective, KING)
+                                || dp2.piece[0] == make_piece(perspective, KING);
+              break;
+            default:
+              assert(false);
+              break;
+          }
+          if (reset[perspective]) {
+            Derived::CollectActiveIndices(
+                pos, trigger, perspective, &added[perspective]);
+          } else {
+            Derived::CollectChangedIndices(
+                pos, dp1, trigger, perspective,
+                &removed[perspective], &added[perspective]);
+            Derived::CollectChangedIndices(
+                pos, dp2, trigger, perspective,
+                &removed[perspective], &added[perspective]);
+          }
+        }
+      };
+
+      if (pos.state()->previous->accumulator.computed_accumulation) {
+        const auto& prev_dp = pos.state()->dirtyPiece;
+        if (prev_dp.dirty_num == 0) return;
+        collect_for_one(prev_dp);
+      } else {
+        const auto& prev_dp = pos.state()->previous->dirtyPiece;
+        if (prev_dp.dirty_num == 0) {
+          const auto& prev2_dp = pos.state()->dirtyPiece;
+          if (prev2_dp.dirty_num == 0) return;
+          collect_for_one(prev2_dp);
         } else {
-          Derived::CollectChangedIndices(
-              pos, trigger, perspective,
-              &removed[perspective], &added[perspective]);
+          const auto& prev2_dp = pos.state()->dirtyPiece;
+          if (prev2_dp.dirty_num == 0) {
+            collect_for_one(prev_dp);
+          } else {
+            collect_for_two(prev_dp, prev2_dp);
+          }
         }
       }
     }
@@ -115,11 +158,11 @@ namespace Eval::NNUE::Features {
 
     // Get a list of indices for recently changed features
     static void CollectChangedIndices(
-        const Position& pos, const TriggerEvent trigger, const Color perspective,
+        const Position& pos, const DirtyPiece& dp, const TriggerEvent trigger, const Color perspective,
         IndexList* const removed, IndexList* const added) {
 
       if (FeatureType::kRefreshTrigger == trigger) {
-        FeatureType::AppendChangedIndices(pos, perspective, removed, added);
+        FeatureType::AppendChangedIndices(pos, dp, perspective, removed, added);
       }
     }
 
diff --git a/src/nnue/features/half_kp.cpp b/src/nnue/features/half_kp.cpp
index 88e384a3..116157cc 100644
--- a/src/nnue/features/half_kp.cpp
+++ b/src/nnue/features/half_kp.cpp
@@ -52,11 +52,10 @@ namespace Eval::NNUE::Features {
   // Get a list of indices for recently changed features
   template <Side AssociatedKing>
   void HalfKP<AssociatedKing>::AppendChangedIndices(
-      const Position& pos, Color perspective,
+      const Position& pos, const DirtyPiece& dp, Color perspective,
       IndexList* removed, IndexList* added) {
 
     Square ksq = orient(perspective, pos.square<KING>(perspective));
-    const auto& dp = pos.state()->dirtyPiece;
     for (int i = 0; i < dp.dirty_num; ++i) {
       Piece pc = dp.piece[i];
       if (type_of(pc) == KING) continue;
diff --git a/src/nnue/features/half_kp.h b/src/nnue/features/half_kp.h
index ee6a8df3..52a83eec 100644
--- a/src/nnue/features/half_kp.h
+++ b/src/nnue/features/half_kp.h
@@ -50,7 +50,7 @@ namespace Eval::NNUE::Features {
                                     IndexList* active);
 
     // Get a list of indices for recently changed features
-    static void AppendChangedIndices(const Position& pos, Color perspective,
+    static void AppendChangedIndices(const Position& pos, const DirtyPiece& dp, Color perspective,
                                      IndexList* removed, IndexList* added);
 
    private:
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index e71ee60d..2f86d20a 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -127,9 +127,14 @@ namespace Eval::NNUE {
         return true;
 
       const auto prev = now->previous;
-      if (prev && prev->accumulator.computed_accumulation) {
-        UpdateAccumulator(pos);
-        return true;
+      if (prev) {
+        if (prev->accumulator.computed_accumulation) {
+          UpdateAccumulator(pos);
+          return true;
+        } else if (prev->previous && prev->previous->accumulator.computed_accumulation) {
+          UpdateAccumulator(pos);
+          return true;
+        }
       }
 
       return false;
@@ -289,11 +294,21 @@ namespace Eval::NNUE {
     // Calculate cumulative value using difference calculation
     void UpdateAccumulator(const Position& pos) const {
 
-      const auto prev_accumulator = pos.state()->previous->accumulator;
+      Accumulator* prev_accumulator;
+      assert(pos.state()->previous);
+      if (pos.state()->previous->accumulator.computed_accumulation) {
+        prev_accumulator = &pos.state()->previous->accumulator;
+      }
+      else {
+        assert(pos.state()->previous->previous);
+        assert(pos.state()->previous->previous->accumulator.computed_accumulation);
+        prev_accumulator = &pos.state()->previous->previous->accumulator;
+      }
+
       auto& accumulator = pos.state()->accumulator;
       IndexType i = 0;
       Features::IndexList removed_indices[2], added_indices[2];
-      bool reset[2];
+      bool reset[2] = { false, false };
       RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
                                         removed_indices, added_indices, reset);
 
@@ -311,7 +326,7 @@ namespace Eval::NNUE {
               acc[k] = biasesTile[k];
           } else {
             auto prevAccTile = reinterpret_cast<const vec_t*>(
-                &prev_accumulator.accumulation[perspective][i][j * kTileHeight]);
+                &prev_accumulator->accumulation[perspective][i][j * kTileHeight]);
             for (IndexType k = 0; k < kNumRegs; ++k)
               acc[k] = vec_load(&prevAccTile[k]);
 
@@ -350,7 +365,7 @@ namespace Eval::NNUE {
                       kHalfDimensions * sizeof(BiasType));
         } else {
           std::memcpy(accumulator.accumulation[perspective][i],
-                      prev_accumulator.accumulation[perspective][i],
+                      prev_accumulator->accumulation[perspective][i],
                       kHalfDimensions * sizeof(BiasType));
           // Difference calculation for the deactivated features
           for (const auto index : removed_indices[perspective]) {

From 36c2886302ff3f6b730fc5f69d738a5d61be8c46 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sat, 26 Sep 2020 17:47:52 +0200
Subject: [PATCH 167/398] Update default net to nn-04a843f8932e.nnue

an optimization of Sergio's nn-03744f8d56d8.nnue tuning the output layer (33 parameters) on game play.

WIP code to make layer parameters tunable is https://github.com/vondele/Stockfish/tree/optionOutput
Optimization itself is using https://github.com/vondele/nevergrad4sf
Writing of the modified net using WIP code based on the learner code https://github.com/vondele/Stockfish/tree/evalWrite

Most parameters in the output layer are changed only little (~5 for int8_t).

passed STC:
https://tests.stockfishchess.org/tests/view/5f716f6b3b22d6afa506941a
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 15488 W: 1859 L: 1689 D: 11940
Ptnml(0-2): 79, 1260, 4917, 1388, 100

passed LTC:
https://tests.stockfishchess.org/tests/view/5f71908e3b22d6afa506942e
LLR: 2.93 (-2.94,2.94) {0.25,1.25}
Total: 8728 W: 518 L: 400 D: 7810
Ptnml(0-2): 7, 338, 3556, 456, 7

closes https://github.com/official-stockfish/Stockfish/pull/3158

Bench: 3789924
---
 src/evaluate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/evaluate.h b/src/evaluate.h
index 56354cf5..503aa975 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -36,7 +36,7 @@ namespace Eval {
   // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work. Do not change the
   // name of the macro, as it is used in the Makefile.
-  #define EvalFileDefaultName   "nn-03744f8d56d8.nnue"
+  #define EvalFileDefaultName   "nn-04a843f8932e.nnue"
 
   namespace NNUE {
 

From a5e68d9b25539b86304a9fb26afc616dc8126a1c Mon Sep 17 00:00:00 2001
From: Vizvezdenec <Vizvezdenec@gmail.com>
Date: Mon, 28 Sep 2020 20:20:06 +0300
Subject: [PATCH 168/398] Adjust null move pruning constants

Idea is that division by fraction of 2 is slightly faster than by other numbers so parameters are adjusted in a way that division in null move pruning depth reduction features dividing by 256 instead of dividing by 213.
Other than this patch is almost non-functional - difference starts to exist by depth 133.

passed STC
https://tests.stockfishchess.org/tests/view/5f70dd943b22d6afa50693c5
LLR: 2.95 (-2.94,2.94) {-0.25,1.25}
Total: 57048 W: 6616 L: 6392 D: 44040
Ptnml(0-2): 304, 4583, 18531, 4797, 309

passed LTC
https://tests.stockfishchess.org/tests/view/5f7180db3b22d6afa506941f
LLR: 2.95 (-2.94,2.94) {0.25,1.25}
Total: 45960 W: 2419 L: 2229 D: 41312
Ptnml(0-2): 43, 1779, 19137, 1987, 34

closes https://github.com/official-stockfish/Stockfish/pull/3159

bench 3789924
---
 src/search.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/search.cpp b/src/search.cpp
index 4650b157..e5f286e4 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -829,7 +829,7 @@ namespace {
         assert(eval - beta >= 0);
 
         // Null move dynamic reduction based on depth and value
-        Depth R = (817 + 71 * depth) / 213 + std::min(int(eval - beta) / 192, 3);
+        Depth R = (982 + 85 * depth) / 256 + std::min(int(eval - beta) / 192, 3);
 
         ss->currentMove = MOVE_NULL;
         ss->continuationHistory = &thisThread->continuationHistory[0][0][NO_PIECE][0];

From ba46599aa2224a78106346fb0615b0be174374f5 Mon Sep 17 00:00:00 2001
From: FauziAkram <fauzi.dabat@hotmail.com>
Date: Mon, 28 Sep 2020 22:09:43 +0300
Subject: [PATCH 169/398] Tweaking Mobility and Safe Check

Passed STC:
https://tests.stockfishchess.org/tests/view/5f70d86d3b22d6afa50693b9
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 100368 W: 20323 L: 19914 D: 60131
Ptnml(0-2): 1927, 11641, 22605, 12118, 1893

Passed LTC:
https://tests.stockfishchess.org/tests/view/5f71bb553b22d6afa5069457
LLR: 2.94 (-2.94,2.94) {0.25,1.25}
Total: 77648 W: 10613 L: 10181 D: 56854
Ptnml(0-2): 634, 7280, 22594, 7652, 664

closes https://github.com/official-stockfish/Stockfish/pull/3160

Bench: 3861984
---
 src/evaluate.cpp | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 710898bc..fe92f7d7 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -199,7 +199,7 @@ namespace {
   // SafeCheck[PieceType][single/multiple] contains safe check bonus by piece type,
   // higher if multiple safe checks are possible for that piece type.
   constexpr int SafeCheck[][2] = {
-      {}, {}, {792, 1283}, {645, 967}, {1084, 1897}, {772, 1119}
+      {}, {}, {803, 1292}, {639, 974}, {1087, 1878}, {759, 1132}
   };
 
 #define S(mg, eg) make_score(mg, eg)
@@ -207,19 +207,19 @@ namespace {
   // MobilityBonus[PieceType-2][attacked] contains bonuses for middle and end game,
   // indexed by piece type and number of attacked squares in the mobility area.
   constexpr Score MobilityBonus[][32] = {
-    { S(-62,-81), S(-53,-56), S(-12,-31), S( -4,-16), S(  3,  5), S( 13, 11), // Knight
-      S( 22, 17), S( 28, 20), S( 33, 25) },
-    { S(-48,-59), S(-20,-23), S( 16, -3), S( 26, 13), S( 38, 24), S( 51, 42), // Bishop
-      S( 55, 54), S( 63, 57), S( 63, 65), S( 68, 73), S( 81, 78), S( 81, 86),
-      S( 91, 88), S( 98, 97) },
-    { S(-60,-78), S(-20,-17), S(  2, 23), S(  3, 39), S(  3, 70), S( 11, 99), // Rook
-      S( 22,103), S( 31,121), S( 40,134), S( 40,139), S( 41,158), S( 48,164),
-      S( 57,168), S( 57,169), S( 62,172) },
-    { S(-30,-48), S(-12,-30), S( -8, -7), S( -9, 19), S( 20, 40), S( 23, 55), // Queen
-      S( 23, 59), S( 35, 75), S( 38, 78), S( 53, 96), S( 64, 96), S( 65,100),
-      S( 65,121), S( 66,127), S( 67,131), S( 67,133), S( 72,136), S( 72,141),
-      S( 77,147), S( 79,150), S( 93,151), S(108,168), S(108,168), S(108,171),
-      S(110,182), S(114,182), S(114,192), S(116,219) }
+    { S(-62,-79), S(-53,-57), S(-12,-31), S( -3,-17), S(  3,  7), S( 12, 13), // Knight
+      S( 21, 16), S( 28, 21), S( 37, 26) },
+    { S(-47,-59), S(-20,-25), S( 14, -8), S( 29, 12), S( 39, 21), S( 53, 40), // Bishop
+      S( 53, 56), S( 60, 58), S( 62, 65), S( 69, 72), S( 78, 78), S( 83, 87),
+      S( 91, 88), S( 96, 98) },
+    { S(-61,-82), S(-20,-17), S(  2, 23) ,S(  3, 40), S(  4, 72), S( 11,100), // Rook
+      S( 22,104), S( 31,120), S( 39,134), S(40 ,138), S( 41,158), S( 47,163),
+      S( 59,168), S( 60,169), S( 64,173) },
+    { S(-29,-49), S(-16,-29), S( -8, -8), S( -8, 17), S( 18, 39), S( 25, 54), // Queen
+      S( 23, 59), S( 37, 73), S( 41, 76), S( 54, 95), S( 65, 95) ,S( 68,101),
+      S( 69,124), S( 70,128), S( 70,132), S( 70,133) ,S( 71,136), S( 72,140),
+      S( 74,147), S( 76,149), S( 90,153), S(104,169), S(105,171), S(106,171),
+      S(112,178), S(114,185), S(114,187), S(119,221) }
   };
 
   // KingProtector[knight/bishop] contains penalty for each distance unit to own king

From 5efbaaba77b338dae7121c41f6590f6abc96912c Mon Sep 17 00:00:00 2001
From: SFisGOD <jonathandumale@gmail.com>
Date: Tue, 29 Sep 2020 02:24:26 +0800
Subject: [PATCH 170/398] Update default net to nn-baeb9ef2d183.nnue

Further optimization of Sergio's nn-03744f8d56d8.nnue
This patch is the result of collaboration with Joost VandeVondele.

STC:
LLR: 2.96 (-2.94,2.94) {-0.25,1.25}
Total: 37000 W: 4145 L: 3947 D: 28908
Ptnml(0-2): 191, 3016, 11912, 3166, 215
https://tests.stockfishchess.org/tests/view/5f71e7983b22d6afa5069475

LTC:
LLR: 2.96 (-2.94,2.94) {0.25,1.25}
Total: 60224 W: 2992 L: 2769 D: 54463
Ptnml(0-2): 48, 2420, 24956, 2637, 51
https://tests.stockfishchess.org/tests/view/5f722bb83b22d6afa506998f

closes https://github.com/official-stockfish/Stockfish/pull/3161

Bench: 3720073
---
 src/evaluate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/evaluate.h b/src/evaluate.h
index 503aa975..4b57a050 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -36,7 +36,7 @@ namespace Eval {
   // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work. Do not change the
   // name of the macro, as it is used in the Makefile.
-  #define EvalFileDefaultName   "nn-04a843f8932e.nnue"
+  #define EvalFileDefaultName   "nn-baeb9ef2d183.nnue"
 
   namespace NNUE {
 

From 6f0aa186d8c9ead30a107634c438c6339b9cba09 Mon Sep 17 00:00:00 2001
From: Stefan Geschwentner <stgeschwentner@gmail.com>
Date: Mon, 28 Sep 2020 22:21:14 +0200
Subject: [PATCH 171/398] Tweak reduction formula.

Replace log(i) with log(i + 0.25 * log(i)). This increases especially for low values the reductions. But for bigger values there are nearly no changes.

STC:
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 49640 W: 5505 L: 5289 D: 38846
Ptnml(0-2): 270, 4074, 15924, 4274, 278
https://tests.stockfishchess.org/tests/view/5f71f04d3b22d6afa5069478

LTC:
LLR: 2.94 (-2.94,2.94) {0.25,1.25}
Total: 43856 W: 2209 L: 2021 D: 39626
Ptnml(0-2): 32, 1776, 18128, 1956, 36
https://tests.stockfishchess.org/tests/view/5f7232ee3b22d6afa50699a2

closes https://github.com/official-stockfish/Stockfish/pull/3163

Bench: 3555769
---
 src/search.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/search.cpp b/src/search.cpp
index e5f286e4..c7343ce8 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -192,7 +192,7 @@ namespace {
 void Search::init() {
 
   for (int i = 1; i < MAX_MOVES; ++i)
-      Reductions[i] = int((22.0 + 2 * std::log(Threads.size())) * std::log(i));
+      Reductions[i] = int((22.0 + 2 * std::log(Threads.size())) * std::log(i + 0.25 * std::log(i)));
 }
 
 
From 5af09cfda5b71f9470ef233298e0f4233651337d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ste=CC=81phane=20Nicolet?= <cassio@free.fr>
Date: Mon, 28 Sep 2020 22:32:55 +0200
Subject: [PATCH 172/398] Include pawns in NNUE scaling

We now include the total pawn count in the scaling factor for the output
of the NNUE evaluation network. This should have the effect of trying to
keep more pawns when SF has the advantage, but exchange them when she
is defending.

Thanks to Alexander Pagel (Lolligerhans) for the idea of using the
value of pawns to ease the comparison with the rest of the material
estimation.

Passed STC:
LLR: 2.93 (-2.94,2.94) {-0.25,1.25}
Total: 15072 W: 1700 L: 1539 D: 11833
Ptnml(0-2): 65, 1202, 4845, 1355, 69
https://tests.stockfishchess.org/tests/view/5f7235a63b22d6afa50699b3

Passed LTC:
LLR: 2.93 (-2.94,2.94) {0.25,1.25}
Total: 25880 W: 1270 L: 1124 D: 23486
Ptnml(0-2): 23, 980, 10788, 1126, 23
https://tests.stockfishchess.org/tests/view/5f723b483b22d6afa5069a99

closes https://github.com/official-stockfish/Stockfish/pull/3164

Bench: 3776081
---
 src/evaluate.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index fe92f7d7..25e3bdc1 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -1023,8 +1023,8 @@ Value Eval::evaluate(const Position& pos) {
   {
       // Scale and shift NNUE for compatibility with search and classical evaluation
       auto  adjusted_NNUE = [&](){
-         int mat = pos.non_pawn_material();
-         return NNUE::evaluate(pos) * (1024 + mat / 32) / 1024 + Tempo;
+         int mat = pos.non_pawn_material() + PieceValue[MG][PAWN] * pos.count<PAWN>();
+         return NNUE::evaluate(pos) * (720 + mat / 32) / 1024 + Tempo;
       };
 
       // If there is PSQ imbalance use classical eval, with small probability if it is small

From b44d539c945d16508bafba375bd4d98c19ac1624 Mon Sep 17 00:00:00 2001
From: noobpwnftw <guo.bojun@gmail.com>
Date: Sun, 27 Sep 2020 23:13:13 +0800
Subject: [PATCH 173/398] Fix a bug that LR is not correctly scaled when
 initial LR is not 1.0

---
 src/learn/learn.cpp | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 5320aaf8..80de6a57 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -741,7 +741,6 @@ namespace Learner
             learn_sum_entropy_win = 0.0;
             learn_sum_entropy = 0.0;
 
-            newbob_scale = 1.0;
             newbob_decay = 1.0;
             newbob_num_trials = 2;
             best_loss = std::numeric_limits<double>::infinity();
@@ -795,7 +794,6 @@ namespace Learner
         atomic<double> learn_sum_entropy;
 
         shared_timed_mutex nn_mutex;
-        double newbob_scale;
         double newbob_decay;
         int newbob_num_trials;
         double best_loss;
@@ -1309,12 +1307,11 @@ namespace Learner
                     if (--trials > 0 && !is_final)
                     {
                         cout
-                            << "reducing learning rate from " << newbob_scale
-                            << " to " << (newbob_scale * newbob_decay)
+                            << "reducing learning rate from " << global_learning_rate
+                            << " to " << (global_learning_rate * newbob_decay)
                             << " (" << trials << " more trials)" << endl;
 
-                        newbob_scale *= newbob_decay;
-                        global_learning_rate = newbob_scale;
+                        global_learning_rate *= newbob_decay;
                     }
                 }
 
@@ -1956,7 +1953,6 @@ namespace Learner
         learn_think.sr.no_shuffle = no_shuffle;
         learn_think.reduction_gameply = reduction_gameply;
 
-        learn_think.newbob_scale = 1.0;
         learn_think.newbob_decay = newbob_decay;
         learn_think.newbob_num_trials = newbob_num_trials;
 

From d865159bd6e30b7c3b284286e3c8a2ce2cc21f8d Mon Sep 17 00:00:00 2001
From: noobpwnftw <guo.bojun@gmail.com>
Date: Tue, 29 Sep 2020 17:30:08 +0800
Subject: [PATCH 174/398] Fix variable initialization in test commands

---
 src/nnue/nnue_test_command.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/nnue/nnue_test_command.cpp b/src/nnue/nnue_test_command.cpp
index 5f0776ef..f6f05c2e 100644
--- a/src/nnue/nnue_test_command.cpp
+++ b/src/nnue/nnue_test_command.cpp
@@ -56,7 +56,7 @@ void TestFeatures(Position& pos) {
   auto update_index_sets = [&](const Position& position, auto* index_sets) {
     for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
       Features::IndexList removed_indices[2], added_indices[2];
-      bool reset[2];
+      bool reset[2] = { false, false };
       RawFeatures::AppendChangedIndices(position, kRefreshTriggers[i],
                                         removed_indices, added_indices, reset);
       for (const auto perspective : Colors) {

From f848d67341afb078df3f8de8095e07204bc3e044 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Wed, 30 Sep 2020 20:18:15 +0200
Subject: [PATCH 175/398] Use fair scheduling of threads under valgrind

fixes some rare case where the master search thread makes no progress,
observed in CI.
---
 tests/instrumented.sh       | 2 +-
 tests/instrumented_learn.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/instrumented.sh b/tests/instrumented.sh
index 03ded74a..03e9c9de 100755
--- a/tests/instrumented.sh
+++ b/tests/instrumented.sh
@@ -20,7 +20,7 @@ case $1 in
   --valgrind-thread)
     echo "valgrind-thread testing started"
     prefix=''
-    exeprefix='valgrind --error-exitcode=42'
+    exeprefix='valgrind --fair-sched=try --error-exitcode=42'
     postfix='1>/dev/null'
     threads="2"
   ;;
diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
index 44c5d7fa..267a3bb6 100755
--- a/tests/instrumented_learn.sh
+++ b/tests/instrumented_learn.sh
@@ -20,7 +20,7 @@ case $1 in
   --valgrind-thread)
     echo "valgrind-thread testing started"
     prefix=''
-    exeprefix='valgrind --error-exitcode=42'
+    exeprefix='valgrind --fair-sched=try --error-exitcode=42'
     postfix='1>/dev/null'
     threads="2"
   ;;

From 6f7a2287079682e5710c10106dd60e3c76abcc3e Mon Sep 17 00:00:00 2001
From: noobpwnftw <noobpwnftw@users.noreply.github.com>
Date: Thu, 1 Oct 2020 14:58:53 +0800
Subject: [PATCH 176/398] Minor cleanups

Remove unused code and magic numbers
---
 src/learn/gensfen.cpp | 58 +------------------------------------------
 1 file changed, 1 insertion(+), 57 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 7e931726..6fc59be9 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -387,12 +387,6 @@ namespace Learner
             int ply,
             int& random_move_c);
 
-        Value evaluate_leaf(
-            Position& pos,
-            std::vector<StateInfo, AlignedAllocator<StateInfo>>& states,
-            int ply,
-            vector<Move>& pv);
-
         // Min and max depths for search during gensfen
         int search_depth_min;
         int search_depth_max;
@@ -732,56 +726,6 @@ namespace Learner
         return random_move_flag;
     }
 
-    Value MultiThinkGenSfen::evaluate_leaf(
-        Position& pos,
-        std::vector<StateInfo, AlignedAllocator<StateInfo>>& states,
-        int ply,
-        vector<Move>& pv)
-    {
-        auto rootColor = pos.side_to_move();
-
-        for (auto m : pv)
-        {
-            // There should be no illegal move. This is as a debugging precaution.
-            if (!pos.pseudo_legal(m) || !pos.legal(m))
-            {
-                cout << "Error! : " << pos.fen() << m << endl;
-            }
-
-            pos.do_move(m, states[ply++]);
-        }
-
-        // Reach leaf
-        Value v;
-        if (pos.checkers())
-        {
-            // Sometime a king is checked.  An example is a case that a checkmate is
-            // found in the search.  If Eval::evaluate() is called whne a king is
-            // checked, classic eval crashes by an assertion. To avoid crashes, return
-            // VALUE_NONE and let the caller assign a value to the position.
-            v = VALUE_NONE;
-        }
-        else
-        {
-            v = Eval::evaluate(pos);
-
-            // evaluate() returns the evaluation value on the turn side, so
-            // If it's a turn different from root_color, you must invert v and return it.
-            if (rootColor != pos.side_to_move())
-            {
-                v = -v;
-            }
-        }
-
-        // Rewind the pv moves.
-        for (auto it = pv.rbegin(); it != pv.rend(); ++it)
-        {
-            pos.undo_move(*it);
-        }
-
-        return v;
-    }
-
     // thread_id = 0..Threads.size()-1
     void MultiThinkGenSfen::thread_worker(size_t thread_id)
     {
@@ -853,7 +797,7 @@ namespace Learner
                 if (abs(search_value) >= eval_limit)
                 {
                     resign_counter++;
-                    if ((should_resign && resign_counter >= 4) || abs(search_value) >= 10000) {
+                    if ((should_resign && resign_counter >= 4) || abs(search_value) >= VALUE_KNOWN_WIN) {
                         flush_psv((search_value >= eval_limit) ? 1 : -1);
                         break;
                     }

From 91cb4a6770fee0f8e586c3df5fd31f0f22dc7018 Mon Sep 17 00:00:00 2001
From: noobpwnftw <guo.bojun@gmail.com>
Date: Sat, 3 Oct 2020 15:35:54 +0800
Subject: [PATCH 177/398] Skip eval dampening in Use NNUE = pure case

---
 src/evaluate.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 607ff7eb..b3894fe8 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -996,6 +996,11 @@ Value Eval::evaluate(const Position& pos) {
 
   if (Eval::useNNUE == UseNNUEMode::Pure) {
       v = NNUE::evaluate(pos);
+
+      // Guarantee evaluation does not hit the tablebase range
+      v = std::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1);
+
+      return v;
   }
   else if (Eval::useNNUE == UseNNUEMode::False)
       v = Evaluation<NO_TRACE>(pos).value();

From 9382f854b3a67c5a970ad3342a3c12454974eccd Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Wed, 30 Sep 2020 21:22:36 +0200
Subject: [PATCH 178/398] Schedule threads fairly under valgrind

fixes a rare case that can cause CI to fail when running multithreaded under valgrind.

closes https://github.com/official-stockfish/Stockfish/pull/3165

No functional change.
---
 tests/instrumented.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/instrumented.sh b/tests/instrumented.sh
index 03ded74a..03e9c9de 100755
--- a/tests/instrumented.sh
+++ b/tests/instrumented.sh
@@ -20,7 +20,7 @@ case $1 in
   --valgrind-thread)
     echo "valgrind-thread testing started"
     prefix=''
-    exeprefix='valgrind --error-exitcode=42'
+    exeprefix='valgrind --fair-sched=try --error-exitcode=42'
     postfix='1>/dev/null'
     threads="2"
   ;;

From 17fb3a8ce0ccd2532f667fe685c4189d0bfe3b5b Mon Sep 17 00:00:00 2001
From: Unai Corzo <corzounai@gmail.com>
Date: Fri, 2 Oct 2020 22:00:55 +0200
Subject: [PATCH 179/398] Simplify away futility pruning for captures

Remove futility pruning for captures.

STC https://tests.stockfishchess.org/tests/view/5f749bfed930428c36d34c56
LLR: 2.94 (-2.94,2.94) {-1.25,0.25}
Total: 38064 W: 4011 L: 3929 D: 30124
Ptnml(0-2): 192, 3004, 12567, 3068, 201

LTC https://tests.stockfishchess.org/tests/view/5f74d99bf18675b1ce2f7412
LLR: 2.94 (-2.94,2.94) {-0.75,0.25}
Total: 184984 W: 8567 L: 8610 D: 167807
Ptnml(0-2): 146, 7593, 77058, 7548, 147

closes https://github.com/official-stockfish/Stockfish/pull/3166

bench: 3890648
---
 src/search.cpp | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index c7343ce8..eaa79fb9 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1049,15 +1049,6 @@ moves_loop: // When in check, search starts from here
                   && captureHistory[movedPiece][to_sq(move)][type_of(pos.piece_on(to_sq(move)))] < 0)
                   continue;
 
-              // Futility pruning for captures
-              if (   !givesCheck
-                  && lmrDepth < 6
-                  && !(PvNode && abs(bestValue) < 2)
-                  && !ss->inCheck
-                  && ss->staticEval + 169 + 244 * lmrDepth
-                     + PieceValue[MG][type_of(pos.piece_on(to_sq(move)))] <= alpha)
-                  continue;
-
               // See based pruning
               if (!pos.see_ge(move, Value(-221) * depth)) // (~25 Elo)
                   continue;

From 767b4f4fbe5ab2e63aceabd9005f4e1eb7cbcb51 Mon Sep 17 00:00:00 2001
From: FauziAkram <fauzi.dabat@hotmail.com>
Date: Fri, 2 Oct 2020 15:32:19 +0300
Subject: [PATCH 180/398] Pawn Tuning

Tuning of pawns, for classical evaluation:

Passed STC:
https://tests.stockfishchess.org/tests/view/5f771f0e52560f5fc78559ec
LLR: 2.96 (-2.94,2.94) {-0.25,1.25}
Total: 252696 W: 50321 L: 49692 D: 152683
Ptnml(0-2): 4614, 29845, 57049, 29978, 4862

Passed LTC:
https://tests.stockfishchess.org/tests/view/5f77cfef090dcf9aaa16d38b
LLR: 2.94 (-2.94,2.94) {0.25,1.25}
Total: 48184 W: 6556 L: 6193 D: 35435
Ptnml(0-2): 335, 4516, 14100, 4733, 408

closes https://github.com/official-stockfish/Stockfish/pull/3169

bench: 4016121
---
 src/pawns.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/pawns.cpp b/src/pawns.cpp
index af0f6618..a5102db8 100644
--- a/src/pawns.cpp
+++ b/src/pawns.cpp
@@ -30,21 +30,21 @@ namespace {
   #define S(mg, eg) make_score(mg, eg)
 
   // Pawn penalties
-  constexpr Score Backward      = S( 8, 27);
-  constexpr Score Doubled       = S(11, 55);
-  constexpr Score Isolated      = S( 5, 17);
-  constexpr Score WeakLever     = S( 2, 54);
-  constexpr Score WeakUnopposed = S(15, 25);
+  constexpr Score Backward      = S( 8, 25);
+  constexpr Score Doubled       = S(10, 55);
+  constexpr Score Isolated      = S( 3, 15);
+  constexpr Score WeakLever     = S( 3, 55);
+  constexpr Score WeakUnopposed = S(13, 25);
 
   // Bonus for blocked pawns at 5th or 6th rank
-  constexpr Score BlockedPawn[2] = { S(-13, -4), S(-4, 3) };
+  constexpr Score BlockedPawn[2] = { S(-13, -4), S(-5, 2) };
 
   constexpr Score BlockedStorm[RANK_NB] = {
     S(0, 0), S(0, 0), S(76, 78), S(-10, 15), S(-7, 10), S(-4, 6), S(-1, 2)
   };
 
   // Connected pawn bonus
-  constexpr int Connected[RANK_NB] = { 0, 7, 8, 11, 24, 45, 85 };
+  constexpr int Connected[RANK_NB] = { 0, 5, 7, 11, 24, 48, 86 };
 
   // Strength of pawn shelter for our king by [distance from edge][rank].
   // RANK_1 = 0 is used for files where we have no pawn, or pawn is behind our king.
@@ -147,7 +147,7 @@ namespace {
         if (support | phalanx)
         {
             int v =  Connected[r] * (2 + bool(phalanx) - bool(opposed))
-                   + 21 * popcount(support);
+                   + 22 * popcount(support);
 
             score += make_score(v, v * (r - 2) / 4);
         }

From 31f9d66f120f499f20b859a1e143fca0560b88a6 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 5 Oct 2020 20:33:47 +0200
Subject: [PATCH 181/398] Initial documentation for learn, gensfen, convert,
 and binpack.

---
 src/docs/binpack.md | 42 +++++++++++++++++++++
 src/docs/convert.md | 15 ++++++++
 src/docs/gensfen.md | 57 ++++++++++++++++++++++++++++
 src/docs/learn.md   | 92 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 206 insertions(+)
 create mode 100644 src/docs/binpack.md
 create mode 100644 src/docs/convert.md
 create mode 100644 src/docs/gensfen.md
 create mode 100644 src/docs/learn.md

diff --git a/src/docs/binpack.md b/src/docs/binpack.md
new file mode 100644
index 00000000..1940a5dc
--- /dev/null
+++ b/src/docs/binpack.md
@@ -0,0 +1,42 @@
+# Binpack
+
+Binpack is a binary training data storage format designed to take advantage of position chains differing by a single move. Therefore it is very good at compactly storing data generated from real games (as opposed to random positions for example sourced from an opening book).
+
+It is currently implemented through a single header library in `extra/nnue_data_binpack_format.h`.
+
+Below follows a rough description of the format in a BNF-like notation.
+
+```
+[[nodiscard]] std::uint16_t signedToUnsigned(std::int16_t a) {
+    std::uint16_t r;
+    std::memcpy(&r, &a, sizeof(std::uint16_t));
+    if (r & 0x8000) r ^= 0x7FFF; // flip value bits if negative
+    r = (r << 1) | (r >> 15); // store sign bit at bit 0
+    return r;
+}
+
+file := <block>*
+block := BINP<chain>*
+chain := <stem><movetext>
+stem := <pos><move><score><ply_and_result><rule50> (32 bytes)
+pos := https://github.com/Sopel97/nnue_data_compress/blob/master/src/chess/Position.h#L1166 (24 bytes)
+move := https://github.com/Sopel97/nnue_data_compress/blob/master/src/chess/Chess.h#L1044 (2 bytes)
+score := signedToUnsigned(score) (2 bytes, big endian)
+ply_and_result := ply bitwise_or (signedToUnsigned(result) << 14) (2 bytes, big endian)
+rule50 := rule_50_counter (2 bytes, big endian)
+    // this is a small defect from old version,
+    I didn't want to break backwards compatibility. Effectively means that there's
+    one byte left for something else in the future because rule50 always fits in one byte.
+
+movetext := <count><move_and_score>*
+count := number of plies in the movetext (2 bytes, big endian). Can be 0.
+move_and_score := <encoded_move><encoded_score> (~2 bytes)
+encoded_move := oof this one is complicated to explain.
+    https://github.com/Sopel97/nnue_data_compress/blob/master/src/compress_file.cpp#L827.
+    https://github.com/Sopel97/chess_pos_db/blob/master/docs/bcgn/variable_length.md
+
+encoded_score := https://en.wikipedia.org/wiki/Variable-width_encoding
+    with block size of 4 bits + 1 bit for extension bit.
+    Encoded value is signedToUnsigned(-prev_score - current_score)
+    (scores are always seen from the perspective of side to move in <pos>, that's why the '-' before prev_score)
+```
\ No newline at end of file
diff --git a/src/docs/convert.md b/src/docs/convert.md
new file mode 100644
index 00000000..05d230b2
--- /dev/null
+++ b/src/docs/convert.md
@@ -0,0 +1,15 @@
+# Convert
+
+`convert` allows conversion of training between any of `.plain`, `.bin`, and `.binpack`.
+
+As all commands in stockfish `convert` can be invoked either from command line (as `stockfish.exe convert ...`) or in the interactive prompt.
+
+The syntax of this command is as follows:
+```
+convert from_path to_path [append]
+```
+
+`from_path` is the path to the file to convert from. The type of the data is deduced based on its extension (one of `.plain`, `.bin`, `.binpack`).
+`to_path` is the path to an output file. The type of the data is deduced from its extension. If the file does not exist it is created.
+
+Last argument is optional. If not specified then they output file will be truncated prior to any writes. If the last argument is `append` then the converted training data will be appended to the end of the output file.
\ No newline at end of file
diff --git a/src/docs/gensfen.md b/src/docs/gensfen.md
new file mode 100644
index 00000000..c3e0a9c2
--- /dev/null
+++ b/src/docs/gensfen.md
@@ -0,0 +1,57 @@
+# Gensfen
+
+`gensfen` command allows generation of training data from self-play in a manner that suits training better than traditional games. It introduces random moves to diversify openings, allows reduced pruning, disabling of TT for less interference between searches, and fixed depth evaluation.
+
+As all commands in stockfish `gensfen` can be invoked either from command line (as `stockfish.exe gensfen ...`, but this is not recommended because it's not possible to specify UCI options before `gensfen` executes) or in the interactive prompt.
+
+`gensfen` takes named parameters in form `gensfen param_1_name param_1_value param_2_name param_2_value ...`.
+
+Currently the following options are available:
+
+`depth` - minimum depth of evaluation of each position. Default: 3.
+
+`depth2` - maximum depth of evaluation of each position. If not specified then the same as `depth`.
+
+`nodes` - the number of nodes to use for evaluation of each position. This number is multiplied by the number of PVs of the current search. This does NOT override the `depth` and `depth2` options. If specified then whichever of depth or nodes limit is reached first applies.
+
+`loop` - the number of training data entries to generate. 1 entry == 1 position.
+
+`output_file_name` - the name of the file to output to. If the extension is not present or doesn't match the selected training data format the right extension will be appened.
+
+`eval_limit` - evaluations with higher absolute value than this will not be written and will terminate a self-play game. Should not exceed 10000 which VALUE_KNOWN_WIN, but is only hardcapped at mate in 2 (\~30000).
+
+`random_move_minply` - the minimal ply at which a random move may be executed instead of a move chosen by search
+
+`random_move_maxply` - the maximal ply at which a random move may be executed instead of a move chosen by search
+
+`random_move_count` - maximum number of random moves in a single self-play game
+
+`random_move_like_apery` - either 0 or 1. If 1 then random king moves will be followed by a random king move from the opponent whenever possible with 50% probability.
+
+`random_multi_pv` - the number of PVs used for determining the random move. If not specified then a truly random move will be chosen. If specified then a multiPV search will be performed the random move will be one of the moves chosen by the search.
+
+`random_multi_pv_diff` - Makes the multiPV random move selection consider only moves that are at most `random_multi_pv_diff` worse than the next best move. Default: 30000 (all multiPV moves).
+
+`random_multi_pv_depth` - the depth to use for multiPV search for random move. Defaults to `depth2`.
+
+`write_minply` - minimum ply for which the training data entry will be emitted.
+
+`write_maxply` - maximum ply for which the training data entry will be emitted.
+
+`save_every` - the number of training data entries per file. If not specified then there will be always one file. If specified there may be more than one file generated (each having at most `save_every` training data entries) and each file will have a unique number attached.
+
+`random_file_name` - if specified then the output filename will be chosen randomly. Overrides `output_file_name`.
+
+`use_draw_in_training_data_generation` - either 0 or 1. If 1 then training data from drawn games will be emitted too. Default: 0.
+
+`write_out_draw_game_in_training_data_generation` - deprecated, alias for `use_draw_in_training_data_generation`
+
+`detect_draw_by_consecutive_low_score` - either 0 or 1. If 1 then drawn games will be adjudicated when the score remains 0 for at least 8 plies after ply 80. Default: 0.
+
+`use_game_draw_adjudication` - deprecated, alias for `detect_draw_by_consecutive_low_score`
+
+`detect_draw_by_insufficient_mating_material` - either 0 or 1. If 1 then position with insufficient material will be adjudicated as draws. Default: 0.
+
+`sfen_format` - format of the training data to use. Either `bin` or `binpack`. Default: `bin`.
+
+`seed` - seed for the PRNG. Can be either a number or a string. If it's a string then its hash will be used. If not specified then the current time will be used.
diff --git a/src/docs/learn.md b/src/docs/learn.md
new file mode 100644
index 00000000..d1347db1
--- /dev/null
+++ b/src/docs/learn.md
@@ -0,0 +1,92 @@
+# Learn
+
+`learn` command allows allows training a network from training data.
+
+As all commands in stockfish `learn` can be invoked either from command line (as `stockfish.exe learn ...`, but this is not recommended because it's not possible to specify UCI options before `learn` executes) or in the interactive prompt.
+
+`learn` takes named parameters in form `learn param_1_name param_1_value param_2_name param_2_value ...`. Unrecognized parameters form a list of paths to training data files.
+
+Currently the following options are available:
+
+`bat` - the size of a minibatch in multiples of 10000. The number of positions inbetween weights updates. Default: 1000 (meaning mini batch size of 1000000).
+
+`targetdir` - path to the direction from which training data will be read. All files in this directory are read sequentially. If not specified then only the list of files from positional arguments will be used. If specified then files from the given directory will be used after the explicitly specified files.
+
+`loop` - the number of times to loop over all training data.
+
+`basedir` - the base directory for the paths. Default: "" (current directory)
+
+`batchsize` - same as `bat` but doesn't scale by 10000
+
+`lr` - initial learning rate. Default: 1.
+
+`use_draw_games_in_training` - either 0 or 1. If 1 then draws will be used in training too. Default: 0.
+
+`use_draw_in_training` - deprecated, alias for `use_draw_games_in_training`
+
+`use_draw_games_in_validation` - either 0 or 1. If 1 then draws will be used in validation too. Default: 0.
+
+`use_draw_in_validation` - deprecated, alias for `use_draw_games_in_validation`
+
+`skip_duplicated_positions_in_training` - either 0 or 1. If 1 then a small hashtable will be used to try to eliminate duplicated position from training. Default: 0.
+
+`use_hash_in_training` - deprecated, alias for `skip_duplicated_positions_in_training`
+
+`winning_probability_coefficient` - some magic value for winning probability. If you need to read this then don't touch it. Default: 1.0 / PawnValueEg / 4.0 * std::log(10.0)
+
+`use_wdl` - either 0 or 1. If 1 then the evaluations will be converted to win/draw/loss percentages prior to learning on them. (Slightly changes the gradient because eval has a different derivative than wdl). Default: 0.
+
+`lambda` - value in range [0..1]. 1 means that only evaluation is used for learning, 0 means that only game result is used. Values inbetween result in interpolation between the two contributions. See `lambda_limit` for when this is applied. Default: 0.33.
+
+`lambda2` - value in range [0..1]. 1 means that only evaluation is used for learning, 0 means that only game result is used. Values inbetween result in interpolation between the two contributions. See `lambda_limit` for when this is applied. Default: 0.33.
+
+`lambda_limit` - the maximum absolute score value for which `lambda` is used as opposed to `lambda2`. For positions with absolute evaluation higher than `lambda_limit` `lambda2` will be used. Default: 32000 (so always `lambda`).
+
+`reduction_gameply` - the minimum ply after which positions won't be skipped. Positions at plies below this value are skipped with a probability that lessens linearly with the ply (reaching 0 at `reduction_gameply`). Default: 1.
+
+`eval_limit` - positions with absolute evaluation higher than this will be skipped. Default: 32000 (nothing is skipped).
+
+`save_only_once` - this is a modifier not a parameter, no value follows it. If specified then there will be only one network file generated.
+
+`no_shuffle` - this is a modifier not a parameter, no value follows it. If specified then data within a batch won't be shuffled.
+
+`nn_batch_size` - batch size used for learning. Default: 1000.
+
+`newbob_decay` - learning rate will be multiplied by this factor every time a net is rejected (so in other words it controls LR drops). Default: 1.0 (no LR drops)
+
+`newbob_num_trials` - determines after how many subsequent rejected nets the training process will be terminated. Default: 2.
+
+`nn_options` - if you're reading this you don't use it. It passes messages directly to the network evaluation. I don't know what it can do either.
+
+`eval_save_interval` - every `eval_save_interval` positions the network will be saved and either accepted or rejected (in which case an LR drop follows). Default: 1000000000 (1B). (generally people use values in 10M-100M range)
+
+`loss_output_interval` - every `loss_output_interval` fittness statistics are displayed. Default: `batchsize`
+
+`validation_set_file_name` - path to the file with training data to be used for validation (loss computation and move accuracy)
+
+`seed` - seed for the PRNG. Can be either a number or a string. If it's a string then its hash will be used. If not specified then the current time will be used.
+
+## Legacy subcommands and parameters
+
+### Convert
+
+`convert_plain`
+`convert_bin`
+`interpolate_eval`
+`check_invalid_fen`
+`check_illegal_move`
+`convert_bin_from_pgn-extract`
+`pgn_eval_side_to_move`
+`convert_no_eval_fens_as_score_zero`
+`src_score_min_value`
+`src_score_max_value`
+`dest_score_min_value`
+`dest_score_max_value`
+
+### Shuffle
+
+`shuffle`
+`buffer_size`
+`shuffleq`
+`shufflem`
+`output_file_name`
\ No newline at end of file

From 80cbc3ffee9f3c6d048107b437cdddbc3d69b34a Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 6 Oct 2020 10:55:42 +0200
Subject: [PATCH 182/398] Fix grammar and spelling. Add recommendations for UCI
 options.

---
 src/docs/convert.md |  4 ++--
 src/docs/gensfen.md | 14 +++++++++-----
 src/docs/learn.md   | 16 +++++++++++-----
 3 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/src/docs/convert.md b/src/docs/convert.md
index 05d230b2..2e07ec52 100644
--- a/src/docs/convert.md
+++ b/src/docs/convert.md
@@ -1,6 +1,6 @@
 # Convert
 
-`convert` allows conversion of training between any of `.plain`, `.bin`, and `.binpack`.
+`convert` allows conversion of training data between any of `.plain`, `.bin`, and `.binpack`.
 
 As all commands in stockfish `convert` can be invoked either from command line (as `stockfish.exe convert ...`) or in the interactive prompt.
 
@@ -12,4 +12,4 @@ convert from_path to_path [append]
 `from_path` is the path to the file to convert from. The type of the data is deduced based on its extension (one of `.plain`, `.bin`, `.binpack`).
 `to_path` is the path to an output file. The type of the data is deduced from its extension. If the file does not exist it is created.
 
-Last argument is optional. If not specified then they output file will be truncated prior to any writes. If the last argument is `append` then the converted training data will be appended to the end of the output file.
\ No newline at end of file
+The last argument is optional. If not specified then the output file will be truncated prior to any writes. If the last argument is `append` then the converted training data will be appended to the end of the output file.
\ No newline at end of file
diff --git a/src/docs/gensfen.md b/src/docs/gensfen.md
index c3e0a9c2..35c08582 100644
--- a/src/docs/gensfen.md
+++ b/src/docs/gensfen.md
@@ -1,10 +1,14 @@
 # Gensfen
 
-`gensfen` command allows generation of training data from self-play in a manner that suits training better than traditional games. It introduces random moves to diversify openings, allows reduced pruning, disabling of TT for less interference between searches, and fixed depth evaluation.
+`gensfen` command allows generation of training data from self-play in a manner that suits training better than traditional games. It introduces random moves to diversify openings, and fixed depth evaluation.
 
 As all commands in stockfish `gensfen` can be invoked either from command line (as `stockfish.exe gensfen ...`, but this is not recommended because it's not possible to specify UCI options before `gensfen` executes) or in the interactive prompt.
 
-`gensfen` takes named parameters in form `gensfen param_1_name param_1_value param_2_name param_2_value ...`.
+It is recommended to set the `PruneAtShallowDepth` UCI option to `false` as it will increase the quality of fixed depth searches.
+
+It is recommended to keep the `EnableTranspositionTable` UCI option at the default `true` value as it will make the generation process faster without noticably harming the uniformity of the data.
+
+`gensfen` takes named parameters in the form of `gensfen param_1_name param_1_value param_2_name param_2_value ...`.
 
 Currently the following options are available:
 
@@ -18,7 +22,7 @@ Currently the following options are available:
 
 `output_file_name` - the name of the file to output to. If the extension is not present or doesn't match the selected training data format the right extension will be appened.
 
-`eval_limit` - evaluations with higher absolute value than this will not be written and will terminate a self-play game. Should not exceed 10000 which VALUE_KNOWN_WIN, but is only hardcapped at mate in 2 (\~30000).
+`eval_limit` - evaluations with higher absolute value than this will not be written and will terminate a self-play game. Should not exceed 10000 which is VALUE_KNOWN_WIN, but is only hardcapped at mate in 2 (\~30000).
 
 `random_move_minply` - the minimal ply at which a random move may be executed instead of a move chosen by search
 
@@ -42,9 +46,9 @@ Currently the following options are available:
 
 `random_file_name` - if specified then the output filename will be chosen randomly. Overrides `output_file_name`.
 
-`use_draw_in_training_data_generation` - either 0 or 1. If 1 then training data from drawn games will be emitted too. Default: 0.
+`write_out_draw_game_in_training_data_generation` - either 0 or 1. If 1 then training data from drawn games will be emitted too. Default: 0.
 
-`write_out_draw_game_in_training_data_generation` - deprecated, alias for `use_draw_in_training_data_generation`
+`use_draw_in_training_data_generation` - deprecated, alias for `write_out_draw_game_in_training_data_generation`
 
 `detect_draw_by_consecutive_low_score` - either 0 or 1. If 1 then drawn games will be adjudicated when the score remains 0 for at least 8 plies after ply 80. Default: 0.
 
diff --git a/src/docs/learn.md b/src/docs/learn.md
index d1347db1..eab33607 100644
--- a/src/docs/learn.md
+++ b/src/docs/learn.md
@@ -1,14 +1,20 @@
 # Learn
 
-`learn` command allows allows training a network from training data.
+`learn` command allows training a network from training data.
 
 As all commands in stockfish `learn` can be invoked either from command line (as `stockfish.exe learn ...`, but this is not recommended because it's not possible to specify UCI options before `learn` executes) or in the interactive prompt.
 
-`learn` takes named parameters in form `learn param_1_name param_1_value param_2_name param_2_value ...`. Unrecognized parameters form a list of paths to training data files.
+`learn` takes named parameters in the form of `learn param_1_name param_1_value param_2_name param_2_value ...`. Unrecognized parameters form a list of paths to training data files.
+
+It is recommended to set the `EnableTranspositionTable` UCI option to `false` to reduce the interference between qsearches which are used to provide shallow evaluation. Using TT may cause the shallow evaluation to diverge from the real evaluation of the net, hiding imperfections.
+
+It is recommended to set the `PruneAtShallowDepth` UCI option to `false` as it will provide more accurate shallow evaluation.
+
+It is **required** to set the `Use NNUE` UCI option to `pure` as otherwise the function being optimized will not always match the function being probed, in which case not much can be learned.
 
 Currently the following options are available:
 
-`bat` - the size of a minibatch in multiples of 10000. The number of positions inbetween weights updates. Default: 1000 (meaning mini batch size of 1000000).
+`bat` - the size of a batch in multiples of 10000. This determines how many entries are read and shuffled at once during training. Default: 1000 (meaning batch size of 1000000).
 
 `targetdir` - path to the direction from which training data will be read. All files in this directory are read sequentially. If not specified then only the list of files from positional arguments will be used. If specified then files from the given directory will be used after the explicitly specified files.
 
@@ -50,7 +56,7 @@ Currently the following options are available:
 
 `no_shuffle` - this is a modifier not a parameter, no value follows it. If specified then data within a batch won't be shuffled.
 
-`nn_batch_size` - batch size used for learning. Default: 1000.
+`nn_batch_size` - minibatch size used for learning. Should be smaller than batch size. Default: 1000.
 
 `newbob_decay` - learning rate will be multiplied by this factor every time a net is rejected (so in other words it controls LR drops). Default: 1.0 (no LR drops)
 
@@ -60,7 +66,7 @@ Currently the following options are available:
 
 `eval_save_interval` - every `eval_save_interval` positions the network will be saved and either accepted or rejected (in which case an LR drop follows). Default: 1000000000 (1B). (generally people use values in 10M-100M range)
 
-`loss_output_interval` - every `loss_output_interval` fittness statistics are displayed. Default: `batchsize`
+`loss_output_interval` - every `loss_output_interval` fitness statistics are displayed. Default: `batchsize`
 
 `validation_set_file_name` - path to the file with training data to be used for validation (loss computation and move accuracy)
 

From 5fa28b12fa4dcece84db555ca19d15308b2f1e1a Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 7 Oct 2020 11:20:53 +0200
Subject: [PATCH 183/398] Allow setting UCI options programmatically.

---
 src/uci.cpp | 19 ++++++++++++-------
 src/uci.h   |  1 +
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/uci.cpp b/src/uci.cpp
index a123bbc0..166e437c 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -109,7 +109,7 @@ namespace {
   // setoption() is called when engine receives the "setoption" UCI command. The
   // function updates the UCI option ("name") to the given value ("value").
 
-  void setoption(istringstream& is) {
+  void setoption_from_stream(istringstream& is) {
 
     string token, name, value;
 
@@ -123,10 +123,7 @@ namespace {
     while (is >> token)
         value += (value.empty() ? "" : " ") + token;
 
-    if (Options.count(name))
-        Options[name] = value;
-    else
-        sync_cout << "No such option: " << name << sync_endl;
+    UCI::setoption(name, value);
   }
 
 
@@ -195,7 +192,7 @@ namespace {
             else
                trace_eval(pos);
         }
-        else if (token == "setoption")  setoption(is);
+        else if (token == "setoption")  setoption_from_stream(is);
         else if (token == "position")   position(pos, is, states);
         else if (token == "ucinewgame") { Search::clear(); elapsed = now(); } // Search::clear() may take some while
     }
@@ -212,6 +209,14 @@ namespace {
 
 } // namespace
 
+void UCI::setoption(const std::string& name, const std::string& value)
+{
+    if (Options.count(name))
+        Options[name] = value;
+    else
+        sync_cout << "No such option: " << name << sync_endl;
+}
+
 // The win rate model returns the probability (per mille) of winning given an eval
 // and a game-ply. The model fits rather accurately the LTC fishtest statistics.
 int UCI::win_rate_model(Value v, int ply) {
@@ -318,7 +323,7 @@ void UCI::loop(int argc, char* argv[]) {
                     << "\n"       << Options
                     << "\nuciok"  << sync_endl;
 
-      else if (token == "setoption")  setoption(is);
+      else if (token == "setoption")  setoption_from_stream(is);
       else if (token == "go")         go(pos, is, states);
       else if (token == "position")   position(pos, is, states);
       else if (token == "ucinewgame") Search::clear();
diff --git a/src/uci.h b/src/uci.h
index 2e0f5c11..192963cb 100644
--- a/src/uci.h
+++ b/src/uci.h
@@ -75,6 +75,7 @@ std::string wdl(Value v, int ply);
 int win_rate_model(Value v, int ply);
 double win_rate_model_double(double v, int ply);
 Move to_move(const Position& pos, std::string& str);
+void setoption(const std::string& name, const std::string& value);
 
 } // namespace UCI
 

From d1c44dca042392b2bf0ceb0c8901d52ca92fa023 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 7 Oct 2020 11:27:52 +0200
Subject: [PATCH 184/398] Switch to set recommended gensfen UCI options

---
 src/docs/gensfen.md   |  2 ++
 src/learn/gensfen.cpp | 10 ++++++++++
 2 files changed, 12 insertions(+)

diff --git a/src/docs/gensfen.md b/src/docs/gensfen.md
index 35c08582..e4263a92 100644
--- a/src/docs/gensfen.md
+++ b/src/docs/gensfen.md
@@ -12,6 +12,8 @@ It is recommended to keep the `EnableTranspositionTable` UCI option at the defau
 
 Currently the following options are available:
 
+`set_recommended_uci_options` - this is a modifier not a parameter, no value follows it. If specified then some UCI options are set to recommended values.
+
 `depth` - minimum depth of evaluation of each position. Default: 3.
 
 `depth2` - maximum depth of evaluation of each position. If not specified then the same as `depth`.
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 6fc59be9..5720236d 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -1014,6 +1014,16 @@ namespace Learner
                 is >> sfen_format;
             else if (token == "seed")
                 is >> seed;
+            else if (token == "set_recommended_uci_options")
+            {
+                UCI::setoption("Contempt", "0");
+                UCI::setoption("Skill Level", "20");
+                UCI::setoption("UCI_Chess960", "false");
+                UCI::setoption("UCI_AnalyseMode", "false");
+                UCI::setoption("UCI_LimitStrength", "false");
+                UCI::setoption("PruneAtShallowDepth", "false");
+                UCI::setoption("EnableTranspositionTable", "true");
+            }
             else
                 cout << "Error! : Illegal token " << token << endl;
         }

From 2e57f3fa222d9dd879662864799c1896f732de11 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 7 Oct 2020 11:30:46 +0200
Subject: [PATCH 185/398] Switch to set recommended learn UCI options

---
 src/docs/learn.md   |  2 ++
 src/learn/learn.cpp | 12 ++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/src/docs/learn.md b/src/docs/learn.md
index eab33607..5cd2e8b5 100644
--- a/src/docs/learn.md
+++ b/src/docs/learn.md
@@ -14,6 +14,8 @@ It is **required** to set the `Use NNUE` UCI option to `pure` as otherwise the f
 
 Currently the following options are available:
 
+`set_recommended_uci_options` - this is a modifier not a parameter, no value follows it. If specified then some UCI options are set to recommended values.
+
 `bat` - the size of a batch in multiples of 10000. This determines how many entries are read and shuffled at once during training. Default: 1000 (meaning batch size of 1000000).
 
 `targetdir` - path to the direction from which training data will be read. All files in this directory are read sequentially. If not specified then only the list of files from positional arguments will be used. If specified then files from the given directory will be used after the explicitly specified files.
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 80de6a57..6bba1dda 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -1745,6 +1745,18 @@ namespace Learner
             else if (option == "dest_score_min_value") is >> dest_score_min_value;
             else if (option == "dest_score_max_value") is >> dest_score_max_value;
             else if (option == "seed") is >> seed;
+            else if (option == "set_recommended_uci_options")
+            {
+                UCI::setoption("Use NNUE", "pure");
+                UCI::setoption("MultiPV", "1");
+                UCI::setoption("Contempt", "0");
+                UCI::setoption("Skill Level", "20");
+                UCI::setoption("UCI_Chess960", "false");
+                UCI::setoption("UCI_AnalyseMode", "false");
+                UCI::setoption("UCI_LimitStrength", "false");
+                UCI::setoption("PruneAtShallowDepth", "false");
+                UCI::setoption("EnableTranspositionTable", "false");
+            }
             // Otherwise, it's a filename.
             else
                 filenames.push_back(option);

From 8830209125bcf9a5aca8eba995c0687ed3c93ab2 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 7 Oct 2020 11:46:26 +0200
Subject: [PATCH 186/398] Change some learn parameter defaults.

---
 src/docs/learn.md   | 16 ++++++++--------
 src/learn/learn.cpp | 14 +++++++-------
 src/learn/learn.h   |  2 +-
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/docs/learn.md b/src/docs/learn.md
index 5cd2e8b5..4c8c3fc1 100644
--- a/src/docs/learn.md
+++ b/src/docs/learn.md
@@ -28,11 +28,11 @@ Currently the following options are available:
 
 `lr` - initial learning rate. Default: 1.
 
-`use_draw_games_in_training` - either 0 or 1. If 1 then draws will be used in training too. Default: 0.
+`use_draw_games_in_training` - either 0 or 1. If 1 then draws will be used in training too. Default: 1.
 
 `use_draw_in_training` - deprecated, alias for `use_draw_games_in_training`
 
-`use_draw_games_in_validation` - either 0 or 1. If 1 then draws will be used in validation too. Default: 0.
+`use_draw_games_in_validation` - either 0 or 1. If 1 then draws will be used in validation too. Default: 1.
 
 `use_draw_in_validation` - deprecated, alias for `use_draw_games_in_validation`
 
@@ -44,9 +44,9 @@ Currently the following options are available:
 
 `use_wdl` - either 0 or 1. If 1 then the evaluations will be converted to win/draw/loss percentages prior to learning on them. (Slightly changes the gradient because eval has a different derivative than wdl). Default: 0.
 
-`lambda` - value in range [0..1]. 1 means that only evaluation is used for learning, 0 means that only game result is used. Values inbetween result in interpolation between the two contributions. See `lambda_limit` for when this is applied. Default: 0.33.
+`lambda` - value in range [0..1]. 1 means that only evaluation is used for learning, 0 means that only game result is used. Values inbetween result in interpolation between the two contributions. See `lambda_limit` for when this is applied. Default: 1.0.
 
-`lambda2` - value in range [0..1]. 1 means that only evaluation is used for learning, 0 means that only game result is used. Values inbetween result in interpolation between the two contributions. See `lambda_limit` for when this is applied. Default: 0.33.
+`lambda2` - value in range [0..1]. 1 means that only evaluation is used for learning, 0 means that only game result is used. Values inbetween result in interpolation between the two contributions. See `lambda_limit` for when this is applied. Default: 1.0.
 
 `lambda_limit` - the maximum absolute score value for which `lambda` is used as opposed to `lambda2`. For positions with absolute evaluation higher than `lambda_limit` `lambda2` will be used. Default: 32000 (so always `lambda`).
 
@@ -60,15 +60,15 @@ Currently the following options are available:
 
 `nn_batch_size` - minibatch size used for learning. Should be smaller than batch size. Default: 1000.
 
-`newbob_decay` - learning rate will be multiplied by this factor every time a net is rejected (so in other words it controls LR drops). Default: 1.0 (no LR drops)
+`newbob_decay` - learning rate will be multiplied by this factor every time a net is rejected (so in other words it controls LR drops). Default: 0.5 (no LR drops)
 
-`newbob_num_trials` - determines after how many subsequent rejected nets the training process will be terminated. Default: 2.
+`newbob_num_trials` - determines after how many subsequent rejected nets the training process will be terminated. Default: 4.
 
 `nn_options` - if you're reading this you don't use it. It passes messages directly to the network evaluation. I don't know what it can do either.
 
-`eval_save_interval` - every `eval_save_interval` positions the network will be saved and either accepted or rejected (in which case an LR drop follows). Default: 1000000000 (1B). (generally people use values in 10M-100M range)
+`eval_save_interval` - every `eval_save_interval` positions the network will be saved and either accepted or rejected (in which case an LR drop follows). Default: 100000000 (100M). (generally people use values in 10M-100M range)
 
-`loss_output_interval` - every `loss_output_interval` fitness statistics are displayed. Default: `batchsize`
+`loss_output_interval` - every `loss_output_interval` fitness statistics are displayed. Default: 1000000 (1M)
 
 `validation_set_file_name` - path to the file with training data to be used for validation (loss computation and move accuracy)
 
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 6bba1dda..c3335e37 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -77,8 +77,8 @@ T operator -= (std::atomic<T>& x, const T rhs) { return x += -rhs; }
 
 namespace Learner
 {
-    static bool use_draw_games_in_training = false;
-    static bool use_draw_games_in_validation = false;
+    static bool use_draw_games_in_training = true;
+    static bool use_draw_games_in_validation = true;
     static bool skip_duplicated_positions_in_training = true;
 
     static double winning_probability_coefficient = 1.0 / PawnValueEg / 4.0 * std::log(10.0);
@@ -1632,8 +1632,8 @@ namespace Learner
         global_learning_rate = 1.0;
 
         // elmo lambda
-        ELMO_LAMBDA = 0.33;
-        ELMO_LAMBDA2 = 0.33;
+        ELMO_LAMBDA = 1.0;
+        ELMO_LAMBDA2 = 1.0;
         ELMO_LAMBDA_LIMIT = 32000;
 
         // if (gamePly <rand(reduction_gameply)) continue;
@@ -1642,12 +1642,12 @@ namespace Learner
         int reduction_gameply = 1;
 
         uint64_t nn_batch_size = 1000;
-        double newbob_decay = 1.0;
-        int newbob_num_trials = 2;
+        double newbob_decay = 0.5;
+        int newbob_num_trials = 4;
         string nn_options;
 
         uint64_t eval_save_interval = LEARN_EVAL_SAVE_INTERVAL;
-        uint64_t loss_output_interval = 0;
+        uint64_t loss_output_interval = 1'000'000;
 
         string validation_set_file_name;
         string seed;
diff --git a/src/learn/learn.h b/src/learn/learn.h
index 4b09f825..3ba75ce3 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -64,7 +64,7 @@ namespace Learner
     // Needless to say, the longer the saving interval, the shorter the learning time.
     // Folder name is incremented for each save like 0/, 1/, 2/...
     // By default, once every 1 billion phases.
-    constexpr std::size_t LEARN_EVAL_SAVE_INTERVAL = 1000000000ULL;
+    constexpr std::size_t LEARN_EVAL_SAVE_INTERVAL = 100'000'000ULL;
 
     // Reduce the output of rmse during learning to 1 for this number of times.
     // rmse calculation is done in one thread, so it takes some time, so reducing the output is effective.

From 3f55b3af42fc569dfae7b6f5bd3d946ba4d5891e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 7 Oct 2020 11:46:34 +0200
Subject: [PATCH 187/398] Change some gensfen parameter defaults.

---
 src/docs/gensfen.md   | 28 ++++++++++++++--------------
 src/learn/gensfen.cpp |  8 ++++----
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/src/docs/gensfen.md b/src/docs/gensfen.md
index e4263a92..ce0f365c 100644
--- a/src/docs/gensfen.md
+++ b/src/docs/gensfen.md
@@ -20,44 +20,44 @@ Currently the following options are available:
 
 `nodes` - the number of nodes to use for evaluation of each position. This number is multiplied by the number of PVs of the current search. This does NOT override the `depth` and `depth2` options. If specified then whichever of depth or nodes limit is reached first applies.
 
-`loop` - the number of training data entries to generate. 1 entry == 1 position.
+`loop` - the number of training data entries to generate. 1 entry == 1 position. Default: 8000000000 (8B).
 
-`output_file_name` - the name of the file to output to. If the extension is not present or doesn't match the selected training data format the right extension will be appened.
+`output_file_name` - the name of the file to output to. If the extension is not present or doesn't match the selected training data format the right extension will be appened. Default: generated_kifu
 
-`eval_limit` - evaluations with higher absolute value than this will not be written and will terminate a self-play game. Should not exceed 10000 which is VALUE_KNOWN_WIN, but is only hardcapped at mate in 2 (\~30000).
+`eval_limit` - evaluations with higher absolute value than this will not be written and will terminate a self-play game. Should not exceed 10000 which is VALUE_KNOWN_WIN, but is only hardcapped at mate in 2 (\~30000). Default: 3000
 
-`random_move_minply` - the minimal ply at which a random move may be executed instead of a move chosen by search
+`random_move_minply` - the minimal ply at which a random move may be executed instead of a move chosen by search. Default: 1.
 
-`random_move_maxply` - the maximal ply at which a random move may be executed instead of a move chosen by search
+`random_move_maxply` - the maximal ply at which a random move may be executed instead of a move chosen by search. Default: 24.
 
-`random_move_count` - maximum number of random moves in a single self-play game
+`random_move_count` - maximum number of random moves in a single self-play game. Default: 5.
 
-`random_move_like_apery` - either 0 or 1. If 1 then random king moves will be followed by a random king move from the opponent whenever possible with 50% probability.
+`random_move_like_apery` - either 0 or 1. If 1 then random king moves will be followed by a random king move from the opponent whenever possible with 50% probability. Default: 0.
 
 `random_multi_pv` - the number of PVs used for determining the random move. If not specified then a truly random move will be chosen. If specified then a multiPV search will be performed the random move will be one of the moves chosen by the search.
 
 `random_multi_pv_diff` - Makes the multiPV random move selection consider only moves that are at most `random_multi_pv_diff` worse than the next best move. Default: 30000 (all multiPV moves).
 
-`random_multi_pv_depth` - the depth to use for multiPV search for random move. Defaults to `depth2`.
+`random_multi_pv_depth` - the depth to use for multiPV search for random move. Default: `depth2`.
 
-`write_minply` - minimum ply for which the training data entry will be emitted.
+`write_minply` - minimum ply for which the training data entry will be emitted. Default: 16.
 
-`write_maxply` - maximum ply for which the training data entry will be emitted.
+`write_maxply` - maximum ply for which the training data entry will be emitted. Default: 400.
 
 `save_every` - the number of training data entries per file. If not specified then there will be always one file. If specified there may be more than one file generated (each having at most `save_every` training data entries) and each file will have a unique number attached.
 
 `random_file_name` - if specified then the output filename will be chosen randomly. Overrides `output_file_name`.
 
-`write_out_draw_game_in_training_data_generation` - either 0 or 1. If 1 then training data from drawn games will be emitted too. Default: 0.
+`write_out_draw_game_in_training_data_generation` - either 0 or 1. If 1 then training data from drawn games will be emitted too. Default: 1.
 
 `use_draw_in_training_data_generation` - deprecated, alias for `write_out_draw_game_in_training_data_generation`
 
-`detect_draw_by_consecutive_low_score` - either 0 or 1. If 1 then drawn games will be adjudicated when the score remains 0 for at least 8 plies after ply 80. Default: 0.
+`detect_draw_by_consecutive_low_score` - either 0 or 1. If 1 then drawn games will be adjudicated when the score remains 0 for at least 8 plies after ply 80. Default: 1.
 
 `use_game_draw_adjudication` - deprecated, alias for `detect_draw_by_consecutive_low_score`
 
-`detect_draw_by_insufficient_mating_material` - either 0 or 1. If 1 then position with insufficient material will be adjudicated as draws. Default: 0.
+`detect_draw_by_insufficient_mating_material` - either 0 or 1. If 1 then position with insufficient material will be adjudicated as draws. Default: 1.
 
-`sfen_format` - format of the training data to use. Either `bin` or `binpack`. Default: `bin`.
+`sfen_format` - format of the training data to use. Either `bin` or `binpack`. Default: `binpack`.
 
 `seed` - seed for the PRNG. Can be either a number or a string. If it's a string then its hash will be used. If not specified then the current time will be used.
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 5720236d..8ceb04e2 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -43,9 +43,9 @@ namespace Learner
         Binpack
     };
 
-    static bool write_out_draw_game_in_training_data_generation = false;
-    static bool detect_draw_by_consecutive_low_score = false;
-    static bool detect_draw_by_insufficient_mating_material = false;
+    static bool write_out_draw_game_in_training_data_generation = true;
+    static bool detect_draw_by_consecutive_low_score = true;
+    static bool detect_draw_by_insufficient_mating_material = true;
 
     static SfenOutputType sfen_output_type = SfenOutputType::Bin;
 
@@ -954,7 +954,7 @@ namespace Learner
         // Add a random number to the end of the file name.
         bool random_file_name = false;
 
-        std::string sfen_format;
+        std::string sfen_format = "binpack";
         std::string seed;
 
         while (true)

From adddf339bba43f0b8210f8ccef376966bcc1ac61 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Wed, 7 Oct 2020 16:07:29 +0200
Subject: [PATCH 188/398] Output sfens/second in the trainer, to track
 performance more easily

---
 src/learn/learn.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index c3335e37..5a540d31 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -28,6 +28,7 @@
 #include "tt.h"
 #include "uci.h"
 #include "search.h"
+#include "timeman.h"
 
 #include "extra/nnue_data_binpack_format.h"
 
@@ -845,9 +846,11 @@ namespace Learner
         // so at this timing the generation of the replacement table is updated.
         // It doesn't matter if you have disabled the substitution table.
         TT.new_search();
+        TimePoint elapsed = now() - Search::Limits.startTime + 1;
 
         cout << "PROGRESS: " << now_string() << ", ";
-        cout << sr.total_done << " sfens";
+        cout << sr.total_done << " sfens, ";
+        cout << sr.total_done * 1000 / elapsed  << " sfens/second";
         cout << ", iteration " << epoch;
         cout << ", learning rate = " << global_learning_rate << ", ";
 
@@ -1930,6 +1933,8 @@ namespace Learner
         {
           auto& limits = Search::Limits;
 
+          limits.startTime = now();
+
           // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
           limits.infinite = true;
 

From ef57ac78a339f2233242aed1a04838d0727296eb Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 8 Oct 2020 17:07:07 +0200
Subject: [PATCH 189/398] Print gensfen speed when outputting status.

---
 src/learn/gensfen.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 8ceb04e2..5f7541f5 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -229,10 +229,18 @@ namespace Learner
         // Dedicated thread to write to file
         void file_write_worker()
         {
+            auto startTime = now();
+
             auto output_status = [&]()
             {
                 // Also output the current time to console.
-                sync_cout << endl << sfen_write_count << " sfens , at " << now_string() << sync_endl;
+                const auto nowTime = now();
+                const TimePoint elapsed = nowTime - startTime + 1;
+
+                sync_cout << endl
+                    << sfen_write_count << " sfens, "
+                    << sfen_write_count * 1000 / elapsed << " sfens/second, "
+                    << "at " << now_string() << sync_endl;
             };
 
             while (!finished || sfen_buffers_pool.size())

From 2af4bf7eacdfbe02bde6ce714bf2f91d19119e89 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 9 Oct 2020 10:16:02 +0200
Subject: [PATCH 190/398] Move the docs folder one above, it was in src by
 mistake.

---
 {src/docs => docs}/binpack.md | 0
 {src/docs => docs}/convert.md | 0
 {src/docs => docs}/gensfen.md | 0
 {src/docs => docs}/learn.md   | 0
 4 files changed, 0 insertions(+), 0 deletions(-)
 rename {src/docs => docs}/binpack.md (100%)
 rename {src/docs => docs}/convert.md (100%)
 rename {src/docs => docs}/gensfen.md (100%)
 rename {src/docs => docs}/learn.md (100%)

diff --git a/src/docs/binpack.md b/docs/binpack.md
similarity index 100%
rename from src/docs/binpack.md
rename to docs/binpack.md
diff --git a/src/docs/convert.md b/docs/convert.md
similarity index 100%
rename from src/docs/convert.md
rename to docs/convert.md
diff --git a/src/docs/gensfen.md b/docs/gensfen.md
similarity index 100%
rename from src/docs/gensfen.md
rename to docs/gensfen.md
diff --git a/src/docs/learn.md b/docs/learn.md
similarity index 100%
rename from src/docs/learn.md
rename to docs/learn.md

From de20887e110bc70eaeb4b52e33694fc4a3b22738 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 9 Oct 2020 10:53:47 +0200
Subject: [PATCH 191/398] Update readme. Link to docs.

---
 README.md | 71 +++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 50 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index f84a544a..84898792 100644
--- a/README.md
+++ b/README.md
@@ -5,9 +5,11 @@
 <h1 align="center">Stockfish NNUE</h1>
 
 ## Overview
+
 Stockfish NNUE is a port of a shogi neural network named NNUE (efficiently updateable neural network backwards) to Stockfish 11. To learn more about the Stockfish chess engine, look [here](stockfish.md) for an overview and [here](https://github.com/official-stockfish/Stockfish) for the official repository.
 
 ## Building
+
 To compile:
 ```
 make -jN ARCH=... build
@@ -33,8 +35,11 @@ Additional options:
 - `blas=[yes/no]` - whether to use an external BLAS library. Default is `no`. Using an external BLAS library may have a significantly improve learning performance and by default expects openBLAS to be installed.
 
 ## Training Guide
+
 ### Generating Training Data
-To generate training data from the classic eval, use the gensfen command with the setting "Use NNUE" set to "false". The given example is generation in its simplest form. There are more commands. 
+
+To generate training data from the classic eval, use the gensfen command with the setting "Use NNUE" set to "false". The given example is generation in its simplest form. There are more commands.
+
 ```
 uci
 setoption name PruneAtShallowDepth value false
@@ -45,16 +50,26 @@ setoption name SyzygyPath value path
 isready
 gensfen depth a loop b use_draw_in_training_data_generation 1 eval_limit 32000
 ```
-Specify how many threads and how much memory you would like to use with the x and y values. The option SyzygyPath is not necessary, but if you would like to use it, you must first have Syzygy endgame tablebases on your computer, which you can find [here](http://oics.olympuschess.com/tracker/index.php). You will need to have a torrent client to download these tablebases, as that is probably the fastest way to obtain them. The path is the path to the folder containing those tablebases. It does not have to be surrounded in quotes.
 
-This will save a file named "generated_kifu.bin" in the same folder as the binary. Once generation is done, rename the file to something like "1billiondepth12.bin" to remember the depth and quantity of the positions and move it to a folder named "trainingdata" in the same directory as the binaries.
-#### Generation Parameters
-- Depth is the searched depth per move, or how far the engine looks forward. This value is an integer.
-- Loop is the amount of positions generated. This value is also an integer
-### Generating Validation Data
-The process is the same as the generation of training data, except for the fact that you need to set loop to 1 million, because you don't need a lot of validation data. The depth should be the same as before or slightly higher than the depth of the training data. After generation rename the validation data file to val.bin and drop it in a folder named "validationdata" in the same directory to make it easier. 
-### Training a Completely New Network
-Use the "learn" binary. Create an empty folder named "evalsave" in the same directory as the binaries.
+- `depth` is the searched depth per move, or how far the engine looks forward. This value is an integer.
+- `loop` is the amount of positions generated. This value is also an integer.
+
+Specify how many threads and how much memory you would like to use with the `x` and `y` values. The option SyzygyPath is not necessary, but if you would like to use it, you must first have Syzygy endgame tablebases on your computer, which you can find [here](http://oics.olympuschess.com/tracker/index.php). You will need to have a torrent client to download these tablebases, as that is probably the fastest way to obtain them. The `path` is the path to the folder containing those tablebases. It does not have to be surrounded in quotes.
+
+This will create a file named "generated_kifu.binpack" in the same folder as the binary containing the generated training data. Once generation is done, you can rename the file to something like "1billiondepth12.binpack" to remember the depth and quantity of the positions and move it to a folder named "trainingdata" in the same directory as the binaries.
+
+You will also need validation data that is used for loss calculation and accuracy computation. Validation data is generated in the same way as training data, but generally at most 1 million positions should be used as there's no need for more and it would just slow the learning process down. It may also be better to slightly increase the depth for validation data. After generation you can rename the validation data file to "val.binpack" and drop it in a folder named "validationdata" in the same directory to make it easier.
+
+More information about gensfen and available options can be found in the [docs](docs/gensfen.md)
+
+### Training a network
+
+#### Training a Completely New Network
+
+Whether a new network is created or not is controlled by the UCI option `SkipLoadingEval`. If set to true then a new network will be created, which allows learning from scratch. If left at its default (false) then a network will be loaded and trained further. The second scenario is described in the reinforcement learning paragraph.
+
+A simple command chain to start with training could look like this:
+
 ```
 uci
 setoption name EnableTranspositionTable value false
@@ -63,31 +78,45 @@ setoption name SkipLoadingEval value true
 setoption name Use NNUE value pure
 setoption name Threads value x
 isready
-learn targetdir trainingdata loop 100 batchsize 1000000 use_draw_in_training 1 use_draw_in_validation 1 lr 1 lambda 1 eval_limit 32000 nn_batch_size 1000 newbob_decay 0.5 eval_save_interval 250000000 loss_output_interval 1000000 validation_set_file_name validationdata\val.bin
+learn targetdir trainingdata loop 100 batchsize 1000000 use_draw_in_training 1 use_draw_in_validation 1 lr 1 lambda 1 eval_limit 32000 nn_batch_size 1000 newbob_decay 0.5 eval_save_interval 250000000 loss_output_interval 1000000 validation_set_file_name validationdata\val.binpack
 ```
-Nets get saved in the "evalsave" folder. 
 
-#### Training Parameters
-- eta is the learning rate
-- lambda is the amount of weight it puts to eval of learning data vs win/draw/loss results. 1 puts all weight on eval, lambda 0 puts all weight on WDL results.
+This will utilize training data files in the "trainingdata" directory and validation data from file "validationdata\val.bin". Produced nets are saved in the "evalsave" folder.
 
-### Reinforcement Learning
-If you would like to do some reinforcement learning on your original network, you must first generate training data using the learn binaries with the setting `Use NNUE` set to `pure`. Make sure that your previously trained network is in the eval folder. Use the commands specified above. Make sure `SkipLoadingEval` is set to false so that the data generated is using the neural net's eval by typing the command `setoption name SkipLoadingEval value false` before typing the `isready` command. You should aim to generate less positions than the first run, around 1/10 of the number of positions generated in the first run. The depth should be higher as well. You should also do the same for validation data, with the depth being higher than the last run.
+More information about learn and available parameters can be found in the [docs](docs/learn.md)
 
-After you have generated the training data, you must move it into your training data folder and delete the older data so that the binary does not accidentally train on the same data again. Do the same for the validation data and name it to val-1.bin to make it less confusing. Make sure the evalsave folder is empty. Then, using the same binary, type in the training commands shown above. Do __NOT__ set `SkipLoadingEval` to true, it must be false or you will get a completely new network, instead of a network trained with reinforcement learning. You should also set eval_save_interval to a number that is lower than the amount of positions in your training data, perhaps also 1/10 of the original value. The validation file should be set to the new validation data, not the old data.
+#### Reinforcement Learning
 
-After training is finished, your new net should be located in the "final" folder under the "evalsave" directory. You should test this new network against the older network to see if there are any improvements.
+If you would like to do some reinforcement learning on your original network, you must first generate training data with the setting `Use NNUE` set to `pure` and using the previous network (either name it "nn.bin" and put into alongside the binary or provide the `EvalFile` UCI option). Use the commands specified above. You should aim to generate less positions than the first run, around 1/10 of the number of positions generated in the first run. The depth should be higher as well. You should also do the same for validation data, with the depth being higher than the last run.
+
+After you have generated the training data, you must move it into your training data folder and move the older data so that the binary does not train on the same data again. Do the same for the validation data. Make sure the "evalsave" folder is empty. Then, using the same binary, type in the training commands shown above. Do __NOT__ set `SkipLoadingEval` to true, it must be false or you will get a completely new network, instead of a network trained with reinforcement learning. You should also set `eval_save_interval` to a number that is lower than the amount of positions in your training data, perhaps also 1/10 of the original value.
+
+After training is finished, your new net should be located in the "final" folder under the "evalsave" directory. You should test this new network against the older network to see if there are any improvements. Don't rely on the automatic rejection for network quality, sometimes even rejected nets can be better than the previous ones.
 
 ## Using Your Trained Net
+
 If you want to use your generated net, copy the net located in the "final" folder under the "evalsave" directory and move it into a new folder named "eval" under the directory with the binaries. You can then use the halfkp_256x2 binaries pertaining to your CPU with a standard chess GUI, such as Cutechess. Refer to the [releases page](https://abrok.eu/stockfish) to find out which binary is best for your CPU.
 
-If the engine does not load any net file, or shows "Error! *** not found or wrong format", please try to specify the net with the full file path with the "EvalFile" option by typing the command `setoption name EvalFile value path` where path is the full file path. The "Use NNUE" option must be set to true with the command `setoption name Use NNUE value true`.
+If the engine does not load any net file, or shows "Error! *** not found or wrong format", please try to specify the net with the full file path with the `EvalFile` UCI option by typing the command `setoption name EvalFile value path` where path is the full file path. The `Use NNUE` UCI option must be set either to `true` or `pure` with the command `setoption name Use NNUE value true/pure`.
+
+## Training data formats.
+
+Currently there are 3 training data formats. Two of them are supported directly.
+
+- `.bin` - the original training data format. Uses 40 bytes per entry. Is supported directly by the `gensfen` and `learn` commands.
+- `.plain` - a human readable training data format. This one is not supported directly by the `gensfen` and `learn` commands. It should not be used for data exchange because it's less compact than other formats. It is mostly useful for inspection of the data.
+- `.binpack` - a compact binary training data format that exploits positions chains to further reduce size. It uses on average between 2 to 3 bytes per entry when generating data with `gensfen`. It is supported directly by `gensfen` and `learn` commands. It is currently the default for the `gensfen` command. A more in depth description can be found [here](docs/binpack.md)
+
+### Conversion between formats.
+
+There is a builting converted that support all 3 formats described above. Any of them can be converted to any other. For more information and usage guide see [here](docs/convert.md).
 
 ## Resources
+
 - [Stockfish NNUE Wiki](https://www.qhapaq.org/shogi/shogiwiki/stockfish-nnue/)
 - [Training instructions](https://twitter.com/mktakizawa/status/1273042640280252416) from the creator of the Elmo shogi engine
 - [Original Talkchess thread](http://talkchess.com/forum3/viewtopic.php?t=74059) discussing Stockfish NNUE
-- [Guide to Stockfish NNUE](http://yaneuraou.yaneu.com/2020/06/19/stockfish-nnue-the-complete-guide/) 
+- [Guide to Stockfish NNUE](http://yaneuraou.yaneu.com/2020/06/19/stockfish-nnue-the-complete-guide/)
 - [Unofficial Stockfish Discord](https://discord.gg/nv8gDtt)
 
 A more updated list can be found in the #sf-nnue-resources channel in the Discord.

From 7d62b3f79959c2c5d44bdd3118734a5f8dd7bc26 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 11 Oct 2020 12:01:23 +0200
Subject: [PATCH 192/398] Store additional bits for fullmove clock and 50 more
 rule halfmove clock at the end of the bit stream. This change keeps backwards
 compatibility.

---
 src/extra/nnue_data_binpack_format.h | 90 +++++++++++++++-------------
 src/learn/sfen_packer.cpp            | 76 +++++++++++------------
 2 files changed, 86 insertions(+), 80 deletions(-)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index 7ceafbc0..826b2959 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -4482,12 +4482,12 @@ namespace chess
             return m_ply;
         }
 
-        [[nodiscard]] inline std::uint16_t halfMove() const
+        [[nodiscard]] inline std::uint16_t fullMove() const
         {
             return (m_ply + 1) / 2;
         }
 
-        inline void setHalfMove(std::uint16_t hm)
+        inline void setFullMove(std::uint16_t hm)
         {
             m_ply = 2 * hm - 1 + (m_sideToMove == Color::Black);
         }
@@ -5366,10 +5366,10 @@ namespace chess
         }
 
         {
-            const auto halfMove = nextPart();
-            if (!halfMove.empty())
+            const auto fullMove = nextPart();
+            if (!fullMove.empty())
             {
-                m_ply = std::stoi(halfMove.data()) * 2 - (m_sideToMove == Color::White);
+                m_ply = std::stoi(fullMove.data()) * 2 - (m_sideToMove == Color::White);
             }
             else
             {
@@ -5419,7 +5419,7 @@ namespace chess
         fen += std::to_string(m_rule50Counter);
 
         fen += ' ';
-        fen += std::to_string(halfMove());
+        fen += std::to_string(fullMove());
 
         return fen;
     }
@@ -5862,43 +5862,24 @@ namespace binpack
         // Huffman coding
         // * is simplified from mini encoding to make conversion easier.
         //
-        // 1 box on the board (other than NO_PIECE) = 2 to 6 bits (+ 1-bit flag + 1-bit forward and backward)
-        // 1 piece of hand piece = 1-5bit (+ 1-bit flag + 1bit ahead and behind)
-        //
-        // empty xxxxx0 + 0 (none)
-        // step xxxx01 + 2 xxxx0 + 2
-        // incense xx0011 + 2 xx001 + 2
-        // Katsura xx1011 + 2 xx101 + 2
-        // silver xx0111 + 2 xx011 + 2
-        // Gold x01111 + 1 x0111 + 1 // Gold is valid and has no flags.
-        // corner 011111 + 2 01111 + 2
-        // Fly 111111 + 2 11111 + 2
-        //
-        // Assuming all pieces are on the board,
-        // Sky 81-40 pieces = 41 boxes = 41bit
-        // Walk 4bit*18 pieces = 72bit
-        // Incense 6bit*4 pieces = 24bit
-        // Katsura 6bit*4 pieces = 24bit
-        // Silver 6bit*4 pieces = 24bit
-        // Gold 6bit* 4 pieces = 24bit
-        // corner 8bit* 2 pieces = 16bit
-        // Fly 8bit* 2 pieces = 16bit
-        // -------
-        // 241bit + 1bit (turn) + 7bit × 2 (King's position after) = 256bit
-        //
-        // When the piece on the board moves to the hand piece, the piece on the board becomes empty, so the box on the board can be expressed with 1 bit,
-        // Since the hand piece can be expressed by 1 bit less than the piece on the board, the total number of bits does not change in the end.
-        // Therefore, in this expression, any aspect can be expressed by this bit number.
-        // It is a hand piece and no flag is required, but if you include this, the bit number of the piece on the board will be -1
-        // Since the total number of bits can be fixed, we will include this as well.
-
         // Huffman Encoding
         //
         // Empty  xxxxxxx0
-        // Pawn   xxxxx001 + 1 bit (Side to move)
-        // Knight xxxxx011 + 1 bit (Side to move)
-        // Bishop xxxxx101 + 1 bit (Side to move)
-        // Rook   xxxxx111 + 1 bit (Side to move)
+        // Pawn   xxxxx001 + 1 bit (Color)
+        // Knight xxxxx011 + 1 bit (Color)
+        // Bishop xxxxx101 + 1 bit (Color)
+        // Rook   xxxxx111 + 1 bit (Color)
+        // Queen   xxxx1001 + 1 bit (Color)
+        //
+        // Worst case:
+        // - 32 empty squares    32 bits
+        // - 30 pieces           150 bits
+        // - 2 kings             12 bits
+        // - castling rights     4 bits
+        // - ep square           7 bits
+        // - rule50              7 bits
+        // - game ply            16 bits
+        // - TOTAL               228 bits < 256 bits
 
         struct HuffmanedPiece
         {
@@ -5980,7 +5961,17 @@ namespace binpack
 
                 stream.write_n_bit(pos.rule50Counter(), 6);
 
-                stream.write_n_bit(pos.halfMove(), 8);
+                stream.write_n_bit(pos.fullMove(), 8);
+
+                // Write high bits of half move. This is a fix for the
+                // limited range of half move counter.
+                // This is backwards compatibile.
+                stream.write_n_bit(pos.fullMove() >> 8, 8);
+
+                // Write the highest bit of rule50 at the end. This is a backwards
+                // compatibile fix for rule50 having only 6 bits stored.
+                // This bit is just ignored by the old parsers.
+                stream.write_n_bit(pos.rule50Counter() >> 6, 1);
 
                 assert(stream.get_cursor() <= 256);
             }
@@ -6105,10 +6096,23 @@ namespace binpack
             }
 
             // Halfmove clock
-            pos.setRule50Counter(stream.read_n_bit(6));
+            std::uint8_t rule50 = stream.read_n_bit(6);
 
             // Fullmove number
-            pos.setHalfMove(stream.read_n_bit(8));
+            std::uint16_t fullmove = stream.read_n_bit(8);
+
+            // Fullmove number, high bits
+            // This was added as a fix for fullmove clock
+            // overflowing at 256. This change is backwards compatibile.
+            fullmove |= stream.read_n_bit(8) << 8;
+
+            // Read the highest bit of rule50. This was added as a fix for rule50
+            // counter having only 6 bits stored.
+            // In older entries this will just be a zero bit.
+            rule50 |= stream.read_n_bit(1) << 6;
+
+            pos.setFullMove(fullmove);
+            pos.setRule50Counter(rule50);
 
             assert(stream.get_cursor() <= 256);
 
diff --git a/src/learn/sfen_packer.cpp b/src/learn/sfen_packer.cpp
index 19c745ad..2de7efa4 100644
--- a/src/learn/sfen_packer.cpp
+++ b/src/learn/sfen_packer.cpp
@@ -113,43 +113,24 @@ namespace Learner {
   // Huffman coding
   // * is simplified from mini encoding to make conversion easier.
   //
-  // 1 box on the board (other than NO_PIECE) = 2 to 6 bits (+ 1-bit flag + 1-bit forward and backward)
-  // 1 piece of hand piece = 1-5bit (+ 1-bit flag + 1bit ahead and behind)
-  //
-  // empty xxxxx0 + 0 (none)
-  // step xxxx01 + 2 xxxx0 + 2
-  // incense xx0011 + 2 xx001 + 2
-  // Katsura xx1011 + 2 xx101 + 2
-  // silver xx0111 + 2 xx011 + 2
-  // Gold x01111 + 1 x0111 + 1 // Gold is valid and has no flags.
-  // corner 011111 + 2 01111 + 2
-  // Fly 111111 + 2 11111 + 2
-  //
-  // Assuming all pieces are on the board,
-  // Sky 81-40 pieces = 41 boxes = 41bit
-  // Walk 4bit*18 pieces = 72bit
-  // Incense 6bit*4 pieces = 24bit
-  // Katsura 6bit*4 pieces = 24bit
-  // Silver 6bit*4 pieces = 24bit
-  // Gold 6bit* 4 pieces = 24bit
-  // corner 8bit* 2 pieces = 16bit
-  // Fly 8bit* 2 pieces = 16bit
-  // -------
-  // 241bit + 1bit (turn) + 7bit × 2 (King's position after) = 256bit
-  //
-  // When the piece on the board moves to the hand piece, the piece on the board becomes empty, so the box on the board can be expressed with 1 bit,
-  // Since the hand piece can be expressed by 1 bit less than the piece on the board, the total number of bits does not change in the end.
-  // Therefore, in this expression, any aspect can be expressed by this bit number.
-  // It is a hand piece and no flag is required, but if you include this, the bit number of the piece on the board will be -1
-  // Since the total number of bits can be fixed, we will include this as well.
-
   // Huffman Encoding
   //
   // Empty  xxxxxxx0
-  // Pawn   xxxxx001 + 1 bit (Side to move)
-  // Knight xxxxx011 + 1 bit (Side to move)
-  // Bishop xxxxx101 + 1 bit (Side to move)
-  // Rook   xxxxx111 + 1 bit (Side to move)
+  // Pawn   xxxxx001 + 1 bit (Color)
+  // Knight xxxxx011 + 1 bit (Color)
+  // Bishop xxxxx101 + 1 bit (Color)
+  // Rook   xxxxx111 + 1 bit (Color)
+  // Queen   xxxx1001 + 1 bit (Color)
+  //
+  // Worst case:
+  // - 32 empty squares    32 bits
+  // - 30 pieces           150 bits
+  // - 2 kings             12 bits
+  // - castling rights     4 bits
+  // - ep square           7 bits
+  // - rule50              7 bits
+  // - game ply            16 bits
+  // - TOTAL               228 bits < 256 bits
 
   struct HuffmanedPiece
   {
@@ -212,7 +193,18 @@ namespace Learner {
 
     stream.write_n_bit(pos.state()->rule50, 6);
 
-    stream.write_n_bit(1 + (pos.game_ply()-(pos.side_to_move() == BLACK)) / 2, 8);
+    const int fm = 1 + (pos.game_ply()-(pos.side_to_move() == BLACK)) / 2;
+    stream.write_n_bit(fm, 8);
+
+    // Write high bits of half move. This is a fix for the
+    // limited range of half move counter.
+    // This is backwards compatibile.
+    stream.write_n_bit(fm >> 8, 8);
+
+    // Write the highest bit of rule50 at the end. This is a backwards
+    // compatibile fix for rule50 having only 6 bits stored.
+    // This bit is just ignored by the old parsers.
+    stream.write_n_bit(pos.state()->rule50 >> 6, 1);
 
     assert(stream.get_cursor() <= 256);
   }
@@ -355,10 +347,20 @@ namespace Learner {
     }
 
     // Halfmove clock
-    pos.st->rule50 = static_cast<Square>(stream.read_n_bit(6));
+    pos.st->rule50 = stream.read_n_bit(6);
 
     // Fullmove number
-    pos.gamePly = static_cast<Square>(stream.read_n_bit(8));
+    pos.gamePly = stream.read_n_bit(8);
+
+    // Read the highest bit of rule50. This was added as a fix for rule50
+    // counter having only 6 bits stored.
+    // In older entries this will just be a zero bit.
+    pos.gamePly |= stream.read_n_bit(8) << 8;
+
+    // Read the highest bit of rule50. This was added as a fix for rule50
+    // counter having only 6 bits stored.
+    // In older entries this will just be a zero bit.
+    pos.st->rule50 |= stream.read_n_bit(1) << 6;
 
     // Convert from fullmove starting from 1 to gamePly starting from 0,
     // handle also common incorrect FEN with fullmove = 0.

From 4a2bf16b3046d92522d52518a33985273d72cc22 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 12 Oct 2020 11:46:50 +0200
Subject: [PATCH 193/398] Add option "auto_lr_drop" that specifies the amount
 of positions from previous lr drop after which to reduce lr by newbob_decay.

---
 src/learn/learn.cpp | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 5a540d31..3648a40f 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -744,6 +744,8 @@ namespace Learner
 
             newbob_decay = 1.0;
             newbob_num_trials = 2;
+            auto_lr_drop = 0;
+            last_lr_drop = 0;
             best_loss = std::numeric_limits<double>::infinity();
             latest_loss_sum = 0.0;
             latest_loss_count = 0;
@@ -797,6 +799,8 @@ namespace Learner
         shared_timed_mutex nn_mutex;
         double newbob_decay;
         int newbob_num_trials;
+        uint64_t auto_lr_drop;
+        uint64_t last_lr_drop;
         double best_loss;
         double latest_loss_sum;
         uint64_t latest_loss_count;
@@ -1295,7 +1299,21 @@ namespace Learner
                 latest_loss_sum = 0.0;
                 latest_loss_count = 0;
                 cout << "loss: " << latest_loss;
-                if (latest_loss < best_loss)
+                auto tot = sr.total_done.load();
+                if (auto_lr_drop)
+                {
+                    cout << " < best (" << best_loss << "), accepted" << endl;
+                    best_loss = latest_loss;
+                    best_nn_directory = Path::Combine((std::string)Options["EvalSaveDir"], dir_name);
+                    trials = newbob_num_trials;
+
+                    if (tot >= last_lr_drop + auto_lr_drop)
+                    {
+                        last_lr_drop = tot;
+                        global_learning_rate *= newbob_decay;
+                    }
+                }
+                else if (latest_loss < best_loss)
                 {
                     cout << " < best (" << best_loss << "), accepted" << endl;
                     best_loss = latest_loss;
@@ -1647,6 +1665,7 @@ namespace Learner
         uint64_t nn_batch_size = 1000;
         double newbob_decay = 0.5;
         int newbob_num_trials = 4;
+        uint64_t auto_lr_drop = 0;
         string nn_options;
 
         uint64_t eval_save_interval = LEARN_EVAL_SAVE_INTERVAL;
@@ -1729,6 +1748,7 @@ namespace Learner
             else if (option == "newbob_decay") is >> newbob_decay;
             else if (option == "newbob_num_trials") is >> newbob_num_trials;
             else if (option == "nn_options") is >> nn_options;
+            else if (option == "auto_lr_drop") is >> auto_lr_drop;
 
             else if (option == "eval_save_interval") is >> eval_save_interval;
             else if (option == "loss_output_interval") is >> loss_output_interval;
@@ -1972,6 +1992,7 @@ namespace Learner
 
         learn_think.newbob_decay = newbob_decay;
         learn_think.newbob_num_trials = newbob_num_trials;
+        learn_think.auto_lr_drop = auto_lr_drop;
 
         learn_think.eval_save_interval = eval_save_interval;
         learn_think.loss_output_interval = loss_output_interval;

From 4a340ad3b28823ea26e502d4dc3a68b41a349d39 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 12 Oct 2020 11:48:57 +0200
Subject: [PATCH 194/398] Add docs for auto_lr_drop

---
 docs/learn.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/learn.md b/docs/learn.md
index 4c8c3fc1..3a580134 100644
--- a/docs/learn.md
+++ b/docs/learn.md
@@ -64,6 +64,8 @@ Currently the following options are available:
 
 `newbob_num_trials` - determines after how many subsequent rejected nets the training process will be terminated. Default: 4.
 
+`auto_lr_drop` - every time this many positions are processed the learning rate is multiplied by `newbob_decay`. In other words this value specifies for how many positions a single learning rate stage lasts. If 0 then doesn't have any effect. Default: 0.
+
 `nn_options` - if you're reading this you don't use it. It passes messages directly to the network evaluation. I don't know what it can do either.
 
 `eval_save_interval` - every `eval_save_interval` positions the network will be saved and either accepted or rejected (in which case an LR drop follows). Default: 100000000 (100M). (generally people use values in 10M-100M range)

From ba73f8ce0d545a0f627b5bc8ba274ae9c85918f3 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Wed, 14 Oct 2020 10:23:30 +0200
Subject: [PATCH 195/398] Update default net to nn-04cf2b4ed1da.nnue

Further tune the net parameters, now the last but one layer (32x32).
To limit the number of parameters optimized, the network layer was
decomposed using SVD, and the singular values were treated
as parameters and tuned.

Tuning branch: https://github.com/vondele/Stockfish/tree/svdTune
Tuner: https://github.com/vondele/nevergrad4sf

passed STC:
https://tests.stockfishchess.org/tests/view/5f83e82f8ea73fb8ddf83e4e
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 8488 W: 944 L: 795 D: 6749
Ptnml(0-2): 39, 609, 2811, 734, 51

passed LTC:
https://tests.stockfishchess.org/tests/view/5f83f4118ea73fb8ddf83e66
LLR: 2.94 (-2.94,2.94) {0.25,1.25}
Total: 169016 W: 8043 L: 7589 D: 153384
Ptnml(0-2): 133, 6623, 70538, 7085, 129

closes https://github.com/official-stockfish/Stockfish/pull/3181

Bench: 3945198
---
 src/evaluate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/evaluate.h b/src/evaluate.h
index 4b57a050..6a17f284 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -36,7 +36,7 @@ namespace Eval {
   // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work. Do not change the
   // name of the macro, as it is used in the Makefile.
-  #define EvalFileDefaultName   "nn-baeb9ef2d183.nnue"
+  #define EvalFileDefaultName   "nn-04cf2b4ed1da.nnue"
 
   namespace NNUE {
 

From 4a5cc1365f48f7fff08d3184cadac7a0a75dda6d Mon Sep 17 00:00:00 2001
From: FauziAkram <fauzi.dabat@hotmail.com>
Date: Tue, 6 Oct 2020 22:43:48 +0300
Subject: [PATCH 196/398] RookOnQueenFile Removal

Removing Rook On Queen File looks beneficial, and it might even bring some ELO.
I will try to reintroduce it with a different method later on.

Passed STC:
https://tests.stockfishchess.org/tests/view/5f7cea204389873867eb10cb
LLR: 2.94 (-2.94,2.94) {-1.25,0.25}
Total: 18624 W: 3800 L: 3568 D: 11256
Ptnml(0-2): 308, 2131, 4257, 2253, 363

Passed LTC:
https://tests.stockfishchess.org/tests/view/5f7d76a4e936c6892bf50598
LLR: 2.95 (-2.94,2.94) {-0.75,0.25}
Total: 117864 W: 15515 L: 15340 D: 87009
Ptnml(0-2): 926, 11127, 34671, 11262, 946

closes https://github.com/official-stockfish/Stockfish/pull/3176

Bench: 3756191
---
 src/evaluate.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 25e3bdc1..c68577a3 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -265,7 +265,6 @@ namespace {
   constexpr Score ReachableOutpost    = S( 31, 22);
   constexpr Score RestrictedPiece     = S(  7,  7);
   constexpr Score RookOnKingRing      = S( 16,  0);
-  constexpr Score RookOnQueenFile     = S(  6, 11);
   constexpr Score SliderOnQueen       = S( 60, 18);
   constexpr Score ThreatByKing        = S( 24, 89);
   constexpr Score ThreatByPawnPush    = S( 48, 39);
@@ -481,10 +480,6 @@ namespace {
 
         if (Pt == ROOK)
         {
-            // Bonus for rook on the same file as a queen
-            if (file_bb(s) & pos.pieces(QUEEN))
-                score += RookOnQueenFile;
-
             // Bonus for rook on an open or semi-open file
             if (pos.is_on_semiopen_file(Us, s))
                 score += RookOnFile[pos.is_on_semiopen_file(Them, s)];

From 288a604411fa72b06b30f16194cd03592b28f6f2 Mon Sep 17 00:00:00 2001
From: Unai Corzo <corzounai@gmail.com>
Date: Mon, 12 Oct 2020 09:03:49 +0200
Subject: [PATCH 197/398] Scale factor tweak

Add !pawnsOnBothFlanks heuristic to scale factor.

STC https://tests.stockfishchess.org/tests/view/5f8080575b3847b5d41f9134
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 250960 W: 49779 L: 49168 D: 152013
Ptnml(0-2): 4224, 28822, 58802, 29383, 4249

LTC https://tests.stockfishchess.org/tests/view/5f832f498ea73fb8ddf83ddb
LLR: 2.95 (-2.94,2.94) {0.25,1.25}
Total: 88584 W: 11827 L: 11388 D: 65369
Ptnml(0-2): 585, 8079, 26578, 8412, 638

closes https://github.com/official-stockfish/Stockfish/pull/3179

bench: 3834252
---
 src/evaluate.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index c68577a3..425ba6f8 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -905,7 +905,9 @@ namespace {
             sf = 37 + 3 * (pos.count<QUEEN>(WHITE) == 1 ? pos.count<BISHOP>(BLACK) + pos.count<KNIGHT>(BLACK)
                                                         : pos.count<BISHOP>(WHITE) + pos.count<KNIGHT>(WHITE));
         else
-            sf = std::min(sf, 36 + 7 * pos.count<PAWN>(strongSide));
+            sf = std::min(sf, 36 + 7 * pos.count<PAWN>(strongSide)) - 4 * !pawnsOnBothFlanks;
+      
+        sf -= 4 * !pawnsOnBothFlanks;
     }
 
     // Interpolate between the middlegame and (scaled by 'sf') endgame score

From 0494adeb2c9dba82f3ffd78823822aab4d450764 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 19:06:47 +0200
Subject: [PATCH 198/398] Move nnue evaluation stuff from evaluate.h to
 nnue/evaluate_nnue.h

---
 src/evaluate.cpp           | 90 +++-----------------------------------
 src/evaluate.h             | 19 --------
 src/learn/gensfen.cpp      |  1 +
 src/learn/learn.cpp        |  1 +
 src/learn/multi_think.cpp  |  2 +
 src/main.cpp               |  2 +
 src/nnue/evaluate_nnue.cpp | 85 ++++++++++++++++++++++++++++++++++-
 src/nnue/evaluate_nnue.h   | 15 +++++++
 src/nnue/nnue_common.h     |  2 +
 src/position.cpp           | 10 +++--
 src/search.cpp             |  2 +
 src/uci.cpp                |  1 +
 src/ucioption.cpp          |  1 +
 13 files changed, 122 insertions(+), 109 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index b3894fe8..0326a2f8 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -27,6 +27,8 @@
 #include <streambuf>
 #include <vector>
 
+#include "nnue/evaluate_nnue.h"
+
 #include "bitboard.h"
 #include "evaluate.h"
 #include "material.h"
@@ -37,88 +39,6 @@
 #include "incbin/incbin.h"
 
 using namespace std;
-using namespace Eval::NNUE;
-
-namespace Eval {
-
-  UseNNUEMode useNNUE;
-  string eval_file_loaded = "None";
-
-  static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
-  {
-    if (mode == "false")
-      return UseNNUEMode::False;
-    else if (mode == "true")
-      return UseNNUEMode::True;
-    else if (mode == "pure")
-      return UseNNUEMode::Pure;
-
-    return UseNNUEMode::False;
-  }
-
-  void NNUE::init() {
-
-    useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
-    if (useNNUE == UseNNUEMode::False)
-        return;
-
-    string eval_file = string(Options["EvalFile"]);
-
-    #if defined(DEFAULT_NNUE_DIRECTORY)
-    #define stringify2(x) #x
-    #define stringify(x) stringify2(x)
-    vector<string> dirs = { "" , CommandLine::binaryDirectory , stringify(DEFAULT_NNUE_DIRECTORY) };
-    #else
-    vector<string> dirs = { "" , CommandLine::binaryDirectory };
-    #endif
-
-    for (string directory : dirs)
-        if (eval_file_loaded != eval_file)
-        {
-            ifstream stream(directory + eval_file, ios::binary);
-            if (load_eval(eval_file, stream))
-            {
-                sync_cout << "info string Loaded eval file " << directory + eval_file << sync_endl;
-                eval_file_loaded = eval_file;
-            }
-            else
-            {
-                sync_cout << "info string ERROR: failed to load eval file " << directory + eval_file << sync_endl;
-            }
-        }
-  }
-
-  /// NNUE::verify() verifies that the last net used was loaded successfully
-  void NNUE::verify() {
-
-    string eval_file = string(Options["EvalFile"]);
-
-    if (useNNUE != UseNNUEMode::False && eval_file_loaded != eval_file)
-    {
-        UCI::OptionsMap defaults;
-        UCI::init(defaults);
-
-        string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
-        string msg2 = "The option is set to true, but the network file " + eval_file + " was not loaded successfully.";
-        string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
-        string msg4 = "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/" + string(defaults["EvalFile"]);
-        string msg5 = "The engine will be terminated now.";
-
-        sync_cout << "info string ERROR: " << msg1 << sync_endl;
-        sync_cout << "info string ERROR: " << msg2 << sync_endl;
-        sync_cout << "info string ERROR: " << msg3 << sync_endl;
-        sync_cout << "info string ERROR: " << msg4 << sync_endl;
-        sync_cout << "info string ERROR: " << msg5 << sync_endl;
-
-        exit(EXIT_FAILURE);
-    }
-
-    if (useNNUE != UseNNUEMode::False)
-        sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
-    else
-        sync_cout << "info string classical evaluation enabled" << sync_endl;
-  }
-}
 
 namespace Trace {
 
@@ -994,7 +914,7 @@ Value Eval::evaluate(const Position& pos) {
 
   Value v;
 
-  if (Eval::useNNUE == UseNNUEMode::Pure) {
+  if (NNUE::useNNUE == NNUE::UseNNUEMode::Pure) {
       v = NNUE::evaluate(pos);
 
       // Guarantee evaluation does not hit the tablebase range
@@ -1002,7 +922,7 @@ Value Eval::evaluate(const Position& pos) {
 
       return v;
   }
-  else if (Eval::useNNUE == UseNNUEMode::False)
+  else if (NNUE::useNNUE == NNUE::UseNNUEMode::False)
       v = Evaluation<NO_TRACE>(pos).value();
   else
   {
@@ -1085,7 +1005,7 @@ std::string Eval::trace(const Position& pos) {
 
   ss << "\nClassical evaluation: " << to_cp(v) << " (white side)\n";
 
-  if (useNNUE != UseNNUEMode::False)
+  if (NNUE::useNNUE != NNUE::UseNNUEMode::False)
   {
       v = NNUE::evaluate(pos);
       v = pos.side_to_move() == WHITE ? v : -v;
diff --git a/src/evaluate.h b/src/evaluate.h
index bce5488d..fc626698 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -26,33 +26,14 @@
 class Position;
 
 namespace Eval {
-  enum struct UseNNUEMode
-  {
-    False,
-    True,
-    Pure
-  };
-
   std::string trace(const Position& pos);
   Value evaluate(const Position& pos);
 
-  extern UseNNUEMode useNNUE;
-  extern std::string eval_file_loaded;
-
   // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work. Do not change the
   // name of the macro, as it is used in the Makefile.
   #define EvalFileDefaultName   "nn-98a7585c85e9.nnue"
 
-  namespace NNUE {
-
-    Value evaluate(const Position& pos);
-    bool load_eval(std::string name, std::istream& stream);
-    void init();
-    void verify();
-
-  } // namespace NNUE
-
 } // namespace Eval
 
 #endif // #ifndef EVALUATE_H_INCLUDED
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 5f7541f5..7c5b20be 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -12,6 +12,7 @@
 
 #include "extra/nnue_data_binpack_format.h"
 
+#include "nnue/evaluate_nnue.h"
 #include "nnue/evaluate_nnue_learner.h"
 
 #include "syzygy/tbprobe.h"
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 3648a40f..b2ee5aa1 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -32,6 +32,7 @@
 
 #include "extra/nnue_data_binpack_format.h"
 
+#include "nnue/evaluate_nnue.h"
 #include "nnue/evaluate_nnue_learner.h"
 
 #include "syzygy/tbprobe.h"
diff --git a/src/learn/multi_think.cpp b/src/learn/multi_think.cpp
index 80bc72b5..daed3e96 100644
--- a/src/learn/multi_think.cpp
+++ b/src/learn/multi_think.cpp
@@ -1,5 +1,7 @@
 ﻿#include "multi_think.h"
 
+#include "nnue/evaluate_nnue.h"
+
 #include "tt.h"
 #include "uci.h"
 #include "types.h"
diff --git a/src/main.cpp b/src/main.cpp
index e6dff918..1a13dc62 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -18,6 +18,8 @@
 
 #include <iostream>
 
+#include "nnue/evaluate_nnue.h"
+
 #include "bitboard.h"
 #include "endgame.h"
 #include "position.h"
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index 28c86feb..f7f9adcc 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -19,12 +19,14 @@
 // Code for calculating NNUE evaluation function
 
 #include <iostream>
+#include <string>
+#include <fstream>
 #include <set>
 
-#include "../evaluate.h"
 #include "../position.h"
 #include "../misc.h"
 #include "../uci.h"
+#include "../types.h"
 
 #include "evaluate_nnue.h"
 
@@ -69,6 +71,9 @@ namespace Eval::NNUE {
       ",Network=" + Network::GetStructureString();
   }
 
+  UseNNUEMode useNNUE;
+  std::string eval_file_loaded = "None";
+
   namespace Detail {
 
   // Initialize the evaluation function parameters
@@ -190,4 +195,82 @@ namespace Eval::NNUE {
     return ReadParameters(stream);
   }
 
+  static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
+  {
+    if (mode == "false")
+      return UseNNUEMode::False;
+    else if (mode == "true")
+      return UseNNUEMode::True;
+    else if (mode == "pure")
+      return UseNNUEMode::Pure;
+
+    return UseNNUEMode::False;
+  }
+
+  void init() {
+
+    useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
+    if (useNNUE == UseNNUEMode::False)
+        return;
+
+    std::string eval_file = std::string(Options["EvalFile"]);
+
+    #if defined(DEFAULT_NNUE_DIRECTORY)
+    #define stringify2(x) #x
+    #define stringify(x) stringify2(x)
+    std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory , stringify(DEFAULT_NNUE_DIRECTORY) };
+    #else
+    std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory };
+    #endif
+
+    for (std::string directory : dirs)
+        if (eval_file_loaded != eval_file)
+        {
+            std::ifstream stream(directory + eval_file, std::ios::binary);
+            if (load_eval(eval_file, stream))
+            {
+                sync_cout << "info string Loaded eval file " << directory + eval_file << sync_endl;
+                eval_file_loaded = eval_file;
+            }
+            else
+            {
+                sync_cout << "info string ERROR: failed to load eval file " << directory + eval_file << sync_endl;
+            }
+        }
+
+    #undef stringify2
+    #undef stringify
+  }
+
+  /// NNUE::verify() verifies that the last net used was loaded successfully
+  void verify() {
+
+    std::string eval_file = std::string(Options["EvalFile"]);
+
+    if (useNNUE != UseNNUEMode::False && eval_file_loaded != eval_file)
+    {
+        UCI::OptionsMap defaults;
+        UCI::init(defaults);
+
+        std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
+        std::string msg2 = "The option is set to true, but the network file " + eval_file + " was not loaded successfully.";
+        std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
+        std::string msg4 = "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/" + std::string(defaults["EvalFile"]);
+        std::string msg5 = "The engine will be terminated now.";
+
+        sync_cout << "info string ERROR: " << msg1 << sync_endl;
+        sync_cout << "info string ERROR: " << msg2 << sync_endl;
+        sync_cout << "info string ERROR: " << msg3 << sync_endl;
+        sync_cout << "info string ERROR: " << msg4 << sync_endl;
+        sync_cout << "info string ERROR: " << msg5 << sync_endl;
+
+        std::exit(EXIT_FAILURE);
+    }
+
+    if (useNNUE != UseNNUEMode::False)
+        sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
+    else
+        sync_cout << "info string classical evaluation enabled" << sync_endl;
+  }
+
 } // namespace Eval::NNUE
diff --git a/src/nnue/evaluate_nnue.h b/src/nnue/evaluate_nnue.h
index 68153cac..dcfa071d 100644
--- a/src/nnue/evaluate_nnue.h
+++ b/src/nnue/evaluate_nnue.h
@@ -27,6 +27,13 @@
 
 namespace Eval::NNUE {
 
+  enum struct UseNNUEMode
+  {
+    False,
+    True,
+    Pure
+  };
+
   // Hash value of evaluation function structure
   constexpr std::uint32_t kHashValue =
       FeatureTransformer::GetHashValue() ^ Network::GetHashValue();
@@ -66,6 +73,9 @@ namespace Eval::NNUE {
   // Saved evaluation function file name
   extern std::string savedfileName;
 
+  extern UseNNUEMode useNNUE;
+  extern std::string eval_file_loaded;
+
   // Get a string that represents the structure of the evaluation function
   std::string GetArchitectureString();
 
@@ -83,6 +93,11 @@ namespace Eval::NNUE {
   // write evaluation function parameters
   bool WriteParameters(std::ostream& stream);
 
+  Value evaluate(const Position& pos);
+  bool load_eval(std::string name, std::istream& stream);
+  void init();
+  void verify();
+
 }  // namespace Eval::NNUE
 
 #endif // #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index 319f005b..9975134c 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -24,6 +24,8 @@
 #include <cstring>
 #include <iostream>
 
+#include "../types.h"
+
 #if defined(USE_AVX2)
 #include <immintrin.h>
 
diff --git a/src/position.cpp b/src/position.cpp
index 4e47f772..06a4e0b7 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -23,6 +23,8 @@
 #include <iomanip>
 #include <sstream>
 
+#include "nnue/evaluate_nnue.h"
+
 #include "bitboard.h"
 #include "misc.h"
 #include "movegen.h"
@@ -757,7 +759,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
       else
           st->nonPawnMaterial[them] -= PieceValue[MG][captured];
 
-      if (Eval::useNNUE != Eval::UseNNUEMode::False)
+      if (Eval::NNUE::useNNUE != Eval::NNUE::UseNNUEMode::False)
       {
           dp.dirty_num = 2;  // 1 piece moved, 1 piece captured
           dp.piece[1] = captured;
@@ -801,7 +803,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   // Move the piece. The tricky Chess960 castling is handled earlier
   if (type_of(m) != CASTLING)
   {
-      if (Eval::useNNUE != Eval::UseNNUEMode::False)
+      if (Eval::NNUE::useNNUE != Eval::NNUE::UseNNUEMode::False)
       {
           dp.piece[0] = pc;
           dp.from[0] = from;
@@ -832,7 +834,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
           remove_piece(to);
           put_piece(promotion, to);
 
-          if (Eval::useNNUE != Eval::UseNNUEMode::False)
+          if (Eval::NNUE::useNNUE != Eval::NNUE::UseNNUEMode::False)
           {
               // Promoting pawn to SQ_NONE, promoted piece from SQ_NONE
               dp.to[0] = SQ_NONE;
@@ -970,7 +972,7 @@ void Position::do_castling(Color us, Square from, Square& to, Square& rfrom, Squ
   rto = relative_square(us, kingSide ? SQ_F1 : SQ_D1);
   to = relative_square(us, kingSide ? SQ_G1 : SQ_C1);
 
-  if (Do && Eval::useNNUE != Eval::UseNNUEMode::False)
+  if (Do && Eval::NNUE::useNNUE != Eval::NNUE::UseNNUEMode::False)
   {
       auto& dp = st->dirtyPiece;
       dp.piece[0] = make_piece(us, KING);
diff --git a/src/search.cpp b/src/search.cpp
index 1623ff06..26a675d7 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -23,6 +23,8 @@
 #include <iostream>
 #include <sstream>
 
+#include "nnue/evaluate_nnue.h"
+
 #include "evaluate.h"
 #include "misc.h"
 #include "movegen.h"
diff --git a/src/uci.cpp b/src/uci.cpp
index 166e437c..73ff0256 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -22,6 +22,7 @@
 #include <sstream>
 #include <string>
 
+#include "nnue/evaluate_nnue.h"
 #include "evaluate.h"
 #include "movegen.h"
 #include "nnue/nnue_test_command.h"
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 099ca2ae..bdb1c6b1 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -21,6 +21,7 @@
 #include <ostream>
 #include <sstream>
 
+#include "nnue/evaluate_nnue.h"
 #include "evaluate.h"
 #include "misc.h"
 #include "search.h"

From 14f83ad7b91ab5f62f269d6317436c08f658ec07 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 19:24:41 +0200
Subject: [PATCH 199/398] Move public search/qsearch interface from namespace
 Learner to namespace Search

---
 src/learn/gensfen.cpp |  4 ++--
 src/learn/learn.cpp   |  6 +++---
 src/search.cpp        |  9 ++-------
 src/search.h          | 11 ++++-------
 src/uci.cpp           |  4 ++--
 5 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 7c5b20be..7b135b81 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -673,7 +673,7 @@ namespace Learner
             }
             else
             {
-                Learner::search(pos, random_multi_pv_depth, random_multi_pv);
+                Search::search(pos, random_multi_pv_depth, random_multi_pv);
 
                 // Select one from the top N hands of root Moves
                 auto& rm = pos.this_thread()->rootMoves;
@@ -790,7 +790,7 @@ namespace Learner
                 const int depth = search_depth_min + (int)prng.rand(search_depth_max - search_depth_min + 1);
 
                 // Starting search calls init_for_search
-                auto [search_value, search_pv] = search(pos, depth, 1, nodes);
+                auto [search_value, search_pv] = Search::search(pos, depth, 1, nodes);
 
                 // This has to be performed after search because it needs to know
                 // rootMoves which are filled in init_for_search.
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index b2ee5aa1..452bd15f 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -824,7 +824,7 @@ namespace Learner
         // The value of evaluate() may be used, but when calculating loss, learn_cross_entropy and
         // Use qsearch() because it is difficult to compare the values.
         // EvalHash has been disabled in advance. (If not, the same value will be returned every time)
-        const auto [_, pv] = qsearch(task_pos);
+        const auto [_, pv] = Search::qsearch(task_pos);
 
         const auto rootColor = task_pos.side_to_move();
 
@@ -962,7 +962,7 @@ namespace Learner
 
                 // Determine if the teacher's move and the score of the shallow search match
                 {
-                    const auto [value, pv] = search(task_pos, 1);
+                    const auto [value, pv] = Search::search(task_pos, 1);
                     if ((uint16_t)pv[0] == ps.move)
                         move_accord_count.fetch_add(1, std::memory_order_relaxed);
                 }
@@ -1186,7 +1186,7 @@ namespace Learner
 				goto RETRY_READ;
 
             // Evaluation value of shallow search (qsearch)
-            const auto [_, pv] = qsearch(pos);
+            const auto [_, pv] = Search::qsearch(pos);
 
             // Evaluation value of deep search
             const auto deep_value = (Value)ps.score;
diff --git a/src/search.cpp b/src/search.cpp
index 26a675d7..79848812 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1968,9 +1968,7 @@ void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
 }
 
 // --- expose the functions such as fixed depth search used for learning to the outside
-
-
-namespace Learner
+namespace Search
 {
   // For learning, prepare a stub that can call search,qsearch() from one thread.
   // From now on, it is better to have a Searcher and prepare a substitution table for each thread like Apery.
@@ -1978,7 +1976,7 @@ namespace Learner
 
   // Initialization for learning.
   // Called from Learner::search(),Learner::qsearch().
-  void init_for_search(Position& pos, Stack* ss)
+  static void init_for_search(Position& pos, Stack* ss)
   {
 
     // RootNode requires ss->ply == 0.
@@ -2046,9 +2044,6 @@ namespace Learner
     }
   }
 
-  // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
-  typedef std::pair<Value, std::vector<Move> > ValueAndPV;
-
   // Stationary search.
   //
   // Precondition) Search thread is set by pos.set_this_thread(Threads[thread_id]).
diff --git a/src/search.h b/src/search.h
index ab832ee2..13123323 100644
--- a/src/search.h
+++ b/src/search.h
@@ -110,15 +110,12 @@ extern LimitsType Limits;
 void init();
 void clear();
 
-} // namespace Search
+// A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
+using ValueAndPV = std::pair<Value, std::vector<Move>>;
 
-namespace Learner {
+ValueAndPV qsearch(Position& pos);
+ValueAndPV search(Position& pos, int depth_, size_t multiPV = 1, uint64_t nodesLimit = 0);
 
-  // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
-  using ValueAndPV = std::pair<Value, std::vector<Move>>;
-
-  ValueAndPV qsearch(Position& pos);
-  ValueAndPV search(Position& pos, int depth_, size_t multiPV = 1, uint64_t nodesLimit = 0);
 }
 
 #endif // #ifndef SEARCH_H_INCLUDED
diff --git a/src/uci.cpp b/src/uci.cpp
index 73ff0256..ff735b2e 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -254,7 +254,7 @@ double UCI::win_rate_model_double(double v, int ply) {
 void qsearch_cmd(Position& pos)
 {
   cout << "qsearch : ";
-  auto pv = Learner::qsearch(pos);
+  auto pv = Search::qsearch(pos);
   cout << "Value = " << pv.first << " , " << UCI::value(pv.first) << " , PV = ";
   for (auto m : pv.second)
     cout << UCI::move(m, false) << " ";
@@ -275,7 +275,7 @@ void search_cmd(Position& pos, istringstream& is)
   }
 
   cout << "search depth = " << depth << " , multi_pv = " << multi_pv << " : ";
-  auto pv = Learner::search(pos, depth, multi_pv);
+  auto pv = Search::search(pos, depth, multi_pv);
   cout << "Value = " << pv.first << " , " << UCI::value(pv.first) << " , PV = ";
   for (auto m : pv.second)
     cout << UCI::move(m, false) << " ";

From 880d23af1c551e9122e95cd52c9aa155bfe11a38 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 19:44:15 +0200
Subject: [PATCH 200/398] Move sfen input/output streams to sfen_stream.h

---
 src/learn/gensfen.cpp   | 100 +------------------
 src/learn/learn.cpp     | 112 +--------------------
 src/learn/sfen_stream.h | 213 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 217 insertions(+), 208 deletions(-)
 create mode 100644 src/learn/sfen_stream.h

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 7b135b81..4a6f26dc 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -2,6 +2,7 @@
 
 #include "packed_sfen.h"
 #include "multi_think.h"
+#include "sfen_stream.h"
 #include "../syzygy/tbprobe.h"
 
 #include "misc.h"
@@ -38,107 +39,12 @@ using namespace std;
 
 namespace Learner
 {
-    enum struct SfenOutputType
-    {
-        Bin,
-        Binpack
-    };
-
     static bool write_out_draw_game_in_training_data_generation = true;
     static bool detect_draw_by_consecutive_low_score = true;
     static bool detect_draw_by_insufficient_mating_material = true;
 
     static SfenOutputType sfen_output_type = SfenOutputType::Bin;
 
-    static bool ends_with(const std::string& lhs, const std::string& end)
-    {
-        if (end.size() > lhs.size()) return false;
-
-        return std::equal(end.rbegin(), end.rend(), lhs.rbegin());
-    }
-
-    static std::string filename_with_extension(const std::string& filename, const std::string& ext)
-    {
-        if (ends_with(filename, ext))
-        {
-            return filename;
-        }
-        else
-        {
-            return filename + "." + ext;
-        }
-    }
-
-    struct BasicSfenOutputStream
-    {
-        virtual void write(const PSVector& sfens) = 0;
-        virtual ~BasicSfenOutputStream() {}
-    };
-
-    struct BinSfenOutputStream : BasicSfenOutputStream
-    {
-        static constexpr auto openmode = ios::out | ios::binary | ios::app;
-        static inline const std::string extension = "bin";
-
-        BinSfenOutputStream(std::string filename) :
-            m_stream(filename_with_extension(filename, extension), openmode)
-        {
-        }
-
-        void write(const PSVector& sfens) override
-        {
-            m_stream.write(reinterpret_cast<const char*>(sfens.data()), sizeof(PackedSfenValue) * sfens.size());
-        }
-
-        ~BinSfenOutputStream() override {}
-
-    private:
-        fstream m_stream;
-    };
-
-    struct BinpackSfenOutputStream : BasicSfenOutputStream
-    {
-        static constexpr auto openmode = ios::out | ios::binary | ios::app;
-        static inline const std::string extension = "binpack";
-
-        BinpackSfenOutputStream(std::string filename) :
-            m_stream(filename_with_extension(filename, extension), openmode)
-        {
-        }
-
-        void write(const PSVector& sfens) override
-        {
-            static_assert(sizeof(binpack::nodchip::PackedSfenValue) == sizeof(PackedSfenValue));
-
-            for(auto& sfen : sfens)
-            {
-                // The library uses a type that's different but layout-compatibile.
-                binpack::nodchip::PackedSfenValue e;
-                std::memcpy(&e, &sfen, sizeof(binpack::nodchip::PackedSfenValue));
-                m_stream.addTrainingDataEntry(binpack::packedSfenValueToTrainingDataEntry(e));
-            }
-        }
-
-        ~BinpackSfenOutputStream() override {}
-
-    private:
-        binpack::CompressedTrainingDataEntryWriter m_stream;
-    };
-
-    static std::unique_ptr<BasicSfenOutputStream> create_new_sfen_output(const std::string& filename)
-    {
-        switch(sfen_output_type)
-        {
-            case SfenOutputType::Bin:
-                return std::make_unique<BinSfenOutputStream>(filename);
-            case SfenOutputType::Binpack:
-                return std::make_unique<BinpackSfenOutputStream>(filename);
-        }
-
-        assert(false);
-        return nullptr;
-    }
-
     // Helper class for exporting Sfen
     struct SfenWriter
     {
@@ -155,7 +61,7 @@ namespace Learner
             sfen_buffers_pool.reserve((size_t)thread_num * 10);
             sfen_buffers.resize(thread_num);
 
-            output_file_stream = create_new_sfen_output(filename_);
+            output_file_stream = create_new_sfen_output(filename_, sfen_output_type);
             filename = filename_;
 
             finished = false;
@@ -283,7 +189,7 @@ namespace Learner
                             // Add ios::app in consideration of overwriting.
                             // (Depending on the operation, it may not be necessary.)
                             string new_filename = filename + "_" + std::to_string(n);
-                            output_file_stream = create_new_sfen_output(new_filename);
+                            output_file_stream = create_new_sfen_output(new_filename, sfen_output_type);
                             cout << endl << "output sfen file = " << new_filename << endl;
                         }
 
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 452bd15f..6c865d98 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -21,6 +21,7 @@
 
 #include "convert.h"
 #include "multi_think.h"
+#include "sfen_stream.h"
 
 #include "misc.h"
 #include "position.h"
@@ -30,8 +31,6 @@
 #include "search.h"
 #include "timeman.h"
 
-#include "extra/nnue_data_binpack_format.h"
-
 #include "nnue/evaluate_nnue.h"
 #include "nnue/evaluate_nnue_learner.h"
 
@@ -286,115 +285,6 @@ namespace Learner
         return calc_grad((Value)psv.score, shallow, psv);
     }
 
-    struct BasicSfenInputStream
-    {
-        virtual std::optional<PackedSfenValue> next() = 0;
-        virtual bool eof() const = 0;
-        virtual ~BasicSfenInputStream() {}
-    };
-
-    struct BinSfenInputStream : BasicSfenInputStream
-    {
-        static constexpr auto openmode = ios::in | ios::binary;
-        static inline const std::string extension = "bin";
-
-        BinSfenInputStream(std::string filename) :
-            m_stream(filename, openmode),
-            m_eof(!m_stream)
-        {
-        }
-
-        std::optional<PackedSfenValue> next() override
-        {
-            PackedSfenValue e;
-            if(m_stream.read(reinterpret_cast<char*>(&e), sizeof(PackedSfenValue)))
-            {
-                return e;
-            }
-            else
-            {
-                m_eof = true;
-                return std::nullopt;
-            }
-        }
-
-        bool eof() const override
-        {
-            return m_eof;
-        }
-
-        ~BinSfenInputStream() override {}
-
-    private:
-        fstream m_stream;
-        bool m_eof;
-    };
-
-    struct BinpackSfenInputStream : BasicSfenInputStream
-    {
-        static constexpr auto openmode = ios::in | ios::binary;
-        static inline const std::string extension = "binpack";
-
-        BinpackSfenInputStream(std::string filename) :
-            m_stream(filename, openmode),
-            m_eof(!m_stream.hasNext())
-        {
-        }
-
-        std::optional<PackedSfenValue> next() override
-        {
-            static_assert(sizeof(binpack::nodchip::PackedSfenValue) == sizeof(PackedSfenValue));
-
-            if (!m_stream.hasNext())
-            {
-                m_eof = true;
-                return std::nullopt;
-            }
-
-            auto training_data_entry = m_stream.next();
-            auto v = binpack::trainingDataEntryToPackedSfenValue(training_data_entry);
-            PackedSfenValue psv;
-            // same layout, different types. One is from generic library.
-            std::memcpy(&psv, &v, sizeof(PackedSfenValue));
-
-            return psv;
-        }
-
-        bool eof() const override
-        {
-            return m_eof;
-        }
-
-        ~BinpackSfenInputStream() override {}
-
-    private:
-        binpack::CompressedTrainingDataEntryReader m_stream;
-        bool m_eof;
-    };
-
-    static bool ends_with(const std::string& lhs, const std::string& end)
-    {
-        if (end.size() > lhs.size()) return false;
-
-        return std::equal(end.rbegin(), end.rend(), lhs.rbegin());
-    }
-
-    static bool has_extension(const std::string& filename, const std::string& extension)
-    {
-        return ends_with(filename, "." + extension);
-    }
-
-    static std::unique_ptr<BasicSfenInputStream> open_sfen_input_file(const std::string& filename)
-    {
-        if (has_extension(filename, BinSfenInputStream::extension))
-            return std::make_unique<BinSfenInputStream>(filename);
-        else if (has_extension(filename, BinpackSfenInputStream::extension))
-            return std::make_unique<BinpackSfenInputStream>(filename);
-
-        assert(false);
-        return nullptr;
-    }
-
     // Sfen reader
     struct SfenReader
     {
diff --git a/src/learn/sfen_stream.h b/src/learn/sfen_stream.h
new file mode 100644
index 00000000..4d44901b
--- /dev/null
+++ b/src/learn/sfen_stream.h
@@ -0,0 +1,213 @@
+#ifndef _SFEN_STREAM_H_
+#define _SFEN_STREAM_H_
+
+#include "packed_sfen.h"
+
+#include "extra/nnue_data_binpack_format.h"
+
+#include <optional>
+#include <fstream>
+#include <string>
+#include <memory>
+
+namespace Learner {
+
+    enum struct SfenOutputType
+    {
+        Bin,
+        Binpack
+    };
+
+    static bool ends_with(const std::string& lhs, const std::string& end)
+    {
+        if (end.size() > lhs.size()) return false;
+
+        return std::equal(end.rbegin(), end.rend(), lhs.rbegin());
+    }
+
+    static bool has_extension(const std::string& filename, const std::string& extension)
+    {
+        return ends_with(filename, "." + extension);
+    }
+
+    static std::string filename_with_extension(const std::string& filename, const std::string& ext)
+    {
+        if (ends_with(filename, ext))
+        {
+            return filename;
+        }
+        else
+        {
+            return filename + "." + ext;
+        }
+    }
+
+    struct BasicSfenInputStream
+    {
+        virtual std::optional<PackedSfenValue> next() = 0;
+        virtual bool eof() const = 0;
+        virtual ~BasicSfenInputStream() {}
+    };
+
+    struct BinSfenInputStream : BasicSfenInputStream
+    {
+        static constexpr auto openmode = std::ios::in | std::ios::binary;
+        static inline const std::string extension = "bin";
+
+        BinSfenInputStream(std::string filename) :
+            m_stream(filename, openmode),
+            m_eof(!m_stream)
+        {
+        }
+
+        std::optional<PackedSfenValue> next() override
+        {
+            PackedSfenValue e;
+            if(m_stream.read(reinterpret_cast<char*>(&e), sizeof(PackedSfenValue)))
+            {
+                return e;
+            }
+            else
+            {
+                m_eof = true;
+                return std::nullopt;
+            }
+        }
+
+        bool eof() const override
+        {
+            return m_eof;
+        }
+
+        ~BinSfenInputStream() override {}
+
+    private:
+        std::fstream m_stream;
+        bool m_eof;
+    };
+
+    struct BinpackSfenInputStream : BasicSfenInputStream
+    {
+        static constexpr auto openmode = std::ios::in | std::ios::binary;
+        static inline const std::string extension = "binpack";
+
+        BinpackSfenInputStream(std::string filename) :
+            m_stream(filename, openmode),
+            m_eof(!m_stream.hasNext())
+        {
+        }
+
+        std::optional<PackedSfenValue> next() override
+        {
+            static_assert(sizeof(binpack::nodchip::PackedSfenValue) == sizeof(PackedSfenValue));
+
+            if (!m_stream.hasNext())
+            {
+                m_eof = true;
+                return std::nullopt;
+            }
+
+            auto training_data_entry = m_stream.next();
+            auto v = binpack::trainingDataEntryToPackedSfenValue(training_data_entry);
+            PackedSfenValue psv;
+            // same layout, different types. One is from generic library.
+            std::memcpy(&psv, &v, sizeof(PackedSfenValue));
+
+            return psv;
+        }
+
+        bool eof() const override
+        {
+            return m_eof;
+        }
+
+        ~BinpackSfenInputStream() override {}
+
+    private:
+        binpack::CompressedTrainingDataEntryReader m_stream;
+        bool m_eof;
+    };
+
+    struct BasicSfenOutputStream
+    {
+        virtual void write(const PSVector& sfens) = 0;
+        virtual ~BasicSfenOutputStream() {}
+    };
+
+    struct BinSfenOutputStream : BasicSfenOutputStream
+    {
+        static constexpr auto openmode = std::ios::out | std::ios::binary | std::ios::app;
+        static inline const std::string extension = "bin";
+
+        BinSfenOutputStream(std::string filename) :
+            m_stream(filename_with_extension(filename, extension), openmode)
+        {
+        }
+
+        void write(const PSVector& sfens) override
+        {
+            m_stream.write(reinterpret_cast<const char*>(sfens.data()), sizeof(PackedSfenValue) * sfens.size());
+        }
+
+        ~BinSfenOutputStream() override {}
+
+    private:
+        std::fstream m_stream;
+    };
+
+    struct BinpackSfenOutputStream : BasicSfenOutputStream
+    {
+        static constexpr auto openmode = std::ios::out | std::ios::binary | std::ios::app;
+        static inline const std::string extension = "binpack";
+
+        BinpackSfenOutputStream(std::string filename) :
+            m_stream(filename_with_extension(filename, extension), openmode)
+        {
+        }
+
+        void write(const PSVector& sfens) override
+        {
+            static_assert(sizeof(binpack::nodchip::PackedSfenValue) == sizeof(PackedSfenValue));
+
+            for(auto& sfen : sfens)
+            {
+                // The library uses a type that's different but layout-compatibile.
+                binpack::nodchip::PackedSfenValue e;
+                std::memcpy(&e, &sfen, sizeof(binpack::nodchip::PackedSfenValue));
+                m_stream.addTrainingDataEntry(binpack::packedSfenValueToTrainingDataEntry(e));
+            }
+        }
+
+        ~BinpackSfenOutputStream() override {}
+
+    private:
+        binpack::CompressedTrainingDataEntryWriter m_stream;
+    };
+
+    inline std::unique_ptr<BasicSfenInputStream> open_sfen_input_file(const std::string& filename)
+    {
+        if (has_extension(filename, BinSfenInputStream::extension))
+            return std::make_unique<BinSfenInputStream>(filename);
+        else if (has_extension(filename, BinpackSfenInputStream::extension))
+            return std::make_unique<BinpackSfenInputStream>(filename);
+
+        assert(false);
+        return nullptr;
+    }
+
+    inline std::unique_ptr<BasicSfenOutputStream> create_new_sfen_output(const std::string& filename, SfenOutputType sfen_output_type)
+    {
+        switch(sfen_output_type)
+        {
+            case SfenOutputType::Bin:
+                return std::make_unique<BinSfenOutputStream>(filename);
+            case SfenOutputType::Binpack:
+                return std::make_unique<BinpackSfenOutputStream>(filename);
+        }
+
+        assert(false);
+        return nullptr;
+    }
+}
+
+#endif
\ No newline at end of file

From 904adb9a32f2d96f5ffb99fb5d44912adecaa518 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 19:53:41 +0200
Subject: [PATCH 201/398] Indentation consistency in learn folder

---
 src/learn/gensfen.cpp     |   3 +-
 src/learn/half_float.h    | 178 +++++-----
 src/learn/learn.cpp       |  16 +-
 src/learn/multi_think.cpp | 144 ++++----
 src/learn/multi_think.h   | 188 +++++------
 src/learn/sfen_packer.cpp | 690 +++++++++++++++++++-------------------
 6 files changed, 607 insertions(+), 612 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 4a6f26dc..1a9187ae 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -3,7 +3,6 @@
 #include "packed_sfen.h"
 #include "multi_think.h"
 #include "sfen_stream.h"
-#include "../syzygy/tbprobe.h"
 
 #include "misc.h"
 #include "position.h"
@@ -73,7 +72,7 @@ namespace Learner
             file_worker_thread.join();
             output_file_stream.reset();
 
-#if defined(_DEBUG)
+#if !defined(NDEBUG)
             {
                 // All buffers should be empty since file_worker_thread
                 // should have written everything before exiting.
diff --git a/src/learn/half_float.h b/src/learn/half_float.h
index ebe77526..5808a786 100644
--- a/src/learn/half_float.h
+++ b/src/learn/half_float.h
@@ -11,122 +11,122 @@
 
 namespace HalfFloat
 {
-	// IEEE 754 float 32 format is :
-	//   sign(1bit) + exponent(8bits) + fraction(23bits) = 32bits
-	//
-	// Our float16 format is :
-	//   sign(1bit) + exponent(5bits) + fraction(10bits) = 16bits
-	union float32_converter
-	{
-		int32_t n;
-		float f;
-	};
+    // IEEE 754 float 32 format is :
+    //   sign(1bit) + exponent(8bits) + fraction(23bits) = 32bits
+    //
+    // Our float16 format is :
+    //   sign(1bit) + exponent(5bits) + fraction(10bits) = 16bits
+    union float32_converter
+    {
+        int32_t n;
+        float f;
+    };
 
 
-	// 16-bit float
-	struct float16
-	{
-		// --- constructors
+    // 16-bit float
+    struct float16
+    {
+        // --- constructors
 
-		float16() {}
-		float16(int16_t n) { from_float((float)n);  }
-		float16(int32_t n) { from_float((float)n); }
-		float16(float n) { from_float(n); }
-		float16(double n) { from_float((float)n); }
+        float16() {}
+        float16(int16_t n) { from_float((float)n);  }
+        float16(int32_t n) { from_float((float)n); }
+        float16(float n) { from_float(n); }
+        float16(double n) { from_float((float)n); }
 
-		// build from a float
-		void from_float(float f) { *this = to_float16(f); }
+        // build from a float
+        void from_float(float f) { *this = to_float16(f); }
 
-		// --- implicit converters
+        // --- implicit converters
 
-		operator int32_t() const { return (int32_t)to_float(*this); }
-		operator float() const { return to_float(*this); }
-		operator double() const { return double(to_float(*this)); }
+        operator int32_t() const { return (int32_t)to_float(*this); }
+        operator float() const { return to_float(*this); }
+        operator double() const { return double(to_float(*this)); }
 
-		// --- operators
+        // --- operators
 
-		float16 operator += (float16 rhs) { from_float(to_float(*this) + to_float(rhs)); return *this; }
-		float16 operator -= (float16 rhs) { from_float(to_float(*this) - to_float(rhs)); return *this; }
-		float16 operator *= (float16 rhs) { from_float(to_float(*this) * to_float(rhs)); return *this; }
-		float16 operator /= (float16 rhs) { from_float(to_float(*this) / to_float(rhs)); return *this; }
-		float16 operator + (float16 rhs) const { return float16(*this) += rhs; }
-		float16 operator - (float16 rhs) const { return float16(*this) -= rhs; }
-		float16 operator * (float16 rhs) const { return float16(*this) *= rhs; }
-		float16 operator / (float16 rhs) const { return float16(*this) /= rhs; }
-		float16 operator - () const { return float16(-to_float(*this)); }
-		bool operator == (float16 rhs) const { return this->v_ == rhs.v_; }
-		bool operator != (float16 rhs) const { return !(*this == rhs); }
+        float16 operator += (float16 rhs) { from_float(to_float(*this) + to_float(rhs)); return *this; }
+        float16 operator -= (float16 rhs) { from_float(to_float(*this) - to_float(rhs)); return *this; }
+        float16 operator *= (float16 rhs) { from_float(to_float(*this) * to_float(rhs)); return *this; }
+        float16 operator /= (float16 rhs) { from_float(to_float(*this) / to_float(rhs)); return *this; }
+        float16 operator + (float16 rhs) const { return float16(*this) += rhs; }
+        float16 operator - (float16 rhs) const { return float16(*this) -= rhs; }
+        float16 operator * (float16 rhs) const { return float16(*this) *= rhs; }
+        float16 operator / (float16 rhs) const { return float16(*this) /= rhs; }
+        float16 operator - () const { return float16(-to_float(*this)); }
+        bool operator == (float16 rhs) const { return this->v_ == rhs.v_; }
+        bool operator != (float16 rhs) const { return !(*this == rhs); }
 
-		static void UnitTest() { unit_test(); }
+        static void UnitTest() { unit_test(); }
 
-	private:
+    private:
 
-		// --- entity
+        // --- entity
 
-		uint16_t v_;
+        uint16_t v_;
 
-		// --- conversion between float and float16
+        // --- conversion between float and float16
 
-		static float16 to_float16(float f)
-		{
-			float32_converter c;
-			c.f = f;
-			u32 n = c.n;
+        static float16 to_float16(float f)
+        {
+            float32_converter c;
+            c.f = f;
+            u32 n = c.n;
 
-			// The sign bit is MSB in common.
-			uint16_t sign_bit = (n >> 16) & 0x8000;
+            // The sign bit is MSB in common.
+            uint16_t sign_bit = (n >> 16) & 0x8000;
 
-			// The exponent of IEEE 754's float 32 is biased +127 , so we change this bias into +15 and limited to 5-bit.
-			uint16_t exponent = (((n >> 23) - 127 + 15) & 0x1f) << 10;
+            // The exponent of IEEE 754's float 32 is biased +127 , so we change this bias into +15 and limited to 5-bit.
+            uint16_t exponent = (((n >> 23) - 127 + 15) & 0x1f) << 10;
 
-			// The fraction is limited to 10-bit.
-			uint16_t fraction = (n >> (23-10)) & 0x3ff;
+            // The fraction is limited to 10-bit.
+            uint16_t fraction = (n >> (23-10)) & 0x3ff;
 
-			float16 f_;
-			f_.v_ = sign_bit | exponent | fraction;
+            float16 f_;
+            f_.v_ = sign_bit | exponent | fraction;
 
-			return f_;
-		}
+            return f_;
+        }
 
-		static float to_float(float16 v)
-		{
-			u32 sign_bit = (v.v_ & 0x8000) << 16;
-			u32 exponent = ((((v.v_ >> 10) & 0x1f) - 15 + 127) & 0xff) << 23;
-			u32 fraction = (v.v_ & 0x3ff) << (23 - 10);
+        static float to_float(float16 v)
+        {
+            u32 sign_bit = (v.v_ & 0x8000) << 16;
+            u32 exponent = ((((v.v_ >> 10) & 0x1f) - 15 + 127) & 0xff) << 23;
+            u32 fraction = (v.v_ & 0x3ff) << (23 - 10);
 
-			float32_converter c;
-			c.n = sign_bit | exponent | fraction;
-			return c.f;
-		}
+            float32_converter c;
+            c.n = sign_bit | exponent | fraction;
+            return c.f;
+        }
 
-		// It is not a unit test, but I confirmed that it can be calculated. I'll fix the code later (maybe).
-		static void unit_test()
-		{
-			float16 a, b, c, d;
-			a = 1;
-			std::cout << (float)a << std::endl;
-			b = -118.625;
-			std::cout << (float)b << std::endl;
-			c = 2.5;
-			std::cout << (float)c << std::endl;
-			d = a + c;
-			std::cout << (float)d << std::endl;
+        // It is not a unit test, but I confirmed that it can be calculated. I'll fix the code later (maybe).
+        static void unit_test()
+        {
+            float16 a, b, c, d;
+            a = 1;
+            std::cout << (float)a << std::endl;
+            b = -118.625;
+            std::cout << (float)b << std::endl;
+            c = 2.5;
+            std::cout << (float)c << std::endl;
+            d = a + c;
+            std::cout << (float)d << std::endl;
 
-			c *= 1.5;
-			std::cout << (float)c << std::endl;
+            c *= 1.5;
+            std::cout << (float)c << std::endl;
 
-			b /= 3;
-			std::cout << (float)b << std::endl;
+            b /= 3;
+            std::cout << (float)b << std::endl;
 
-			float f1 = 1.5;
-			a += f1;
-			std::cout << (float)a << std::endl;
+            float f1 = 1.5;
+            a += f1;
+            std::cout << (float)a << std::endl;
 
-			a += f1 * (float)a;
-			std::cout << (float)a << std::endl;
-		}
+            a += f1 * (float)a;
+            std::cout << (float)a << std::endl;
+        }
 
-	};
+    };
 
 }
 
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 6c865d98..b09700e9 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -1066,14 +1066,14 @@ namespace Learner
 
             pos.do_move((Move)ps.move, state[ply++]);
 
-			// There is a possibility that all the pieces are blocked and stuck.
-			// Also, the declaration win phase is excluded from
-			// learning because you cannot go to leaf with PV moves.
-			// (shouldn't write out such teacher aspect itself,
-			// but may have written it out with an old generation routine)
-			// Skip the position if there are no legal moves (=checkmated or stalemate).
-			if (MoveList<LEGAL>(pos).size() == 0)
-				goto RETRY_READ;
+            // There is a possibility that all the pieces are blocked and stuck.
+            // Also, the declaration win phase is excluded from
+            // learning because you cannot go to leaf with PV moves.
+            // (shouldn't write out such teacher aspect itself,
+            // but may have written it out with an old generation routine)
+            // Skip the position if there are no legal moves (=checkmated or stalemate).
+            if (MoveList<LEGAL>(pos).size() == 0)
+                goto RETRY_READ;
 
             // Evaluation value of shallow search (qsearch)
             const auto [_, pv] = Search::qsearch(pos);
diff --git a/src/learn/multi_think.cpp b/src/learn/multi_think.cpp
index daed3e96..d2ae65eb 100644
--- a/src/learn/multi_think.cpp
+++ b/src/learn/multi_think.cpp
@@ -1,103 +1,103 @@
 ﻿#include "multi_think.h"
 
-#include "nnue/evaluate_nnue.h"
-
 #include "tt.h"
 #include "uci.h"
 #include "types.h"
 #include "search.h"
 
+#include "nnue/evaluate_nnue.h"
+
 #include <thread>
 
 void MultiThink::go_think()
 {
-	// Read evaluation function, etc.
-	// In the case of the learn command, the value of the evaluation function may be corrected after reading the evaluation function, so
-	// Skip memory corruption check.
-	Eval::NNUE::init();
+    // Read evaluation function, etc.
+    // In the case of the learn command, the value of the evaluation function may be corrected after reading the evaluation function, so
+    // Skip memory corruption check.
+    Eval::NNUE::init();
 
-	// Call the derived class's init().
-	init();
+    // Call the derived class's init().
+    init();
 
-	// The loop upper limit is set with set_loop_max().
-	loop_count = 0;
-	done_count = 0;
+    // The loop upper limit is set with set_loop_max().
+    loop_count = 0;
+    done_count = 0;
 
-	// Create threads as many as Options["Threads"] and start thinking.
-	std::vector<std::thread> threads;
-	auto thread_num = (size_t)Options["Threads"];
+    // Create threads as many as Options["Threads"] and start thinking.
+    std::vector<std::thread> threads;
+    auto thread_num = (size_t)Options["Threads"];
 
-	// Secure end flag of worker thread
+    // Secure end flag of worker thread
         threads_finished=0;
 
-	// start worker thread
-	for (size_t i = 0; i < thread_num; ++i)
-	{
-		threads.push_back(std::thread([i, this]
-		{
-			// exhaust all processor threads.
-			WinProcGroup::bindThisThread(i);
+    // start worker thread
+    for (size_t i = 0; i < thread_num; ++i)
+    {
+        threads.push_back(std::thread([i, this]
+        {
+            // exhaust all processor threads.
+            WinProcGroup::bindThisThread(i);
 
-			// execute the overridden process
-			this->thread_worker(i);
+            // execute the overridden process
+            this->thread_worker(i);
 
-			// Set the end flag because the thread has ended
-			this->threads_finished++;
-		}));
-	}
+            // Set the end flag because the thread has ended
+            this->threads_finished++;
+        }));
+    }
 
-	// wait for all threads to finish
-	// for (auto& th :threads)
-	// th.join();
-	// If you write like, the thread will rush here while it is still working,
-	// During that time, callback_func() cannot be called and you cannot save.
-	// Therefore, you need to check the end flag yourself.
+    // wait for all threads to finish
+    // for (auto& th :threads)
+    // th.join();
+    // If you write like, the thread will rush here while it is still working,
+    // During that time, callback_func() cannot be called and you cannot save.
+    // Therefore, you need to check the end flag yourself.
 
-	// function to determine if all threads have finished
-	auto threads_done = [&]()
-	{
-		return threads_finished == thread_num;
-	};
+    // function to determine if all threads have finished
+    auto threads_done = [&]()
+    {
+        return threads_finished == thread_num;
+    };
 
-	// Call back if the callback function is set.
-	auto do_a_callback = [&]()
-	{
-		if (callback_func)
-			callback_func();
-	};
+    // Call back if the callback function is set.
+    auto do_a_callback = [&]()
+    {
+        if (callback_func)
+            callback_func();
+    };
 
 
-	for (uint64_t i = 0 ; ; )
-	{
-		// If all threads have finished, exit the loop.
-		if (threads_done())
-			break;
+    for (uint64_t i = 0 ; ; )
+    {
+        // If all threads have finished, exit the loop.
+        if (threads_done())
+            break;
 
-		sleep(1000);
+        sleep(1000);
 
-		// callback_func() is called every callback_seconds.
-		if (++i == callback_seconds)
-		{
-			do_a_callback();
-			// Since I am returning from ↑, I reset the counter, so
-			// no matter how long it takes to save() etc. in do_a_callback()
-			// The next call will take a certain amount of time.
-			i = 0;
-		}
-	}
+        // callback_func() is called every callback_seconds.
+        if (++i == callback_seconds)
+        {
+            do_a_callback();
+            // Since I am returning from ↑, I reset the counter, so
+            // no matter how long it takes to save() etc. in do_a_callback()
+            // The next call will take a certain amount of time.
+            i = 0;
+        }
+    }
 
-	// Last save.
-	std::cout << std::endl << "finalize..";
+    // Last save.
+    std::cout << std::endl << "finalize..";
 
-	// do_a_callback();
-	// → It should be saved by the caller, so I feel that it is not necessary here.
+    // do_a_callback();
+    // → It should be saved by the caller, so I feel that it is not necessary here.
 
-	// It is possible that the exit code of the thread is running but the exit code of the thread is running, so
-	// We need to wait for the end with join().
-	for (auto& th : threads)
-		th.join();
+    // It is possible that the exit code of the thread is running but the exit code of the thread is running, so
+    // We need to wait for the end with join().
+    for (auto& th : threads)
+        th.join();
 
-	// The file writing thread etc. are still running only when all threads are finished
-	// Since the work itself may not have completed, output only that all threads have finished.
-	std::cout << "all threads are joined." << std::endl;
+    // The file writing thread etc. are still running only when all threads are finished
+    // Since the work itself may not have completed, output only that all threads have finished.
+    std::cout << "all threads are joined." << std::endl;
 }
diff --git a/src/learn/multi_think.h b/src/learn/multi_think.h
index e6c436f8..7e541909 100644
--- a/src/learn/multi_think.h
+++ b/src/learn/multi_think.h
@@ -19,84 +19,84 @@
 // Derive and use this class.
 struct MultiThink
 {
-	static constexpr std::uint64_t LOOP_COUNT_FINISHED = std::numeric_limits<std::uint64_t>::max();
+    static constexpr std::uint64_t LOOP_COUNT_FINISHED = std::numeric_limits<std::uint64_t>::max();
 
-	MultiThink() : prng{}, loop_count(0) { }
+    MultiThink() : prng{}, loop_count(0) { }
 
-	MultiThink(std::uint64_t seed) : prng(seed), loop_count(0) { }
+    MultiThink(std::uint64_t seed) : prng(seed), loop_count(0) { }
 
-	MultiThink(const std::string& seed) : prng(seed), loop_count(0) { }
+    MultiThink(const std::string& seed) : prng(seed), loop_count(0) { }
 
-	// Call this function from the master thread, each thread will think,
-	// Return control when the thought ending condition is satisfied.
-	// Do something else.
-	// ・It is safe for each thread to call Learner::search(),qsearch()
-	// Separates the substitution table for each thread. (It will be restored after the end.)
-	// ・Book is not thread safe when in on the fly mode, so temporarily change this mode.
-	// Turn it off.
-	// [Requirements]
-	// 1) Override thread_worker()
-	// 2) Set the loop count with set_loop_max()
-	// 3) set a function to be called back periodically (if necessary)
-	// callback_func and callback_interval
-	void go_think();
+    // Call this function from the master thread, each thread will think,
+    // Return control when the thought ending condition is satisfied.
+    // Do something else.
+    // ・It is safe for each thread to call Learner::search(),qsearch()
+    // Separates the substitution table for each thread. (It will be restored after the end.)
+    // ・Book is not thread safe when in on the fly mode, so temporarily change this mode.
+    // Turn it off.
+    // [Requirements]
+    // 1) Override thread_worker()
+    // 2) Set the loop count with set_loop_max()
+    // 3) set a function to be called back periodically (if necessary)
+    // callback_func and callback_interval
+    void go_think();
 
-	// If there is something you want to initialize on the derived class side, override this,
-	// Called when initialization is completed with go_think().
-	// It is better to read the fixed trace at that timing.
-	virtual void init() {}
+    // If there is something you want to initialize on the derived class side, override this,
+    // Called when initialization is completed with go_think().
+    // It is better to read the fixed trace at that timing.
+    virtual void init() {}
 
-	// A thread worker that is called by creating a thread when you go_think()
-	// Override and use this.
-	virtual void thread_worker(size_t thread_id) = 0;
+    // A thread worker that is called by creating a thread when you go_think()
+    // Override and use this.
+    virtual void thread_worker(size_t thread_id) = 0;
 
-	// Called back every callback_seconds [seconds] when go_think().
-	std::function<void()> callback_func;
-	uint64_t callback_seconds = 600;
+    // Called back every callback_seconds [seconds] when go_think().
+    std::function<void()> callback_func;
+    uint64_t callback_seconds = 600;
 
-	// Set the number of times worker processes (calls Search::think()).
-	void set_loop_max(uint64_t loop_max_) { loop_max = loop_max_; }
+    // Set the number of times worker processes (calls Search::think()).
+    void set_loop_max(uint64_t loop_max_) { loop_max = loop_max_; }
 
-	// Get the value set by set_loop_max().
-	uint64_t get_loop_max() const { return loop_max; }
+    // Get the value set by set_loop_max().
+    uint64_t get_loop_max() const { return loop_max; }
 
-	// [ASYNC] Take the value of the loop counter and add the loop counter after taking it out.
-	// If the loop counter has reached loop_max, return UINT64_MAX.
-	// If you want to generate a phase, you must call this function at the time of generating the phase,
-	// Please note that the number of generated phases and the value of the counter will not match.
-	uint64_t get_next_loop_count() {
-		std::unique_lock<std::mutex> lk(loop_mutex);
-		if (loop_count >= loop_max)
-			return LOOP_COUNT_FINISHED;
-		return loop_count++;
-	}
+    // [ASYNC] Take the value of the loop counter and add the loop counter after taking it out.
+    // If the loop counter has reached loop_max, return UINT64_MAX.
+    // If you want to generate a phase, you must call this function at the time of generating the phase,
+    // Please note that the number of generated phases and the value of the counter will not match.
+    uint64_t get_next_loop_count() {
+        std::unique_lock<std::mutex> lk(loop_mutex);
+        if (loop_count >= loop_max)
+            return LOOP_COUNT_FINISHED;
+        return loop_count++;
+    }
 
-	// [ASYNC] For returning the processed number. Each time it is called, it returns a counter that is incremented.
-	uint64_t get_done_count() {
-		std::unique_lock<std::mutex> lk(loop_mutex);
-		return ++done_count;
-	}
+    // [ASYNC] For returning the processed number. Each time it is called, it returns a counter that is incremented.
+    uint64_t get_done_count() {
+        std::unique_lock<std::mutex> lk(loop_mutex);
+        return ++done_count;
+    }
 
-	// Mutex when worker thread accesses I/O
-	std::mutex io_mutex;
+    // Mutex when worker thread accesses I/O
+    std::mutex io_mutex;
 
 protected:
-	// Random number generator body
-	AsyncPRNG prng;
+    // Random number generator body
+    AsyncPRNG prng;
 
 private:
-	// number of times worker processes (calls Search::think())
-	std::atomic<uint64_t> loop_max;
-	// number of times the worker has processed (calls Search::think())
-	std::atomic<uint64_t> loop_count;
-	// To return the number of times it has been processed.
-	std::atomic<uint64_t> done_count;
+    // number of times worker processes (calls Search::think())
+    std::atomic<uint64_t> loop_max;
+    // number of times the worker has processed (calls Search::think())
+    std::atomic<uint64_t> loop_count;
+    // To return the number of times it has been processed.
+    std::atomic<uint64_t> done_count;
 
-	// Mutex when changing the variables in ↑
-	std::mutex loop_mutex;
+    // Mutex when changing the variables in ↑
+    std::mutex loop_mutex;
 
-	// Thread end flag.
-        std::atomic<uint64_t> threads_finished;
+    // Thread end flag.
+    std::atomic<uint64_t> threads_finished;
 };
 
 // Mechanism to process task during idle time.
@@ -105,48 +105,48 @@ private:
 // Convenient to use when you want to write MultiThink thread worker in master-slave method.
 struct TaskDispatcher
 {
-	typedef std::function<void(size_t /* thread_id */)> Task;
+    typedef std::function<void(size_t /* thread_id */)> Task;
 
-	// slave calls this function during idle.
-	void on_idle(size_t thread_id)
-	{
-		Task task;
-		while ((task = get_task_async()) != nullptr)
-			task(thread_id);
+    // slave calls this function during idle.
+    void on_idle(size_t thread_id)
+    {
+        Task task;
+        while ((task = get_task_async()) != nullptr)
+            task(thread_id);
 
-		sleep(1);
-	}
+        sleep(1);
+    }
 
-	// Stack [ASYNC] task.
-	void push_task_async(Task task)
-	{
-		std::unique_lock<std::mutex> lk(task_mutex);
-		tasks.push_back(task);
-	}
+    // Stack [ASYNC] task.
+    void push_task_async(Task task)
+    {
+        std::unique_lock<std::mutex> lk(task_mutex);
+        tasks.push_back(task);
+    }
 
-	// Allocate size array elements for task in advance.
-	void task_reserve(size_t size)
-	{
-		tasks.reserve(size);
-	}
+    // Allocate size array elements for task in advance.
+    void task_reserve(size_t size)
+    {
+        tasks.reserve(size);
+    }
 
 protected:
-	// set of tasks
-	std::vector<Task> tasks;
+    // set of tasks
+    std::vector<Task> tasks;
 
-	// Take out one [ASYNC] task. Called from on_idle().
-	Task get_task_async()
-	{
-		std::unique_lock<std::mutex> lk(task_mutex);
-		if (tasks.size() == 0)
-			return nullptr;
-		Task task = *tasks.rbegin();
-		tasks.pop_back();
-		return task;
-	}
+    // Take out one [ASYNC] task. Called from on_idle().
+    Task get_task_async()
+    {
+        std::unique_lock<std::mutex> lk(task_mutex);
+        if (tasks.size() == 0)
+            return nullptr;
+        Task task = *tasks.rbegin();
+        tasks.pop_back();
+        return task;
+    }
 
-	// a mutex for accessing tasks
-	std::mutex task_mutex;
+    // a mutex for accessing tasks
+    std::mutex task_mutex;
 };
 
 #endif
diff --git a/src/learn/sfen_packer.cpp b/src/learn/sfen_packer.cpp
index 2de7efa4..777b5943 100644
--- a/src/learn/sfen_packer.cpp
+++ b/src/learn/sfen_packer.cpp
@@ -13,378 +13,374 @@ using namespace std;
 
 namespace Learner {
 
-  // Class that handles bitstream
-  // useful when doing aspect encoding
-  struct BitStream
-  {
-    // Set the memory to store the data in advance.
-    // Assume that memory is cleared to 0.
-    void set_data(std::uint8_t* data_) { data = data_; reset(); }
-
-    // Get the pointer passed in set_data().
-    uint8_t* get_data() const { return data; }
-
-    // Get the cursor.
-    int get_cursor() const { return bit_cursor; }
-
-    // reset the cursor
-    void reset() { bit_cursor = 0; }
-
-    // Write 1bit to the stream.
-    // If b is non-zero, write out 1. If 0, write 0.
-    void write_one_bit(int b)
+    // Class that handles bitstream
+    // useful when doing aspect encoding
+    struct BitStream
     {
-      if (b)
-        data[bit_cursor / 8] |= 1 << (bit_cursor & 7);
+        // Set the memory to store the data in advance.
+        // Assume that memory is cleared to 0.
+        void set_data(std::uint8_t* data_) { data = data_; reset(); }
 
-      ++bit_cursor;
-    }
+        // Get the pointer passed in set_data().
+        uint8_t* get_data() const { return data; }
 
-    // Get 1 bit from the stream.
-    int read_one_bit()
+        // Get the cursor.
+        int get_cursor() const { return bit_cursor; }
+
+        // reset the cursor
+        void reset() { bit_cursor = 0; }
+
+        // Write 1bit to the stream.
+        // If b is non-zero, write out 1. If 0, write 0.
+        void write_one_bit(int b)
+        {
+            if (b)
+                data[bit_cursor / 8] |= 1 << (bit_cursor & 7);
+
+            ++bit_cursor;
+        }
+
+        // Get 1 bit from the stream.
+        int read_one_bit()
+        {
+            int b = (data[bit_cursor / 8] >> (bit_cursor & 7)) & 1;
+            ++bit_cursor;
+
+            return b;
+        }
+
+        // write n bits of data
+        // Data shall be written out from the lower order of d.
+        void write_n_bit(int d, int n)
+        {
+            for (int i = 0; i <n; ++i)
+                write_one_bit(d & (1 << i));
+        }
+
+        // read n bits of data
+        // Reverse conversion of write_n_bit().
+        int read_n_bit(int n)
+        {
+            int result = 0;
+            for (int i = 0; i < n; ++i)
+                result |= read_one_bit() ? (1 << i) : 0;
+
+            return result;
+        }
+
+    private:
+        // Next bit position to read/write.
+        int bit_cursor;
+
+        // data entity
+        std::uint8_t* data;
+    };
+
+    // Class for compressing/decompressing sfen
+    // sfen can be packed to 256bit (32bytes) by Huffman coding.
+    // This is proven by mini. The above is Huffman coding.
+    //
+    // Internal format = 1-bit turn + 7-bit king position *2 + piece on board (Huffman coding) + hand piece (Huffman coding)
+    // Side to move (White = 0, Black = 1) (1bit)
+    // White King Position (6 bits)
+    // Black King Position (6 bits)
+    // Huffman Encoding of the board
+    // Castling availability (1 bit x 4)
+    // En passant square (1 or 1 + 6 bits)
+    // Rule 50 (6 bits)
+    // Game play (8 bits)
+    //
+    // TODO(someone): Rename SFEN to FEN.
+    //
+    struct SfenPacker
     {
-      int b = (data[bit_cursor / 8] >> (bit_cursor & 7)) & 1;
-      ++bit_cursor;
+        void pack(const Position& pos);
 
-      return b;
-    }
+        // sfen packed by pack() (256bit = 32bytes)
+        // Or sfen to decode with unpack()
+        uint8_t *data; // uint8_t[32];
 
-    // write n bits of data
-    // Data shall be written out from the lower order of d.
-    void write_n_bit(int d, int n)
+        BitStream stream;
+
+        // Output the board pieces to stream.
+        void write_board_piece_to_stream(Piece pc);
+
+        // Read one board piece from stream
+        Piece read_board_piece_from_stream();
+    };
+
+
+    // Huffman coding
+    // * is simplified from mini encoding to make conversion easier.
+    //
+    // Huffman Encoding
+    //
+    // Empty  xxxxxxx0
+    // Pawn   xxxxx001 + 1 bit (Color)
+    // Knight xxxxx011 + 1 bit (Color)
+    // Bishop xxxxx101 + 1 bit (Color)
+    // Rook   xxxxx111 + 1 bit (Color)
+    // Queen   xxxx1001 + 1 bit (Color)
+    //
+    // Worst case:
+    // - 32 empty squares    32 bits
+    // - 30 pieces           150 bits
+    // - 2 kings             12 bits
+    // - castling rights     4 bits
+    // - ep square           7 bits
+    // - rule50              7 bits
+    // - game ply            16 bits
+    // - TOTAL               228 bits < 256 bits
+
+    struct HuffmanedPiece
     {
-      for (int i = 0; i <n; ++i)
-        write_one_bit(d & (1 << i));
-    }
+        int code; // how it will be coded
+        int bits; // How many bits do you have
+    };
 
-    // read n bits of data
-    // Reverse conversion of write_n_bit().
-    int read_n_bit(int n)
+    constexpr HuffmanedPiece huffman_table[] =
     {
-      int result = 0;
-      for (int i = 0; i < n; ++i)
-        result |= read_one_bit() ? (1 << i) : 0;
+        {0b0000,1}, // NO_PIECE
+        {0b0001,4}, // PAWN
+        {0b0011,4}, // KNIGHT
+        {0b0101,4}, // BISHOP
+        {0b0111,4}, // ROOK
+        {0b1001,4}, // QUEEN
+    };
 
-      return result;
+    // Pack sfen and store in data[32].
+    void SfenPacker::pack(const Position& pos)
+    {
+        memset(data, 0, 32 /* 256bit */);
+        stream.set_data(data);
+
+        // turn
+        // Side to move.
+        stream.write_one_bit((int)(pos.side_to_move()));
+
+        // 7-bit positions for leading and trailing balls
+        // White king and black king, 6 bits for each.
+        for(auto c: Colors)
+            stream.write_n_bit(pos.king_square(c), 6);
+
+        // Write the pieces on the board other than the kings.
+        for (Rank r = RANK_8; r >= RANK_1; --r)
+        {
+            for (File f = FILE_A; f <= FILE_H; ++f)
+            {
+                Piece pc = pos.piece_on(make_square(f, r));
+                if (type_of(pc) == KING)
+                    continue;
+                write_board_piece_to_stream(pc);
+            }
+        }
+
+        // TODO(someone): Support chess960.
+        stream.write_one_bit(pos.can_castle(WHITE_OO));
+        stream.write_one_bit(pos.can_castle(WHITE_OOO));
+        stream.write_one_bit(pos.can_castle(BLACK_OO));
+        stream.write_one_bit(pos.can_castle(BLACK_OOO));
+
+        if (pos.ep_square() == SQ_NONE) {
+            stream.write_one_bit(0);
+        }
+        else {
+            stream.write_one_bit(1);
+            stream.write_n_bit(static_cast<int>(pos.ep_square()), 6);
+        }
+
+        stream.write_n_bit(pos.state()->rule50, 6);
+
+        const int fm = 1 + (pos.game_ply()-(pos.side_to_move() == BLACK)) / 2;
+        stream.write_n_bit(fm, 8);
+
+        // Write high bits of half move. This is a fix for the
+        // limited range of half move counter.
+        // This is backwards compatibile.
+        stream.write_n_bit(fm >> 8, 8);
+
+        // Write the highest bit of rule50 at the end. This is a backwards
+        // compatibile fix for rule50 having only 6 bits stored.
+        // This bit is just ignored by the old parsers.
+        stream.write_n_bit(pos.state()->rule50 >> 6, 1);
+
+        assert(stream.get_cursor() <= 256);
     }
 
-  private:
-    // Next bit position to read/write.
-    int bit_cursor;
-
-    // data entity
-    std::uint8_t* data;
-  };
-
-  // Class for compressing/decompressing sfen
-  // sfen can be packed to 256bit (32bytes) by Huffman coding.
-  // This is proven by mini. The above is Huffman coding.
-  //
-  // Internal format = 1-bit turn + 7-bit king position *2 + piece on board (Huffman coding) + hand piece (Huffman coding)
-  // Side to move (White = 0, Black = 1) (1bit)
-  // White King Position (6 bits)
-  // Black King Position (6 bits)
-  // Huffman Encoding of the board
-  // Castling availability (1 bit x 4)
-  // En passant square (1 or 1 + 6 bits)
-  // Rule 50 (6 bits)
-  // Game play (8 bits)
-  //
-  // TODO(someone): Rename SFEN to FEN.
-  //
-  struct SfenPacker
-  {
-    void pack(const Position& pos);
-
-    // sfen packed by pack() (256bit = 32bytes)
-    // Or sfen to decode with unpack()
-    uint8_t *data; // uint8_t[32];
-
-    BitStream stream;
-
     // Output the board pieces to stream.
-    void write_board_piece_to_stream(Piece pc);
+    void SfenPacker::write_board_piece_to_stream(Piece pc)
+    {
+        // piece type
+        PieceType pr = type_of(pc);
+        auto c = huffman_table[pr];
+        stream.write_n_bit(c.code, c.bits);
+
+        if (pc == NO_PIECE)
+            return;
+
+        // first and second flag
+        stream.write_one_bit(color_of(pc));
+    }
 
     // Read one board piece from stream
-    Piece read_board_piece_from_stream();
-  };
-
-
-  // Huffman coding
-  // * is simplified from mini encoding to make conversion easier.
-  //
-  // Huffman Encoding
-  //
-  // Empty  xxxxxxx0
-  // Pawn   xxxxx001 + 1 bit (Color)
-  // Knight xxxxx011 + 1 bit (Color)
-  // Bishop xxxxx101 + 1 bit (Color)
-  // Rook   xxxxx111 + 1 bit (Color)
-  // Queen   xxxx1001 + 1 bit (Color)
-  //
-  // Worst case:
-  // - 32 empty squares    32 bits
-  // - 30 pieces           150 bits
-  // - 2 kings             12 bits
-  // - castling rights     4 bits
-  // - ep square           7 bits
-  // - rule50              7 bits
-  // - game ply            16 bits
-  // - TOTAL               228 bits < 256 bits
-
-  struct HuffmanedPiece
-  {
-    int code; // how it will be coded
-    int bits; // How many bits do you have
-  };
-
-  constexpr HuffmanedPiece huffman_table[] =
-  {
-    {0b0000,1}, // NO_PIECE
-    {0b0001,4}, // PAWN
-    {0b0011,4}, // KNIGHT
-    {0b0101,4}, // BISHOP
-    {0b0111,4}, // ROOK
-    {0b1001,4}, // QUEEN
-  };
-
-  // Pack sfen and store in data[32].
-  void SfenPacker::pack(const Position& pos)
-  {
-  // cout << pos;
-
-    memset(data, 0, 32 /* 256bit */);
-    stream.set_data(data);
-
-    // turn
-    // Side to move.
-    stream.write_one_bit((int)(pos.side_to_move()));
-
-    // 7-bit positions for leading and trailing balls
-    // White king and black king, 6 bits for each.
-    for(auto c: Colors)
-      stream.write_n_bit(pos.king_square(c), 6);
-
-    // Write the pieces on the board other than the kings.
-    for (Rank r = RANK_8; r >= RANK_1; --r)
+    Piece SfenPacker::read_board_piece_from_stream()
     {
-      for (File f = FILE_A; f <= FILE_H; ++f)
-      {
-        Piece pc = pos.piece_on(make_square(f, r));
-        if (type_of(pc) == KING)
-          continue;
-        write_board_piece_to_stream(pc);
-      }
-    }
-
-    // TODO(someone): Support chess960.
-    stream.write_one_bit(pos.can_castle(WHITE_OO));
-    stream.write_one_bit(pos.can_castle(WHITE_OOO));
-    stream.write_one_bit(pos.can_castle(BLACK_OO));
-    stream.write_one_bit(pos.can_castle(BLACK_OOO));
-
-    if (pos.ep_square() == SQ_NONE) {
-      stream.write_one_bit(0);
-    }
-    else {
-      stream.write_one_bit(1);
-      stream.write_n_bit(static_cast<int>(pos.ep_square()), 6);
-    }
-
-    stream.write_n_bit(pos.state()->rule50, 6);
-
-    const int fm = 1 + (pos.game_ply()-(pos.side_to_move() == BLACK)) / 2;
-    stream.write_n_bit(fm, 8);
-
-    // Write high bits of half move. This is a fix for the
-    // limited range of half move counter.
-    // This is backwards compatibile.
-    stream.write_n_bit(fm >> 8, 8);
-
-    // Write the highest bit of rule50 at the end. This is a backwards
-    // compatibile fix for rule50 having only 6 bits stored.
-    // This bit is just ignored by the old parsers.
-    stream.write_n_bit(pos.state()->rule50 >> 6, 1);
-
-    assert(stream.get_cursor() <= 256);
-  }
-
-  // Output the board pieces to stream.
-  void SfenPacker::write_board_piece_to_stream(Piece pc)
-  {
-    // piece type
-    PieceType pr = type_of(pc);
-    auto c = huffman_table[pr];
-    stream.write_n_bit(c.code, c.bits);
-
-    if (pc == NO_PIECE)
-      return;
-
-    // first and second flag
-    stream.write_one_bit(color_of(pc));
-  }
-
-  // Read one board piece from stream
-  Piece SfenPacker::read_board_piece_from_stream()
-  {
-    PieceType pr = NO_PIECE_TYPE;
-    int code = 0, bits = 0;
-    while (true)
-    {
-      code |= stream.read_one_bit() << bits;
-      ++bits;
-
-      assert(bits <= 6);
-
-      for (pr = NO_PIECE_TYPE; pr <KING; ++pr)
-        if (huffman_table[pr].code == code
-          && huffman_table[pr].bits == bits)
-          goto Found;
-    }
-  Found:;
-    if (pr == NO_PIECE_TYPE)
-      return NO_PIECE;
-
-    // first and second flag
-    Color c = (Color)stream.read_one_bit();
-
-    return make_piece(c, pr);
-  }
-
-  int set_from_packed_sfen(Position& pos, const PackedSfen& sfen, StateInfo* si, Thread* th)
-  {
-    SfenPacker packer;
-    auto& stream = packer.stream;
-
-    // TODO: separate streams for writing and reading. Here we actually have to
-    // const_cast which is not safe in the long run.
-    stream.set_data(const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(&sfen)));
-
-    pos.clear();
-    std::memset(si, 0, sizeof(StateInfo));
-    std::fill_n(&pos.pieceList[0][0], sizeof(pos.pieceList) / sizeof(Square), SQ_NONE);
-    pos.st = si;
-
-    // Active color
-    pos.sideToMove = (Color)stream.read_one_bit();
-
-    pos.pieceList[W_KING][0] = SQUARE_NB;
-    pos.pieceList[B_KING][0] = SQUARE_NB;
-
-    // First the position of the ball
-    for (auto c : Colors)
-      pos.board[stream.read_n_bit(6)] = make_piece(c, KING);
-
-    // Piece placement
-    for (Rank r = RANK_8; r >= RANK_1; --r)
-    {
-      for (File f = FILE_A; f <= FILE_H; ++f)
-      {
-        auto sq = make_square(f, r);
-
-        // it seems there are already balls
-        Piece pc;
-        if (type_of(pos.board[sq]) != KING)
+        PieceType pr = NO_PIECE_TYPE;
+        int code = 0, bits = 0;
+        while (true)
         {
-          assert(pos.board[sq] == NO_PIECE);
-          pc = packer.read_board_piece_from_stream();
+            code |= stream.read_one_bit() << bits;
+            ++bits;
+
+            assert(bits <= 6);
+
+            for (pr = NO_PIECE_TYPE; pr <KING; ++pr)
+                if (huffman_table[pr].code == code
+                    && huffman_table[pr].bits == bits)
+                    goto Found;
         }
-        else
+    Found:;
+        if (pr == NO_PIECE_TYPE)
+            return NO_PIECE;
+
+        // first and second flag
+        Color c = (Color)stream.read_one_bit();
+
+        return make_piece(c, pr);
+    }
+
+    int set_from_packed_sfen(Position& pos, const PackedSfen& sfen, StateInfo* si, Thread* th)
+    {
+        SfenPacker packer;
+        auto& stream = packer.stream;
+
+        // TODO: separate streams for writing and reading. Here we actually have to
+        // const_cast which is not safe in the long run.
+        stream.set_data(const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(&sfen)));
+
+        pos.clear();
+        std::memset(si, 0, sizeof(StateInfo));
+        std::fill_n(&pos.pieceList[0][0], sizeof(pos.pieceList) / sizeof(Square), SQ_NONE);
+        pos.st = si;
+
+        // Active color
+        pos.sideToMove = (Color)stream.read_one_bit();
+
+        pos.pieceList[W_KING][0] = SQUARE_NB;
+        pos.pieceList[B_KING][0] = SQUARE_NB;
+
+        // First the position of the ball
+        for (auto c : Colors)
+            pos.board[stream.read_n_bit(6)] = make_piece(c, KING);
+
+        // Piece placement
+        for (Rank r = RANK_8; r >= RANK_1; --r)
         {
-          pc = pos.board[sq];
-          // put_piece() will catch ASSERT unless you remove it all.
-          pos.board[sq] = NO_PIECE;
+            for (File f = FILE_A; f <= FILE_H; ++f)
+            {
+                auto sq = make_square(f, r);
+
+                // it seems there are already balls
+                Piece pc;
+                if (type_of(pos.board[sq]) != KING)
+                {
+                    assert(pos.board[sq] == NO_PIECE);
+                    pc = packer.read_board_piece_from_stream();
+                }
+                else
+                {
+                    pc = pos.board[sq];
+                    // put_piece() will catch ASSERT unless you remove it all.
+                    pos.board[sq] = NO_PIECE;
+                }
+
+                // There may be no pieces, so skip in that case.
+                if (pc == NO_PIECE)
+                    continue;
+
+                pos.put_piece(Piece(pc), sq);
+
+                if (stream.get_cursor()> 256)
+                    return 1;
+            }
         }
 
-        // There may be no pieces, so skip in that case.
-        if (pc == NO_PIECE)
-          continue;
+        // Castling availability.
+        // TODO(someone): Support chess960.
+        pos.st->castlingRights = 0;
+        if (stream.read_one_bit()) {
+            Square rsq;
+            for (rsq = relative_square(WHITE, SQ_H1); pos.piece_on(rsq) != W_ROOK; --rsq) {}
+            pos.set_castling_right(WHITE, rsq);
+        }
+        if (stream.read_one_bit()) {
+            Square rsq;
+            for (rsq = relative_square(WHITE, SQ_A1); pos.piece_on(rsq) != W_ROOK; ++rsq) {}
+            pos.set_castling_right(WHITE, rsq);
+        }
+        if (stream.read_one_bit()) {
+            Square rsq;
+            for (rsq = relative_square(BLACK, SQ_H1); pos.piece_on(rsq) != B_ROOK; --rsq) {}
+            pos.set_castling_right(BLACK, rsq);
+        }
+        if (stream.read_one_bit()) {
+            Square rsq;
+            for (rsq = relative_square(BLACK, SQ_A1); pos.piece_on(rsq) != B_ROOK; ++rsq) {}
+            pos.set_castling_right(BLACK, rsq);
+        }
 
-        pos.put_piece(Piece(pc), sq);
+        // En passant square. Ignore if no pawn capture is possible
+        if (stream.read_one_bit()) {
+            Square ep_square = static_cast<Square>(stream.read_n_bit(6));
+            pos.st->epSquare = ep_square;
 
-        if (stream.get_cursor()> 256)
-          return 1;
+            if (!(pos.attackers_to(pos.st->epSquare) & pos.pieces(pos.sideToMove, PAWN))
+                || !(pos.pieces(~pos.sideToMove, PAWN) & (pos.st->epSquare + pawn_push(~pos.sideToMove))))
+                pos.st->epSquare = SQ_NONE;
+        }
+        else {
+            pos.st->epSquare = SQ_NONE;
+        }
 
-        //assert(stream.get_cursor() <= 256);
-      }
+        // Halfmove clock
+        pos.st->rule50 = stream.read_n_bit(6);
+
+        // Fullmove number
+        pos.gamePly = stream.read_n_bit(8);
+
+        // Read the highest bit of rule50. This was added as a fix for rule50
+        // counter having only 6 bits stored.
+        // In older entries this will just be a zero bit.
+        pos.gamePly |= stream.read_n_bit(8) << 8;
+
+        // Read the highest bit of rule50. This was added as a fix for rule50
+        // counter having only 6 bits stored.
+        // In older entries this will just be a zero bit.
+        pos.st->rule50 |= stream.read_n_bit(1) << 6;
+
+        // Convert from fullmove starting from 1 to gamePly starting from 0,
+        // handle also common incorrect FEN with fullmove = 0.
+        pos.gamePly = std::max(2 * (pos.gamePly - 1), 0) + (pos.sideToMove == BLACK);
+
+        assert(stream.get_cursor() <= 256);
+
+        pos.chess960 = false;
+        pos.thisThread = th;
+        pos.set_state(pos.st);
+
+        assert(pos.pos_is_ok());
+
+        return 0;
     }
 
-    // Castling availability.
-    // TODO(someone): Support chess960.
-    pos.st->castlingRights = 0;
-    if (stream.read_one_bit()) {
-      Square rsq;
-      for (rsq = relative_square(WHITE, SQ_H1); pos.piece_on(rsq) != W_ROOK; --rsq) {}
-      pos.set_castling_right(WHITE, rsq);
+    PackedSfen sfen_pack(Position& pos)
+    {
+        PackedSfen sfen;
+
+        SfenPacker sp;
+        sp.data = (uint8_t*)&sfen;
+        sp.pack(pos);
+
+        return sfen;
     }
-    if (stream.read_one_bit()) {
-      Square rsq;
-      for (rsq = relative_square(WHITE, SQ_A1); pos.piece_on(rsq) != W_ROOK; ++rsq) {}
-      pos.set_castling_right(WHITE, rsq);
-    }
-    if (stream.read_one_bit()) {
-      Square rsq;
-      for (rsq = relative_square(BLACK, SQ_H1); pos.piece_on(rsq) != B_ROOK; --rsq) {}
-      pos.set_castling_right(BLACK, rsq);
-    }
-    if (stream.read_one_bit()) {
-      Square rsq;
-      for (rsq = relative_square(BLACK, SQ_A1); pos.piece_on(rsq) != B_ROOK; ++rsq) {}
-      pos.set_castling_right(BLACK, rsq);
-    }
-
-    // En passant square. Ignore if no pawn capture is possible
-    if (stream.read_one_bit()) {
-      Square ep_square = static_cast<Square>(stream.read_n_bit(6));
-      pos.st->epSquare = ep_square;
-
-      if (!(pos.attackers_to(pos.st->epSquare) & pos.pieces(pos.sideToMove, PAWN))
-        || !(pos.pieces(~pos.sideToMove, PAWN) & (pos.st->epSquare + pawn_push(~pos.sideToMove))))
-        pos.st->epSquare = SQ_NONE;
-    }
-    else {
-      pos.st->epSquare = SQ_NONE;
-    }
-
-    // Halfmove clock
-    pos.st->rule50 = stream.read_n_bit(6);
-
-    // Fullmove number
-    pos.gamePly = stream.read_n_bit(8);
-
-    // Read the highest bit of rule50. This was added as a fix for rule50
-    // counter having only 6 bits stored.
-    // In older entries this will just be a zero bit.
-    pos.gamePly |= stream.read_n_bit(8) << 8;
-
-    // Read the highest bit of rule50. This was added as a fix for rule50
-    // counter having only 6 bits stored.
-    // In older entries this will just be a zero bit.
-    pos.st->rule50 |= stream.read_n_bit(1) << 6;
-
-    // Convert from fullmove starting from 1 to gamePly starting from 0,
-    // handle also common incorrect FEN with fullmove = 0.
-    pos.gamePly = std::max(2 * (pos.gamePly - 1), 0) + (pos.sideToMove == BLACK);
-
-    assert(stream.get_cursor() <= 256);
-
-    pos.chess960 = false;
-    pos.thisThread = th;
-    pos.set_state(pos.st);
-
-    assert(pos.pos_is_ok());
-
-    return 0;
-  }
-
-  PackedSfen sfen_pack(Position& pos)
-  {
-    PackedSfen sfen;
-
-    SfenPacker sp;
-    sp.data = (uint8_t*)&sfen;
-    sp.pack(pos);
-
-    return sfen;
-  }
 }

From 5856237e3f397ae2db2a0c69e5648386507b019f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 19:59:31 +0200
Subject: [PATCH 202/398] Rename hirate to startpos

---
 src/learn/learn.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index b09700e9..d0e84945 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -768,12 +768,11 @@ namespace Learner
         atomic<int> move_accord_count;
         move_accord_count = 0;
 
-        // Display the value of eval() in the initial stage of Hirate and see the shaking.
         auto th = Threads[thread_id];
         auto& pos = th->rootPos;
         StateInfo si;
         pos.set(StartFEN, false, &si, th);
-        cout << "hirate eval = " << Eval::evaluate(pos) << endl;
+        cout << "startpos eval = " << Eval::evaluate(pos) << endl;
 
         // It's better to parallelize here, but it's a bit
         // troublesome because the search before slave has not finished.

From e503cc4ea80920d96be58f11c36c828076b380de Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 19:59:41 +0200
Subject: [PATCH 203/398] Add one more empty line between progress reports.

---
 src/learn/learn.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index d0e84945..0fce5d95 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -903,6 +903,10 @@ namespace Learner
                     << " , learn_entropy = " << learn_sum_entropy / done
                     << endl;
             }
+
+            // Bigger space between progress reports so that they can be more
+            // easly disinguished. Looking for timestamps is hard.
+            cout << endl;
         }
         else
         {

From 5db46d0c82a12f834e65d6464e43b9aa346d3b3f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 20:20:10 +0200
Subject: [PATCH 204/398] Verify whether there is a network being used during
 training.

---
 src/learn/gensfen.cpp      |  2 +-
 src/learn/learn.cpp        | 46 ++++++++++++++++++++------------------
 src/learn/multi_think.cpp  |  5 -----
 src/nnue/evaluate_nnue.cpp | 30 ++++++++++++++++++++++++-
 src/nnue/evaluate_nnue.h   |  3 ++-
 src/search.cpp             |  2 +-
 src/uci.cpp                |  2 +-
 7 files changed, 58 insertions(+), 32 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 1a9187ae..22fddafb 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -1000,7 +1000,7 @@ namespace Learner
             << "  detect_draw_by_insufficient_mating_material = " << detect_draw_by_insufficient_mating_material << endl;
 
         // Show if the training data generator uses NNUE.
-        Eval::NNUE::verify();
+        Eval::NNUE::verify_eval_file_loaded();
 
         Threads.main()->ponder = false;
 
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 0fce5d95..a0a8ec07 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -1486,6 +1486,27 @@ namespace Learner
         std::cout << "..shuffle_on_memory done." << std::endl;
     }
 
+    static void set_learning_search_limits()
+    {
+        // About Search::Limits
+        // Be careful because this member variable is global and affects other threads.
+        auto& limits = Search::Limits;
+
+        limits.startTime = now();
+
+        // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
+        limits.infinite = true;
+
+        // Since PV is an obstacle when displayed, erase it.
+        limits.silent = true;
+
+        // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
+        limits.nodes = 0;
+
+        // depth is also processed by the one passed as an argument of Learner::search().
+        limits.depth = 0;
+    }
+
     // Learning from the generated game record
     void learn(Position&, istringstream& is)
     {
@@ -1837,30 +1858,9 @@ namespace Learner
 
         cout << "init.." << endl;
 
-        // Read evaluation function parameters
-        Eval::NNUE::init();
-
         Threads.main()->ponder = false;
 
-        // About Search::Limits
-        // Be careful because this member variable is global and affects other threads.
-        {
-          auto& limits = Search::Limits;
-
-          limits.startTime = now();
-
-          // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
-          limits.infinite = true;
-
-          // Since PV is an obstacle when displayed, erase it.
-          limits.silent = true;
-
-          // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
-          limits.nodes = 0;
-
-          // depth is also processed by the one passed as an argument of Learner::search().
-          limits.depth = 0;
-        }
+        set_learning_search_limits();
 
         cout << "init_training.." << endl;
         Eval::NNUE::InitializeTraining(seed);
@@ -1907,6 +1907,8 @@ namespace Learner
             sr.read_validation_set(validation_set_file_name, eval_limit);
         }
 
+        Eval::NNUE::verify_any_net_loaded();
+
         // Calculate rmse once at this point (timing of 0 sfen)
         // sr.calc_rmse();
 
diff --git a/src/learn/multi_think.cpp b/src/learn/multi_think.cpp
index d2ae65eb..bf1ab29b 100644
--- a/src/learn/multi_think.cpp
+++ b/src/learn/multi_think.cpp
@@ -11,11 +11,6 @@
 
 void MultiThink::go_think()
 {
-    // Read evaluation function, etc.
-    // In the case of the learn command, the value of the evaluation function may be corrected after reading the evaluation function, so
-    // Skip memory corruption check.
-    Eval::NNUE::init();
-
     // Call the derived class's init().
     init();
 
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index f7f9adcc..e3a7be63 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -235,6 +235,7 @@ namespace Eval::NNUE {
             else
             {
                 sync_cout << "info string ERROR: failed to load eval file " << directory + eval_file << sync_endl;
+                eval_file_loaded.clear();
             }
         }
 
@@ -243,7 +244,7 @@ namespace Eval::NNUE {
   }
 
   /// NNUE::verify() verifies that the last net used was loaded successfully
-  void verify() {
+  void verify_eval_file_loaded() {
 
     std::string eval_file = std::string(Options["EvalFile"]);
 
@@ -273,4 +274,31 @@ namespace Eval::NNUE {
         sync_cout << "info string classical evaluation enabled" << sync_endl;
   }
 
+  /// In training we override eval file so this is useful.
+  void verify_any_net_loaded() {
+
+    if (useNNUE != UseNNUEMode::False && eval_file_loaded.empty())
+    {
+        UCI::OptionsMap defaults;
+        UCI::init(defaults);
+
+        std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
+        std::string msg2 = "The option is set to true, but the network file was not loaded successfully.";
+        std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
+        std::string msg5 = "The engine will be terminated now.";
+
+        sync_cout << "info string ERROR: " << msg1 << sync_endl;
+        sync_cout << "info string ERROR: " << msg2 << sync_endl;
+        sync_cout << "info string ERROR: " << msg3 << sync_endl;
+        sync_cout << "info string ERROR: " << msg5 << sync_endl;
+
+        std::exit(EXIT_FAILURE);
+    }
+
+    if (useNNUE != UseNNUEMode::False)
+        sync_cout << "info string NNUE evaluation using " << eval_file_loaded << " enabled" << sync_endl;
+    else
+        sync_cout << "info string classical evaluation enabled" << sync_endl;
+  }
+
 } // namespace Eval::NNUE
diff --git a/src/nnue/evaluate_nnue.h b/src/nnue/evaluate_nnue.h
index dcfa071d..5335713b 100644
--- a/src/nnue/evaluate_nnue.h
+++ b/src/nnue/evaluate_nnue.h
@@ -96,7 +96,8 @@ namespace Eval::NNUE {
   Value evaluate(const Position& pos);
   bool load_eval(std::string name, std::istream& stream);
   void init();
-  void verify();
+  void verify_eval_file_loaded();
+  void verify_any_net_loaded();
 
 }  // namespace Eval::NNUE
 
diff --git a/src/search.cpp b/src/search.cpp
index 79848812..436e11fd 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -219,7 +219,7 @@ void MainThread::search() {
   Time.init(Limits, us, rootPos.game_ply());
   TT.new_search();
 
-  Eval::NNUE::verify();
+  Eval::NNUE::verify_eval_file_loaded();
 
   if (rootMoves.empty())
   {
diff --git a/src/uci.cpp b/src/uci.cpp
index ff735b2e..896f6db8 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -101,7 +101,7 @@ namespace {
     Position p;
     p.set(pos.fen(), Options["UCI_Chess960"], &states->back(), Threads.main());
 
-    Eval::NNUE::verify();
+    Eval::NNUE::verify_eval_file_loaded();
 
     sync_cout << "\n" << Eval::trace(p) << sync_endl;
   }

From 3cf193a90eb400b0bea0dfd562cf41ba8a2d420b Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 16 Oct 2020 19:37:53 +0200
Subject: [PATCH 205/398] Properly handle cases in verify and init when
 SkipLoadingEval is set.

---
 src/nnue/evaluate_nnue.cpp | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index e3a7be63..4d8a4b66 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -186,11 +186,6 @@ namespace Eval::NNUE {
 
     Initialize();
 
-    if (Options["SkipLoadingEval"])
-    {
-      std::cout << "info string SkipLoadingEval set to true, Net not loaded!" << std::endl;
-      return true;
-    }
     fileName = name;
     return ReadParameters(stream);
   }
@@ -210,8 +205,12 @@ namespace Eval::NNUE {
   void init() {
 
     useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
-    if (useNNUE == UseNNUEMode::False)
-        return;
+
+    if (Options["SkipLoadingEval"] || useNNUE == UseNNUEMode::False)
+    {
+      eval_file_loaded.clear();
+      return;
+    }
 
     std::string eval_file = std::string(Options["EvalFile"]);
 
@@ -277,7 +276,7 @@ namespace Eval::NNUE {
   /// In training we override eval file so this is useful.
   void verify_any_net_loaded() {
 
-    if (useNNUE != UseNNUEMode::False && eval_file_loaded.empty())
+    if (!Options["SkipLoadingEval"] && useNNUE != UseNNUEMode::False && eval_file_loaded.empty())
     {
         UCI::OptionsMap defaults;
         UCI::init(defaults);

From c93f8732bfcdebcb23518c3ffe58ce9a5356cfac Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 16 Oct 2020 19:40:05 +0200
Subject: [PATCH 206/398] Force Use NNUE to pure when learning.

---
 src/learn/learn.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index a0a8ec07..95cbe4bb 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -1685,7 +1685,6 @@ namespace Learner
             else if (option == "seed") is >> seed;
             else if (option == "set_recommended_uci_options")
             {
-                UCI::setoption("Use NNUE", "pure");
                 UCI::setoption("MultiPV", "1");
                 UCI::setoption("Contempt", "0");
                 UCI::setoption("Skill Level", "20");
@@ -1907,6 +1906,9 @@ namespace Learner
             sr.read_validation_set(validation_set_file_name, eval_limit);
         }
 
+        cout << "Forcing Use NNUE pure.\n";
+        UCI::setoption("Use NNUE", "pure");
+
         Eval::NNUE::verify_any_net_loaded();
 
         // Calculate rmse once at this point (timing of 0 sfen)

From ca760c3a5b78e74a06f3790492e91281a6c1159c Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 20:24:26 +0200
Subject: [PATCH 207/398] Cleanup architecture files.

---
 .../architectures/halfkp-cr-ep_256x2-32-32.h  | 66 +++++++++----------
 .../architectures/halfkp-cr_256x2-32-32.h     | 38 +++++------
 src/nnue/architectures/halfkp_256x2-32-32.h   | 60 ++++++++---------
 src/nnue/architectures/halfkp_384x2-32-32.h   | 44 ++++++-------
 .../architectures/k-p-cr-ep_256x2-32-32.h     | 36 +++++-----
 src/nnue/architectures/k-p-cr_256x2-32-32.h   | 34 +++++-----
 src/nnue/architectures/k-p_256x2-32-32.h      | 44 ++++++-------
 7 files changed, 153 insertions(+), 169 deletions(-)

diff --git a/src/nnue/architectures/halfkp-cr-ep_256x2-32-32.h b/src/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
index a90de8e6..6327b78a 100644
--- a/src/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
+++ b/src/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // Definition of input features and network structure used in NNUE evaluation function
@@ -21,36 +21,36 @@
 #ifndef NNUE_HALFKP_CR_EP_256X2_32_32_H_INCLUDED
 #define NNUE_HALFKP_CR_EP_256X2_32_32_H_INCLUDED
 
-#include "../features/feature_set.h"
-#include "../features/half_kp.h"
-#include "../features/castling_right.h"
-#include "../features/enpassant.h"
+#include "nnue/features/feature_set.h"
+#include "nnue/features/half_kp.h"
+#include "nnue/features/castling_right.h"
+#include "nnue/features/enpassant.h"
 
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"
 
 namespace Eval::NNUE {
 
-// Input features used in evaluation function
-using RawFeatures = Features::FeatureSet<
-    Features::HalfKP<Features::Side::kFriend>, Features::CastlingRight,
-    Features::EnPassant>;
+    // Input features used in evaluation function
+    using RawFeatures = Features::FeatureSet<
+        Features::HalfKP<Features::Side::kFriend>, Features::CastlingRight,
+        Features::EnPassant>;
 
-// Number of input feature dimensions after conversion
-constexpr IndexType kTransformedFeatureDimensions = 256;
+    // Number of input feature dimensions after conversion
+    constexpr IndexType kTransformedFeatureDimensions = 256;
 
-namespace Layers {
+    namespace Layers {
 
-// Define network structure
-using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+        // Define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
 
-}  // namespace Layers
+    }  // namespace Layers
 
-using Network = Layers::OutputLayer;
+    using Network = Layers::OutputLayer;
 
 }  // namespace Eval::NNUE
 
diff --git a/src/nnue/architectures/halfkp-cr_256x2-32-32.h b/src/nnue/architectures/halfkp-cr_256x2-32-32.h
index df14f499..dd587d1d 100644
--- a/src/nnue/architectures/halfkp-cr_256x2-32-32.h
+++ b/src/nnue/architectures/halfkp-cr_256x2-32-32.h
@@ -3,34 +3,34 @@
 #ifndef NNUE_HALFKP_CR_256X2_32_32_H_INCLUDED
 #define NNUE_HALFKP_CR_256X2_32_32_H_INCLUDED
 
-#include "../features/feature_set.h"
-#include "../features/half_kp.h"
-#include "../features/castling_right.h"
+#include "nnue/features/feature_set.h"
+#include "nnue/features/half_kp.h"
+#include "nnue/features/castling_right.h"
 
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"
 
 namespace Eval::NNUE {
 
-// Input features used in evaluation function
-using RawFeatures = Features::FeatureSet<
-    Features::HalfKP<Features::Side::kFriend>, Features::CastlingRight>;
+    // Input features used in evaluation function
+    using RawFeatures = Features::FeatureSet<
+        Features::HalfKP<Features::Side::kFriend>, Features::CastlingRight>;
 
-// Number of input feature dimensions after conversion
-constexpr IndexType kTransformedFeatureDimensions = 256;
+    // Number of input feature dimensions after conversion
+    constexpr IndexType kTransformedFeatureDimensions = 256;
 
-namespace Layers {
+    namespace Layers {
 
-// Define network structure
-using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+        // Define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
 
-}  // namespace Layers
+    }  // namespace Layers
 
-using Network = Layers::OutputLayer;
+    using Network = Layers::OutputLayer;
 
 }  // namespace Eval::NNUE
 
diff --git a/src/nnue/architectures/halfkp_256x2-32-32.h b/src/nnue/architectures/halfkp_256x2-32-32.h
index 9216bd41..333feb83 100644
--- a/src/nnue/architectures/halfkp_256x2-32-32.h
+++ b/src/nnue/architectures/halfkp_256x2-32-32.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // Definition of input features and network structure used in NNUE evaluation function
@@ -21,33 +21,33 @@
 #ifndef NNUE_HALFKP_256X2_32_32_H_INCLUDED
 #define NNUE_HALFKP_256X2_32_32_H_INCLUDED
 
-#include "../features/feature_set.h"
-#include "../features/half_kp.h"
+#include "nnue/features/feature_set.h"
+#include "nnue/features/half_kp.h"
 
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"
 
 namespace Eval::NNUE {
 
-// Input features used in evaluation function
-using RawFeatures = Features::FeatureSet<
-    Features::HalfKP<Features::Side::kFriend>>;
+    // Input features used in evaluation function
+    using RawFeatures = Features::FeatureSet<
+        Features::HalfKP<Features::Side::kFriend>>;
 
-// Number of input feature dimensions after conversion
-constexpr IndexType kTransformedFeatureDimensions = 256;
+    // Number of input feature dimensions after conversion
+    constexpr IndexType kTransformedFeatureDimensions = 256;
 
-namespace Layers {
+    namespace Layers {
 
-// Define network structure
-using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+        // Define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
 
-}  // namespace Layers
+    }  // namespace Layers
 
-using Network = Layers::OutputLayer;
+    using Network = Layers::OutputLayer;
 
 }  // namespace Eval::NNUE
 
diff --git a/src/nnue/architectures/halfkp_384x2-32-32.h b/src/nnue/architectures/halfkp_384x2-32-32.h
index 3d28139a..96913295 100644
--- a/src/nnue/architectures/halfkp_384x2-32-32.h
+++ b/src/nnue/architectures/halfkp_384x2-32-32.h
@@ -3,37 +3,33 @@
 #ifndef HALFKP_384X2_32_32_H
 #define HALFKP_384X2_32_32_H
 
-#include "../features/feature_set.h"
-#include "../features/half_kp.h"
+#include "nnue/features/feature_set.h"
+#include "nnue/features/half_kp.h"
 
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"
 
-namespace Eval {
+namespace Eval::NNUE {
 
-namespace NNUE {
+    // Input features used in evaluation function
+    using RawFeatures = Features::FeatureSet<
+        Features::HalfKP<Features::Side::kFriend>>;
 
-// Input features used in evaluation function
-using RawFeatures = Features::FeatureSet<
-    Features::HalfKP<Features::Side::kFriend>>;
+    // Number of input feature dimensions after conversion
+    constexpr IndexType kTransformedFeatureDimensions = 384;
 
-// Number of input feature dimensions after conversion
-constexpr IndexType kTransformedFeatureDimensions = 384;
+    namespace Layers {
 
-namespace Layers {
+        // define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
 
-// define network structure
-using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+    }  // namespace Layers
 
-}  // namespace Layers
+    using Network = Layers::OutputLayer;
 
-using Network = Layers::OutputLayer;
-
-}  // namespace NNUE
-
-}  // namespace Eval
+}  // namespace Eval::NNUE
 #endif // HALFKP_384X2_32_32_H
diff --git a/src/nnue/architectures/k-p-cr-ep_256x2-32-32.h b/src/nnue/architectures/k-p-cr-ep_256x2-32-32.h
index e178b57b..14eeba54 100644
--- a/src/nnue/architectures/k-p-cr-ep_256x2-32-32.h
+++ b/src/nnue/architectures/k-p-cr-ep_256x2-32-32.h
@@ -3,40 +3,36 @@
 #ifndef K_P_CR_EP_256X2_32_32_H
 #define K_P_CR_EP_256X2_32_32_H
 
-#include "../features/feature_set.h"
-#include "../features/k.h"
-#include "../features/p.h"
-#include "../features/castling_right.h"
-#include "../features/enpassant.h"
+#include "nnue/features/feature_set.h"
+#include "nnue/features/k.h"
+#include "nnue/features/p.h"
+#include "nnue/features/castling_right.h"
+#include "nnue/features/enpassant.h"
 
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"
 
-namespace Eval {
-
-  namespace NNUE {
+namespace Eval::NNUE {
 
     // Input features used in evaluation function
     using RawFeatures = Features::FeatureSet<Features::K, Features::P,
-      Features::CastlingRight, Features::EnPassant>;
+        Features::CastlingRight, Features::EnPassant>;
 
     // Number of input feature dimensions after conversion
     constexpr IndexType kTransformedFeatureDimensions = 256;
 
     namespace Layers {
 
-      // define network structure
-      using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-      using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-      using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-      using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+        // define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
 
     }  // namespace Layers
 
     using Network = Layers::OutputLayer;
 
-  }  // namespace NNUE
-
-}  // namespace Eval
+}  // namespace Eval::NNUE
 #endif // K_P_CR_EP_256X2_32_32_H
diff --git a/src/nnue/architectures/k-p-cr_256x2-32-32.h b/src/nnue/architectures/k-p-cr_256x2-32-32.h
index d3c187c0..1db34b22 100644
--- a/src/nnue/architectures/k-p-cr_256x2-32-32.h
+++ b/src/nnue/architectures/k-p-cr_256x2-32-32.h
@@ -3,39 +3,35 @@
 #ifndef K_P_CR_256X2_32_32_H
 #define K_P_CR_256X2_32_32_H
 
-#include "../features/feature_set.h"
-#include "../features/k.h"
-#include "../features/p.h"
-#include "../features/castling_right.h"
+#include "nnue/features/feature_set.h"
+#include "nnue/features/k.h"
+#include "nnue/features/p.h"
+#include "nnue/features/castling_right.h"
 
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"
 
-namespace Eval {
-
-  namespace NNUE {
+namespace Eval::NNUE {
 
     // Input features used in evaluation function
     using RawFeatures = Features::FeatureSet<Features::K, Features::P,
-      Features::CastlingRight>;
+        Features::CastlingRight>;
 
     // Number of input feature dimensions after conversion
     constexpr IndexType kTransformedFeatureDimensions = 256;
 
     namespace Layers {
 
-      // define network structure
-      using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-      using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-      using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-      using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+        // define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
 
     }  // namespace Layers
 
     using Network = Layers::OutputLayer;
 
-  }  // namespace NNUE
-
-}  // namespace Eval
+}  // namespace Eval::NNUE
 #endif // K_P_CR_256X2_32_32_H
diff --git a/src/nnue/architectures/k-p_256x2-32-32.h b/src/nnue/architectures/k-p_256x2-32-32.h
index 0f340dee..92c9efcd 100644
--- a/src/nnue/architectures/k-p_256x2-32-32.h
+++ b/src/nnue/architectures/k-p_256x2-32-32.h
@@ -3,37 +3,33 @@
 #ifndef K_P_256X2_32_32_H
 #define K_P_256X2_32_32_H
 
-#include "../features/feature_set.h"
-#include "../features/k.h"
-#include "../features/p.h"
+#include "nnue/features/feature_set.h"
+#include "nnue/features/k.h"
+#include "nnue/features/p.h"
 
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"
 
-namespace Eval {
+namespace Eval::NNUE {
 
-namespace NNUE {
+    // Input features used in evaluation function
+    using RawFeatures = Features::FeatureSet<Features::K, Features::P>;
 
-// Input features used in evaluation function
-using RawFeatures = Features::FeatureSet<Features::K, Features::P>;
+    // Number of input feature dimensions after conversion
+    constexpr IndexType kTransformedFeatureDimensions = 256;
 
-// Number of input feature dimensions after conversion
-constexpr IndexType kTransformedFeatureDimensions = 256;
+    namespace Layers {
 
-namespace Layers {
+        // define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
 
-// define network structure
-using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+    }  // namespace Layers
 
-}  // namespace Layers
+    using Network = Layers::OutputLayer;
 
-using Network = Layers::OutputLayer;
-
-}  // namespace NNUE
-
-}  // namespace Eval
+}  // namespace Eval::NNUE
 #endif // K_P_256X2_32_32_H

From 0d4c3014caf06ee4382e2600264051c3dacc11a9 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 20:44:47 +0200
Subject: [PATCH 208/398] Cleanup features.

---
 src/nnue/features/castling_right.cpp   |  94 ++---
 src/nnue/features/castling_right.h     |  48 +--
 src/nnue/features/enpassant.cpp        |  61 ++--
 src/nnue/features/enpassant.h          |  44 +--
 src/nnue/features/feature_set.h        | 470 +++++++++++++------------
 src/nnue/features/features_common.h    |  59 ++--
 src/nnue/features/half_kp.cpp          | 114 +++---
 src/nnue/features/half_kp.h            |  94 ++---
 src/nnue/features/half_relative_kp.cpp | 128 +++----
 src/nnue/features/half_relative_kp.h   |  92 ++---
 src/nnue/features/index_list.h         |  84 ++---
 src/nnue/features/k.cpp                |  67 ++--
 src/nnue/features/k.h                  |  62 ++--
 src/nnue/features/p.cpp                |  79 ++---
 src/nnue/features/p.h                  |  62 ++--
 15 files changed, 797 insertions(+), 761 deletions(-)

diff --git a/src/nnue/features/castling_right.cpp b/src/nnue/features/castling_right.cpp
index 2b3f3209..eb8a36a1 100644
--- a/src/nnue/features/castling_right.cpp
+++ b/src/nnue/features/castling_right.cpp
@@ -1,60 +1,60 @@
-//Definition of input feature quantity CastlingRight of NNUE evaluation function
-
 #include "castling_right.h"
 #include "index_list.h"
 
+//Definition of input feature quantity CastlingRight of NNUE evaluation function
 namespace Eval::NNUE::Features {
 
-  // Get a list of indices with a value of 1 among the features
-  void CastlingRight::AppendActiveIndices(
-    const Position& pos, Color perspective, IndexList* active) {
-    // do nothing if array size is small to avoid compiler warning
-    if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
+    // Get a list of indices with a value of 1 among the features
+    void CastlingRight::AppendActiveIndices(
+        const Position& pos, Color perspective, IndexList* active) {
+        // do nothing if array size is small to avoid compiler warning
+        if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
 
-    int castling_rights = pos.state()->castlingRights;
-    int relative_castling_rights;
-    if (perspective == WHITE) {
-      relative_castling_rights = castling_rights;
-    }
-    else {
-      // Invert the perspective.
-      relative_castling_rights = ((castling_rights & 3) << 2)
-        & ((castling_rights >> 2) & 3);
+        int castling_rights = pos.state()->castlingRights;
+        int relative_castling_rights;
+        if (perspective == WHITE) {
+            relative_castling_rights = castling_rights;
+        }
+        else {
+            // Invert the perspective.
+            relative_castling_rights = ((castling_rights & 3) << 2)
+                & ((castling_rights >> 2) & 3);
+        }
+
+        for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
+            if (relative_castling_rights & (1 << i)) {
+                active->push_back(i);
+            }
+        }
     }
 
-    for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
-      if (relative_castling_rights & (1 << i)) {
-        active->push_back(i);
-      }
-    }
-  }
+    // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+    void CastlingRight::AppendChangedIndices(
+        const Position& pos, Color perspective,
+        IndexList* removed, IndexList* /* added */) {
 
-  // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-  void CastlingRight::AppendChangedIndices(
-      const Position& pos, Color perspective,
-      IndexList* removed, IndexList* /* added */) {
-    int previous_castling_rights = pos.state()->previous->castlingRights;
-    int current_castling_rights = pos.state()->castlingRights;
-    int relative_previous_castling_rights;
-    int relative_current_castling_rights;
-    if (perspective == WHITE) {
-      relative_previous_castling_rights = previous_castling_rights;
-      relative_current_castling_rights = current_castling_rights;
-    }
-    else {
-      // Invert the perspective.
-      relative_previous_castling_rights = ((previous_castling_rights & 3) << 2)
-        & ((previous_castling_rights >> 2) & 3);
-      relative_current_castling_rights = ((current_castling_rights & 3) << 2)
-        & ((current_castling_rights >> 2) & 3);
-    }
+        int previous_castling_rights = pos.state()->previous->castlingRights;
+        int current_castling_rights = pos.state()->castlingRights;
+        int relative_previous_castling_rights;
+        int relative_current_castling_rights;
+        if (perspective == WHITE) {
+            relative_previous_castling_rights = previous_castling_rights;
+            relative_current_castling_rights = current_castling_rights;
+        }
+        else {
+            // Invert the perspective.
+            relative_previous_castling_rights = ((previous_castling_rights & 3) << 2)
+                & ((previous_castling_rights >> 2) & 3);
+            relative_current_castling_rights = ((current_castling_rights & 3) << 2)
+                & ((current_castling_rights >> 2) & 3);
+        }
 
-    for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
-      if ((relative_previous_castling_rights & (1 << i)) &&
-        (relative_current_castling_rights & (1 << i)) == 0) {
-        removed->push_back(i);
-      }
+        for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
+            if ((relative_previous_castling_rights & (1 << i)) &&
+                (relative_current_castling_rights & (1 << i)) == 0) {
+                removed->push_back(i);
+            }
+        }
     }
-  }
 
 }  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/castling_right.h b/src/nnue/features/castling_right.h
index 2d8c5322..3e35e432 100644
--- a/src/nnue/features/castling_right.h
+++ b/src/nnue/features/castling_right.h
@@ -1,34 +1,38 @@
-//Definition of input feature quantity CastlingRight of NNUE evaluation function
-
 #ifndef _NNUE_FEATURES_CASTLING_RIGHT_H_
 #define _NNUE_FEATURES_CASTLING_RIGHT_H_
 
-#include "../../evaluate.h"
 #include "features_common.h"
 
+#include "evaluate.h"
+
+//Definition of input feature quantity CastlingRight of NNUE evaluation function
 namespace Eval::NNUE::Features {
 
-  class CastlingRight {
-  public:
-    // feature quantity name
-    static constexpr const char* kName = "CastlingRight";
-    // Hash value embedded in the evaluation function file
-    static constexpr std::uint32_t kHashValue = 0x913968AAu;
-    // number of feature dimensions
-    static constexpr IndexType kDimensions = 4;
-    // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-    static constexpr IndexType kMaxActiveDimensions = 4;
-    // Timing of full calculation instead of difference calculation
-    static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+    class CastlingRight {
+    public:
+        // feature quantity name
+        static constexpr const char* kName = "CastlingRight";
 
-    // Get a list of indices with a value of 1 among the features
-    static void AppendActiveIndices(const Position& pos, Color perspective,
-      IndexList* active);
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue = 0x913968AAu;
 
-    // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-    static void AppendChangedIndices(const Position& pos, Color perspective,
-      IndexList* removed, IndexList* added);
-  };
+        // number of feature dimensions
+        static constexpr IndexType kDimensions = 4;
+
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions = 4;
+
+        // Timing of full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+
+        // Get a list of indices with a value of 1 among the features
+        static void AppendActiveIndices(const Position& pos, Color perspective,
+            IndexList* active);
+
+        // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+        static void AppendChangedIndices(const Position& pos, Color perspective,
+            IndexList* removed, IndexList* added);
+    };
 
 }  // namespace Eval::NNUE::Features
 
diff --git a/src/nnue/features/enpassant.cpp b/src/nnue/features/enpassant.cpp
index e5ceed5c..7aa8988b 100644
--- a/src/nnue/features/enpassant.cpp
+++ b/src/nnue/features/enpassant.cpp
@@ -1,42 +1,45 @@
-//Definition of input feature quantity EnPassant of NNUE evaluation function
-
 #include "enpassant.h"
 #include "index_list.h"
 
+//Definition of input feature quantity EnPassant of NNUE evaluation function
 namespace Eval::NNUE::Features {
 
-  // Get a list of indices with a value of 1 among the features
-  void EnPassant::AppendActiveIndices(
-    const Position& pos, Color /* perspective */, IndexList* active) {
-    // do nothing if array size is small to avoid compiler warning
-    if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
+    // Get a list of indices with a value of 1 among the features
+    void EnPassant::AppendActiveIndices(
+        const Position& pos, Color /* perspective */, IndexList* active) {
 
-    auto epSquare = pos.state()->epSquare;
-    if (epSquare == SQ_NONE) {
-      return;
+        // do nothing if array size is small to avoid compiler warning
+        if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions)
+            return;
+
+        auto epSquare = pos.state()->epSquare;
+        if (epSquare == SQ_NONE)
+            return;
+
+        auto file = file_of(epSquare);
+        active->push_back(file);
     }
-    auto file = file_of(epSquare);
-    active->push_back(file);
-  }
 
-  // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-  void EnPassant::AppendChangedIndices(
-      const Position& pos, Color /* perspective */,
-      IndexList* removed, IndexList* added) {
+    // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+    void EnPassant::AppendChangedIndices(
+        const Position& pos, Color /* perspective */,
+        IndexList* removed, IndexList* added) {
 
-    auto previous_epSquare = pos.state()->previous->epSquare;
-    auto epSquare = pos.state()->epSquare;
+        auto previous_epSquare = pos.state()->previous->epSquare;
+        auto epSquare = pos.state()->epSquare;
 
-    if (previous_epSquare != SQ_NONE) {
-      if (epSquare != SQ_NONE && file_of(epSquare) == file_of(previous_epSquare))
-        return;
-      auto file = file_of(previous_epSquare);
-      removed->push_back(file);
+        if (previous_epSquare != SQ_NONE) {
+            if (epSquare != SQ_NONE && file_of(epSquare) == file_of(previous_epSquare))
+                return;
+
+            auto file = file_of(previous_epSquare);
+            removed->push_back(file);
+        }
+
+        if (epSquare != SQ_NONE) {
+            auto file = file_of(epSquare);
+            added->push_back(file);
+        }
     }
-    if (epSquare != SQ_NONE) {
-      auto file = file_of(epSquare);
-      added->push_back(file);
-    }
-  }
 
 }  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/enpassant.h b/src/nnue/features/enpassant.h
index 065e74a0..65819a96 100644
--- a/src/nnue/features/enpassant.h
+++ b/src/nnue/features/enpassant.h
@@ -1,34 +1,34 @@
-//Definition of input feature quantity EnPassant of NNUE evaluation function
-
 #ifndef _NNUE_FEATURES_ENPASSANT_H_
 #define _NNUE_FEATURES_ENPASSANT_H_
 
-#include "../../evaluate.h"
 #include "features_common.h"
 
+#include "evaluate.h"
+
+//Definition of input feature quantity EnPassant of NNUE evaluation function
 namespace Eval::NNUE::Features {
 
-  class EnPassant {
-  public:
-    // feature quantity name
-    static constexpr const char* kName = "EnPassant";
-    // Hash value embedded in the evaluation function file
-    static constexpr std::uint32_t kHashValue = 0x02924F91u;
-    // number of feature dimensions
-    static constexpr IndexType kDimensions = 8;
-    // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-    static constexpr IndexType kMaxActiveDimensions = 1;
-    // Timing of full calculation instead of difference calculation
-    static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+    class EnPassant {
+    public:
+        // feature quantity name
+        static constexpr const char* kName = "EnPassant";
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue = 0x02924F91u;
+        // number of feature dimensions
+        static constexpr IndexType kDimensions = 8;
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions = 1;
+        // Timing of full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
-    // Get a list of indices with a value of 1 among the features
-    static void AppendActiveIndices(const Position& pos, Color perspective,
-      IndexList* active);
+        // Get a list of indices with a value of 1 among the features
+        static void AppendActiveIndices(const Position& pos, Color perspective,
+            IndexList* active);
 
-    // Get a list of indices whose values have changed from the previous one in the feature quantity
-    static void AppendChangedIndices(const Position& pos, Color perspective,
-      IndexList* removed, IndexList* added);
-  };
+        // Get a list of indices whose values have changed from the previous one in the feature quantity
+        static void AppendChangedIndices(const Position& pos, Color perspective,
+            IndexList* removed, IndexList* added);
+    };
 
 }  // namespace Eval::NNUE::Features
 
diff --git a/src/nnue/features/feature_set.h b/src/nnue/features/feature_set.h
index a057142c..5b243424 100644
--- a/src/nnue/features/feature_set.h
+++ b/src/nnue/features/feature_set.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // A class template that represents the input feature set of the NNUE evaluation function
@@ -22,238 +22,266 @@
 #define NNUE_FEATURE_SET_H_INCLUDED
 
 #include "features_common.h"
+
 #include <array>
 
 namespace Eval::NNUE::Features {
 
-  // Class template that represents a list of values
-  template <typename T, T... Values>
-  struct CompileTimeList;
+    // Class template that represents a list of values
+    template <typename T, T... Values>
+    struct CompileTimeList;
 
-  template <typename T, T First, T... Remaining>
-  struct CompileTimeList<T, First, Remaining...> {
-    static constexpr bool Contains(T value) {
-      return value == First || CompileTimeList<T, Remaining...>::Contains(value);
-    }
-    static constexpr std::array<T, sizeof...(Remaining) + 1>
-        kValues = {{First, Remaining...}};
-  };
-
-  template <typename T, T First, T... Remaining>
-  constexpr std::array<T, sizeof...(Remaining) + 1>
-    CompileTimeList<T, First, Remaining...>::kValues;
-  template <typename T>
-  struct CompileTimeList<T> {
-    static constexpr bool Contains(T /*value*/) {
-      return false;
-    }
-    static constexpr std::array<T, 0> kValues = { {} };
-  };
-
-  // Class template that adds to the beginning of the list
-  template <typename T, typename ListType, T Value>
-  struct AppendToList;
-  template <typename T, T... Values, T AnotherValue>
-  struct AppendToList<T, CompileTimeList<T, Values...>, AnotherValue> {
-    using Result = CompileTimeList<T, AnotherValue, Values...>;
-  };
-
-  // Class template for adding to a sorted, unique list
-  template <typename T, typename ListType, T Value>
-  struct InsertToSet;
-  template <typename T, T First, T... Remaining, T AnotherValue>
-  struct InsertToSet<T, CompileTimeList<T, First, Remaining...>, AnotherValue> {
-    using Result = std::conditional_t<
-      CompileTimeList<T, First, Remaining...>::Contains(AnotherValue),
-      CompileTimeList<T, First, Remaining...>,
-      std::conditional_t<(AnotherValue < First),
-      CompileTimeList<T, AnotherValue, First, Remaining...>,
-      typename AppendToList<T, typename InsertToSet<
-      T, CompileTimeList<T, Remaining...>, AnotherValue>::Result,
-      First>::Result>>;
-  };
-  template <typename T, T Value>
-  struct InsertToSet<T, CompileTimeList<T>, Value> {
-    using Result = CompileTimeList<T, Value>;
-  };
-
-  // Base class of feature set
-  template <typename Derived>
-  class FeatureSetBase {
-
-   public:
-    // Get a list of indices for active features
-    template <typename IndexListType>
-    static void AppendActiveIndices(
-        const Position& pos, TriggerEvent trigger, IndexListType active[2]) {
-
-      for (Color perspective : { WHITE, BLACK }) {
-        Derived::CollectActiveIndices(
-            pos, trigger, perspective, &active[perspective]);
-      }
-    }
-
-    // Get a list of indices for recently changed features
-    template <typename PositionType, typename IndexListType>
-    static void AppendChangedIndices(
-        const PositionType& pos, TriggerEvent trigger,
-        IndexListType removed[2], IndexListType added[2], bool reset[2]) {
-
-      const auto& dp = pos.state()->dirtyPiece;
-
-      for (Color perspective : { WHITE, BLACK }) {
-        switch (trigger) {
-          case TriggerEvent::kNone:
-            break;
-          case TriggerEvent::kFriendKingMoved:
-            if (dp.dirty_num == 0) continue;
-            reset[perspective] = dp.piece[0] == make_piece(perspective, KING);
-            break;
-          case TriggerEvent::kEnemyKingMoved:
-            if (dp.dirty_num == 0) continue;
-            reset[perspective] = dp.piece[0] == make_piece(~perspective, KING);
-            break;
-          case TriggerEvent::kAnyKingMoved:
-            if (dp.dirty_num == 0) continue;
-            reset[perspective] = type_of(dp.piece[0]) == KING;
-            break;
-          case TriggerEvent::kAnyPieceMoved:
-            reset[perspective] = true;
-            break;
-          default:
-            assert(false);
-            break;
+    template <typename T, T First, T... Remaining>
+    struct CompileTimeList<T, First, Remaining...> {
+        static constexpr bool Contains(T value) {
+            return value == First || CompileTimeList<T, Remaining...>::Contains(value);
         }
-        if (reset[perspective]) {
-          Derived::CollectActiveIndices(
-              pos, trigger, perspective, &added[perspective]);
-        } else {
-          Derived::CollectChangedIndices(
-              pos, trigger, perspective,
-              &removed[perspective], &added[perspective]);
+
+        static constexpr std::array<T, sizeof...(Remaining) + 1>
+            kValues = {{First, Remaining...}};
+    };
+
+    template <typename T, T First, T... Remaining>
+    constexpr std::array<T, sizeof...(Remaining) + 1>
+        CompileTimeList<T, First, Remaining...>::kValues;
+
+    template <typename T>
+    struct CompileTimeList<T> {
+        static constexpr bool Contains(T /*value*/) {
+            return false;
         }
-      }
-    }
-  };
+        static constexpr std::array<T, 0> kValues = { {} };
+    };
 
-  // Class template that represents the feature set
-  // do internal processing in reverse order of template arguments in order to linearize the amount of calculation at runtime
-  template <typename FirstFeatureType, typename... RemainingFeatureTypes>
-  class FeatureSet<FirstFeatureType, RemainingFeatureTypes...> :
-    public FeatureSetBase<
-    FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
-  private:
-    using Head = FirstFeatureType;
-    using Tail = FeatureSet<RemainingFeatureTypes...>;
+    // Class template that adds to the beginning of the list
+    template <typename T, typename ListType, T Value>
+    struct AppendToList;
 
-  public:
-    // Hash value embedded in the evaluation function file
-    static constexpr std::uint32_t kHashValue =
-      Head::kHashValue ^ (Tail::kHashValue << 1) ^ (Tail::kHashValue >> 31);
-    // number of feature dimensions
-    static constexpr IndexType kDimensions =
-      Head::kDimensions + Tail::kDimensions;
-    // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-    static constexpr IndexType kMaxActiveDimensions =
-      Head::kMaxActiveDimensions + Tail::kMaxActiveDimensions;
-    // List of timings to perform all calculations instead of difference calculation
-    using SortedTriggerSet = typename InsertToSet<TriggerEvent,
-      typename Tail::SortedTriggerSet, Head::kRefreshTrigger>::Result;
-    static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+    template <typename T, T... Values, T AnotherValue>
+    struct AppendToList<T, CompileTimeList<T, Values...>, AnotherValue> {
+        using Result = CompileTimeList<T, AnotherValue, Values...>;
+    };
 
-    // Get the feature quantity name
-    static std::string GetName() {
-      return std::string(Head::kName) + "+" + Tail::GetName();
-    }
+    // Class template for adding to a sorted, unique list
+    template <typename T, typename ListType, T Value>
+    struct InsertToSet;
 
-  private:
-    // Get a list of indices with a value of 1 among the features
-    template <typename IndexListType>
-    static void CollectActiveIndices(
-      const Position& pos, const TriggerEvent trigger, const Color perspective,
-      IndexListType* const active) {
-      Tail::CollectActiveIndices(pos, trigger, perspective, active);
-      if (Head::kRefreshTrigger == trigger) {
-        const auto start = active->size();
-        Head::AppendActiveIndices(pos, perspective, active);
-        for (auto i = start; i < active->size(); ++i) {
-          (*active)[i] += Tail::kDimensions;
+    template <typename T, T First, T... Remaining, T AnotherValue>
+    struct InsertToSet<T, CompileTimeList<T, First, Remaining...>, AnotherValue> {
+        using Result =
+            std::conditional_t<
+                CompileTimeList<T, First, Remaining...>::Contains(AnotherValue),
+                CompileTimeList<T, First, Remaining...>,
+                std::conditional_t<
+                    (AnotherValue < First),
+                    CompileTimeList<T, AnotherValue, First, Remaining...>,
+                    typename AppendToList<T, typename InsertToSet<
+                        T, CompileTimeList<T, Remaining...>, AnotherValue>::Result,
+                        First
+                    >::Result
+                >
+            >;
+    };
+
+    template <typename T, T Value>
+    struct InsertToSet<T, CompileTimeList<T>, Value> {
+        using Result = CompileTimeList<T, Value>;
+    };
+
+    // Base class of feature set
+    template <typename Derived>
+    class FeatureSetBase {
+
+       public:
+        // Get a list of indices for active features
+        template <typename IndexListType>
+        static void AppendActiveIndices(
+            const Position& pos, TriggerEvent trigger, IndexListType active[2]) {
+
+            for (Color perspective : { WHITE, BLACK }) {
+                Derived::CollectActiveIndices(
+                    pos, trigger, perspective, &active[perspective]);
+            }
         }
-      }
-    }
 
-    // Get a list of indices whose values have changed from the previous one in the feature quantity
-    template <typename IndexListType>
-    static void CollectChangedIndices(
-      const Position& pos, const TriggerEvent trigger, const Color perspective,
-      IndexListType* const removed, IndexListType* const added) {
-      Tail::CollectChangedIndices(pos, trigger, perspective, removed, added);
-      if (Head::kRefreshTrigger == trigger) {
-        const auto start_removed = removed->size();
-        const auto start_added = added->size();
-        Head::AppendChangedIndices(pos, perspective, removed, added);
-        for (auto i = start_removed; i < removed->size(); ++i) {
-          (*removed)[i] += Tail::kDimensions;
+        // Get a list of indices for recently changed features
+        template <typename PositionType, typename IndexListType>
+        static void AppendChangedIndices(
+            const PositionType& pos, TriggerEvent trigger,
+            IndexListType removed[2], IndexListType added[2], bool reset[2]) {
+
+            const auto& dp = pos.state()->dirtyPiece;
+
+            for (Color perspective : { WHITE, BLACK }) {
+                switch (trigger) {
+                    case TriggerEvent::kNone:
+                        break;
+                    case TriggerEvent::kFriendKingMoved:
+                        if (dp.dirty_num == 0) continue;
+                        reset[perspective] = dp.piece[0] == make_piece(perspective, KING);
+                        break;
+                    case TriggerEvent::kEnemyKingMoved:
+                        if (dp.dirty_num == 0) continue;
+                        reset[perspective] = dp.piece[0] == make_piece(~perspective, KING);
+                        break;
+                    case TriggerEvent::kAnyKingMoved:
+                        if (dp.dirty_num == 0) continue;
+                        reset[perspective] = type_of(dp.piece[0]) == KING;
+                        break;
+                    case TriggerEvent::kAnyPieceMoved:
+                        reset[perspective] = true;
+                        break;
+                    default:
+                        assert(false);
+                        break;
+                }
+
+                if (reset[perspective]) {
+                    Derived::CollectActiveIndices(
+                        pos, trigger, perspective, &added[perspective]);
+                } else {
+                    Derived::CollectChangedIndices(
+                        pos, trigger, perspective,
+                        &removed[perspective], &added[perspective]);
+                }
+            }
         }
-        for (auto i = start_added; i < added->size(); ++i) {
-          (*added)[i] += Tail::kDimensions;
+    };
+
+    // Class template that represents the feature set
+    // do internal processing in reverse order of template arguments in order to linearize the amount of calculation at runtime
+    template <typename FirstFeatureType, typename... RemainingFeatureTypes>
+    class FeatureSet<FirstFeatureType, RemainingFeatureTypes...> :
+      public FeatureSetBase<
+          FeatureSet<FirstFeatureType, RemainingFeatureTypes...>
+      > {
+
+    private:
+        using Head = FirstFeatureType;
+        using Tail = FeatureSet<RemainingFeatureTypes...>;
+
+    public:
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue =
+            Head::kHashValue ^ (Tail::kHashValue << 1) ^ (Tail::kHashValue >> 31);
+
+        // number of feature dimensions
+        static constexpr IndexType kDimensions =
+            Head::kDimensions + Tail::kDimensions;
+
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions =
+            Head::kMaxActiveDimensions + Tail::kMaxActiveDimensions;
+
+        // List of timings to perform all calculations instead of difference calculation
+        using SortedTriggerSet = typename InsertToSet<TriggerEvent,
+            typename Tail::SortedTriggerSet, Head::kRefreshTrigger>::Result;
+
+        static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+
+        // Get the feature quantity name
+        static std::string GetName() {
+            return std::string(Head::kName) + "+" + Tail::GetName();
         }
-      }
-    }
 
-    // Make the base class and the class template that recursively uses itself a friend
-    friend class FeatureSetBase<FeatureSet>;
-    template <typename... FeatureTypes>
-    friend class FeatureSet;
-  };
+    private:
+        // Get a list of indices with a value of 1 among the features
+        template <typename IndexListType>
+        static void CollectActiveIndices(
+            const Position& pos, const TriggerEvent trigger, const Color perspective,
+            IndexListType* const active) {
+            Tail::CollectActiveIndices(pos, trigger, perspective, active);
+            if (Head::kRefreshTrigger == trigger) {
+                const auto start = active->size();
+                Head::AppendActiveIndices(pos, perspective, active);
 
-  // Class template that represents the feature set
-  template <typename FeatureType>
-  class FeatureSet<FeatureType> : public FeatureSetBase<FeatureSet<FeatureType>> {
+                for (auto i = start; i < active->size(); ++i) {
+                    (*active)[i] += Tail::kDimensions;
+                }
+            }
+        }
 
-   public:
-    // Hash value embedded in the evaluation file
-    static constexpr std::uint32_t kHashValue = FeatureType::kHashValue;
-    // Number of feature dimensions
-    static constexpr IndexType kDimensions = FeatureType::kDimensions;
-    // Maximum number of simultaneously active features
-    static constexpr IndexType kMaxActiveDimensions =
-        FeatureType::kMaxActiveDimensions;
-    // Trigger for full calculation instead of difference calculation
-    using SortedTriggerSet =
-        CompileTimeList<TriggerEvent, FeatureType::kRefreshTrigger>;
-    static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+        // Get a list of indices whose values have changed from the previous one in the feature quantity
+        template <typename IndexListType>
+        static void CollectChangedIndices(
+            const Position& pos, const TriggerEvent trigger, const Color perspective,
+            IndexListType* const removed, IndexListType* const added) {
+            Tail::CollectChangedIndices(pos, trigger, perspective, removed, added);
+            if (Head::kRefreshTrigger == trigger) {
+                const auto start_removed = removed->size();
+                const auto start_added = added->size();
+                Head::AppendChangedIndices(pos, perspective, removed, added);
 
-    // Get the feature quantity name
-    static std::string GetName() {
-      return FeatureType::kName;
-    }
+                for (auto i = start_removed; i < removed->size(); ++i) {
+                    (*removed)[i] += Tail::kDimensions;
+                }
 
-   private:
-    // Get a list of indices for active features
-    static void CollectActiveIndices(
-        const Position& pos, const TriggerEvent trigger, const Color perspective,
-        IndexList* const active) {
-      if (FeatureType::kRefreshTrigger == trigger) {
-        FeatureType::AppendActiveIndices(pos, perspective, active);
-      }
-    }
+                for (auto i = start_added; i < added->size(); ++i) {
+                    (*added)[i] += Tail::kDimensions;
+                }
+            }
+        }
 
-    // Get a list of indices for recently changed features
-    static void CollectChangedIndices(
-        const Position& pos, const TriggerEvent trigger, const Color perspective,
-        IndexList* const removed, IndexList* const added) {
+        // Make the base class and the class template that recursively uses itself a friend
+        friend class FeatureSetBase<FeatureSet>;
 
-      if (FeatureType::kRefreshTrigger == trigger) {
-        FeatureType::AppendChangedIndices(pos, perspective, removed, added);
-      }
-    }
+        template <typename... FeatureTypes>
+        friend class FeatureSet;
+    };
 
-    // Make the base class and the class template that recursively uses itself a friend
-    friend class FeatureSetBase<FeatureSet>;
-    template <typename... FeatureTypes>
-    friend class FeatureSet;
-  };
+    // Class template that represents the feature set
+    template <typename FeatureType>
+    class FeatureSet<FeatureType> : public FeatureSetBase<FeatureSet<FeatureType>> {
+
+    public:
+        // Hash value embedded in the evaluation file
+        static constexpr std::uint32_t kHashValue = FeatureType::kHashValue;
+
+        // Number of feature dimensions
+        static constexpr IndexType kDimensions = FeatureType::kDimensions;
+
+        // Maximum number of simultaneously active features
+        static constexpr IndexType kMaxActiveDimensions =
+            FeatureType::kMaxActiveDimensions;
+
+        // Trigger for full calculation instead of difference calculation
+        using SortedTriggerSet =
+            CompileTimeList<TriggerEvent, FeatureType::kRefreshTrigger>;
+
+        static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+
+        // Get the feature quantity name
+        static std::string GetName() {
+            return FeatureType::kName;
+        }
+
+    private:
+        // Get a list of indices for active features
+        static void CollectActiveIndices(
+            const Position& pos, const TriggerEvent trigger, const Color perspective,
+            IndexList* const active) {
+
+            if (FeatureType::kRefreshTrigger == trigger) {
+              FeatureType::AppendActiveIndices(pos, perspective, active);
+            }
+        }
+
+        // Get a list of indices for recently changed features
+        static void CollectChangedIndices(
+            const Position& pos, const TriggerEvent trigger, const Color perspective,
+            IndexList* const removed, IndexList* const added) {
+
+            if (FeatureType::kRefreshTrigger == trigger) {
+              FeatureType::AppendChangedIndices(pos, perspective, removed, added);
+            }
+        }
+
+        // Make the base class and the class template that recursively uses itself a friend
+        friend class FeatureSetBase<FeatureSet>;
+
+        template <typename... FeatureTypes>
+        friend class FeatureSet;
+    };
 
 }  // namespace Eval::NNUE::Features
 
diff --git a/src/nnue/features/features_common.h b/src/nnue/features/features_common.h
index 656502a3..671ceeb9 100644
--- a/src/nnue/features/features_common.h
+++ b/src/nnue/features/features_common.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 //Common header of input features of NNUE evaluation function
@@ -21,29 +21,30 @@
 #ifndef NNUE_FEATURES_COMMON_H_INCLUDED
 #define NNUE_FEATURES_COMMON_H_INCLUDED
 
-#include "../../evaluate.h"
-#include "../nnue_common.h"
+#include "evaluate.h"
+
+#include "nnue/nnue_common.h"
 
 namespace Eval::NNUE::Features {
 
-  class IndexList;
+    class IndexList;
 
-  template <typename... FeatureTypes>
-  class FeatureSet;
+    template <typename... FeatureTypes>
+    class FeatureSet;
 
-  // Trigger to perform full calculations instead of difference only
-  enum class TriggerEvent {
-    kNone, // Calculate the difference whenever possible
-    kFriendKingMoved, // calculate full evaluation when own king moves
-    kEnemyKingMoved, // calculate full evaluation when opponent king moves
-    kAnyKingMoved, // calculate full evaluation when any king moves
-    kAnyPieceMoved, // always calculate full evaluation
-  };
+    // Trigger to perform full calculations instead of difference only
+    enum class TriggerEvent {
+        kNone, // Calculate the difference whenever possible
+        kFriendKingMoved, // calculate full evaluation when own king moves
+        kEnemyKingMoved, // calculate full evaluation when opponent king moves
+        kAnyKingMoved, // calculate full evaluation when any king moves
+        kAnyPieceMoved, // always calculate full evaluation
+    };
 
-  enum class Side {
-    kFriend, // side to move
-    kEnemy, // opponent
-  };
+    enum class Side {
+        kFriend, // side to move
+        kEnemy, // opponent
+    };
 
 }  // namespace Eval::NNUE::Features
 
diff --git a/src/nnue/features/half_kp.cpp b/src/nnue/features/half_kp.cpp
index ae1d697f..17b50472 100644
--- a/src/nnue/features/half_kp.cpp
+++ b/src/nnue/features/half_kp.cpp
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 //Definition of input features HalfKP of NNUE evaluation function
@@ -23,51 +23,59 @@
 
 namespace Eval::NNUE::Features {
 
-  // Orient a square according to perspective (flip rank for black)
-  inline Square orient(Color perspective, Square s) {
-    return Square(int(s) ^ (bool(perspective) * SQ_A8));
-  }
-
-  // Find the index of the feature quantity from the king position and PieceSquare
-  template <Side AssociatedKing>
-  inline IndexType HalfKP<AssociatedKing>::MakeIndex(
-      Color perspective, Square s, Piece pc, Square ksq) {
-
-    return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective] + PS_END * ksq);
-  }
-
-  // Get a list of indices for active features
-  template <Side AssociatedKing>
-  void HalfKP<AssociatedKing>::AppendActiveIndices(
-      const Position& pos, Color perspective, IndexList* active) {
-
-    Square ksq = orient(perspective, pos.square<KING>(AssociatedKing == Side::kFriend ? perspective : ~perspective));
-    Bitboard bb = pos.pieces() & ~pos.pieces(KING);
-    while (bb) {
-      Square s = pop_lsb(&bb);
-      active->push_back(MakeIndex(perspective, s, pos.piece_on(s), ksq));
+    // Orient a square according to perspective (flip rank for black)
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * SQ_A8));
     }
-  }
 
-  // Get a list of indices for recently changed features
-  template <Side AssociatedKing>
-  void HalfKP<AssociatedKing>::AppendChangedIndices(
-      const Position& pos, Color perspective,
-      IndexList* removed, IndexList* added) {
+    // Find the index of the feature quantity from the king position and PieceSquare
+    template <Side AssociatedKing>
+    inline IndexType HalfKP<AssociatedKing>::MakeIndex(
+        Color perspective, Square s, Piece pc, Square ksq) {
 
-    Square ksq = orient(perspective, pos.square<KING>(AssociatedKing == Side::kFriend ? perspective : ~perspective));
-    const auto& dp = pos.state()->dirtyPiece;
-    for (int i = 0; i < dp.dirty_num; ++i) {
-      Piece pc = dp.piece[i];
-      if (type_of(pc) == KING) continue;
-      if (dp.from[i] != SQ_NONE)
-        removed->push_back(MakeIndex(perspective, dp.from[i], pc, ksq));
-      if (dp.to[i] != SQ_NONE)
-        added->push_back(MakeIndex(perspective, dp.to[i], pc, ksq));
+        return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective] + PS_END * ksq);
     }
-  }
 
-  template class HalfKP<Side::kFriend>;
-  template class HalfKP<Side::kEnemy>;
+    // Get a list of indices for active features
+    template <Side AssociatedKing>
+    void HalfKP<AssociatedKing>::AppendActiveIndices(
+        const Position& pos, Color perspective, IndexList* active) {
+
+        Square ksq = orient(perspective, pos.square<KING>(AssociatedKing == Side::kFriend ? perspective : ~perspective));
+        Bitboard bb = pos.pieces() & ~pos.pieces(KING);
+        while (bb) {
+            Square s = pop_lsb(&bb);
+            active->push_back(MakeIndex(perspective, s, pos.piece_on(s), ksq));
+        }
+    }
+
+    // Get a list of indices for recently changed features
+    template <Side AssociatedKing>
+    void HalfKP<AssociatedKing>::AppendChangedIndices(
+        const Position& pos, Color perspective,
+        IndexList* removed, IndexList* added) {
+
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));
+
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];
+
+            if (type_of(pc) == KING)
+                continue;
+
+            if (dp.from[i] != SQ_NONE)
+                removed->push_back(MakeIndex(perspective, dp.from[i], pc, ksq));
+
+            if (dp.to[i] != SQ_NONE)
+                added->push_back(MakeIndex(perspective, dp.to[i], pc, ksq));
+        }
+    }
+
+    template class HalfKP<Side::kFriend>;
+    template class HalfKP<Side::kEnemy>;
 
 }  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/half_kp.h b/src/nnue/features/half_kp.h
index 23e8beb6..834f800e 100644
--- a/src/nnue/features/half_kp.h
+++ b/src/nnue/features/half_kp.h
@@ -1,65 +1,69 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
-//Definition of input features HalfKP of NNUE evaluation function
-
 #ifndef NNUE_FEATURES_HALF_KP_H_INCLUDED
 #define NNUE_FEATURES_HALF_KP_H_INCLUDED
 
-#include "../../evaluate.h"
 #include "features_common.h"
 
+#include "evaluate.h"
+
+//Definition of input features HalfKP of NNUE evaluation function
 namespace Eval::NNUE::Features {
 
-  // Feature HalfKP: Combination of the position of own king
-  // and the position of pieces other than kings
-  template <Side AssociatedKing>
-  class HalfKP {
+    // Feature HalfKP: Combination of the position of own king
+    // and the position of pieces other than kings
+    template <Side AssociatedKing>
+    class HalfKP {
 
-   public:
-    // Feature name
-    static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
-        "HalfKP(Friend)" : "HalfKP(Enemy)";
-    // Hash value embedded in the evaluation file
-    static constexpr std::uint32_t kHashValue =
-        0x5D69D5B9u ^ (AssociatedKing == Side::kFriend);
-    // Number of feature dimensions
-    static constexpr IndexType kDimensions =
-        static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_END);
-    // Maximum number of simultaneously active features
-    static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
-    // Trigger for full calculation instead of difference calculation
-    static constexpr TriggerEvent kRefreshTrigger =
-        (AssociatedKing == Side::kFriend) ?
-        TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+    public:
+        // Feature name
+        static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
+            "HalfKP(Friend)" : "HalfKP(Enemy)";
 
-    // Get a list of indices for active features
-    static void AppendActiveIndices(const Position& pos, Color perspective,
-                                    IndexList* active);
+        // Hash value embedded in the evaluation file
+        static constexpr std::uint32_t kHashValue =
+            0x5D69D5B9u ^ (AssociatedKing == Side::kFriend);
 
-    // Get a list of indices for recently changed features
-    static void AppendChangedIndices(const Position& pos, Color perspective,
-                                     IndexList* removed, IndexList* added);
+        // Number of feature dimensions
+        static constexpr IndexType kDimensions =
+            static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_END);
 
-   private:
-    // Index of a feature for a given king position and another piece on some square
-    static IndexType MakeIndex(Color perspective, Square s, Piece pc, Square sq_k);
-  };
+        // Maximum number of simultaneously active features
+        static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
+
+        // Trigger for full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger =
+            (AssociatedKing == Side::kFriend) ?
+            TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+
+        // Get a list of indices for active features
+        static void AppendActiveIndices(const Position& pos, Color perspective,
+                                        IndexList* active);
+
+        // Get a list of indices for recently changed features
+        static void AppendChangedIndices(const Position& pos, Color perspective,
+                                         IndexList* removed, IndexList* added);
+
+    private:
+        // Index of a feature for a given king position and another piece on some square
+        static IndexType MakeIndex(Color perspective, Square s, Piece pc, Square sq_k);
+    };
 
 }  // namespace Eval::NNUE::Features
 
diff --git a/src/nnue/features/half_relative_kp.cpp b/src/nnue/features/half_relative_kp.cpp
index 6b456a1f..5ab22890 100644
--- a/src/nnue/features/half_relative_kp.cpp
+++ b/src/nnue/features/half_relative_kp.cpp
@@ -1,74 +1,80 @@
-﻿//Definition of input features HalfRelativeKP of NNUE evaluation function
-
-#include "half_relative_kp.h"
+﻿#include "half_relative_kp.h"
 #include "index_list.h"
 
-namespace Eval {
+//Definition of input features HalfRelativeKP of NNUE evaluation function
+namespace Eval::NNUE::Features {
 
-namespace NNUE {
+    // Orient a square according to perspective (flip rank for black)
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+    }
 
-namespace Features {
+    // Find the index of the feature quantity from the ball position and PieceSquare
+    template <Side AssociatedKing>
+    inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
+        Color perspective, Square s, Piece pc, Square sq_k) {
+        const IndexType p = IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
+        return MakeIndex(sq_k, p);
+    }
 
-// Orient a square according to perspective (flip rank for black)
-inline Square orient(Color perspective, Square s) {
-  return Square(int(s) ^ (bool(perspective) * SQ_A8));
-}
+    // Find the index of the feature quantity from the ball position and PieceSquare
+    template <Side AssociatedKing>
+    inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
+        Square sq_k, IndexType p) {
 
-// Find the index of the feature quantity from the ball position and PieceSquare
-template <Side AssociatedKing>
-inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
-  Color perspective, Square s, Piece pc, Square sq_k) {
-  const IndexType p = IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
-  return MakeIndex(sq_k, p);
-}
+        constexpr IndexType W = kBoardWidth;
+        constexpr IndexType H = kBoardHeight;
+        const IndexType piece_index = (p - PS_W_PAWN) / SQUARE_NB;
+        const Square sq_p = static_cast<Square>((p - PS_W_PAWN) % SQUARE_NB);
+        const IndexType relative_file = file_of(sq_p) - file_of(sq_k) + (W / 2);
+        const IndexType relative_rank = rank_of(sq_p) - rank_of(sq_k) + (H / 2);
+        return H * W * piece_index + H * relative_file + relative_rank;
+    }
 
-// Find the index of the feature quantity from the ball position and PieceSquare
-template <Side AssociatedKing>
-inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
-    Square sq_k, IndexType p) {
-  constexpr IndexType W = kBoardWidth;
-  constexpr IndexType H = kBoardHeight;
-  const IndexType piece_index = (p - PS_W_PAWN) / SQUARE_NB;
-  const Square sq_p = static_cast<Square>((p - PS_W_PAWN) % SQUARE_NB);
-  const IndexType relative_file = file_of(sq_p) - file_of(sq_k) + (W / 2);
-  const IndexType relative_rank = rank_of(sq_p) - rank_of(sq_k) + (H / 2);
-  return H * W * piece_index + H * relative_file + relative_rank;
-}
+    // Get a list of indices with a value of 1 among the features
+    template <Side AssociatedKing>
+    void HalfRelativeKP<AssociatedKing>::AppendActiveIndices(
+        const Position& pos, Color perspective, IndexList* active) {
 
-// Get a list of indices with a value of 1 among the features
-template <Side AssociatedKing>
-void HalfRelativeKP<AssociatedKing>::AppendActiveIndices(
-    const Position& pos, Color perspective, IndexList* active) {
-  Square ksq = orient(perspective, pos.square<KING>(AssociatedKing == Side::kFriend ? perspective : ~perspective));
-  Bitboard bb = pos.pieces() & ~pos.pieces(KING);
-  while (bb) {
-    Square s = pop_lsb(&bb);
-    active->push_back(MakeIndex(perspective, s, pos.piece_on(s), ksq));
-  }
-}
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));
 
-// Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-template <Side AssociatedKing>
-void HalfRelativeKP<AssociatedKing>::AppendChangedIndices(
-    const Position& pos, Color perspective,
-    IndexList* removed, IndexList* added) {
-  Square ksq = orient(perspective, pos.square<KING>(AssociatedKing == Side::kFriend ? perspective : ~perspective));
-  const auto& dp = pos.state()->dirtyPiece;
-  for (int i = 0; i < dp.dirty_num; ++i) {
-    Piece pc = dp.piece[i];
-    if (type_of(pc) == KING) continue;
-    if (dp.from[i] != SQ_NONE)
-      removed->push_back(MakeIndex(perspective, dp.from[i], pc, ksq));
-    if (dp.to[i] != SQ_NONE)
-      added->push_back(MakeIndex(perspective, dp.to[i], pc, ksq));
-  }
-}
+        Bitboard bb = pos.pieces() & ~pos.pieces(KING);
+        while (bb) {
+            Square s = pop_lsb(&bb);
+            active->push_back(MakeIndex(perspective, s, pos.piece_on(s), ksq));
+        }
+    }
 
-template class HalfRelativeKP<Side::kFriend>;
-template class HalfRelativeKP<Side::kEnemy>;
+    // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+    template <Side AssociatedKing>
+    void HalfRelativeKP<AssociatedKing>::AppendChangedIndices(
+        const Position& pos, Color perspective,
+        IndexList* removed, IndexList* added) {
 
-}  // namespace Features
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));
 
-}  // namespace NNUE
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];
 
-}  // namespace Eval
+            if (type_of(pc) == KING)
+                continue;
+
+            if (dp.from[i] != SQ_NONE)
+                removed->push_back(MakeIndex(perspective, dp.from[i], pc, ksq));
+
+            if (dp.to[i] != SQ_NONE)
+                added->push_back(MakeIndex(perspective, dp.to[i], pc, ksq));
+        }
+    }
+
+    template class HalfRelativeKP<Side::kFriend>;
+    template class HalfRelativeKP<Side::kEnemy>;
+
+}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/half_relative_kp.h b/src/nnue/features/half_relative_kp.h
index 1b384c14..cc1e136f 100644
--- a/src/nnue/features/half_relative_kp.h
+++ b/src/nnue/features/half_relative_kp.h
@@ -1,61 +1,61 @@
-﻿//Definition of input features HalfRelativeKP of NNUE evaluation function
-
-#ifndef _NNUE_FEATURES_HALF_RELATIVE_KP_H_
+﻿#ifndef _NNUE_FEATURES_HALF_RELATIVE_KP_H_
 #define _NNUE_FEATURES_HALF_RELATIVE_KP_H_
 
-#include "../../evaluate.h"
 #include "features_common.h"
 
-namespace Eval {
+#include "evaluate.h"
 
-namespace NNUE {
+//Definition of input features HalfRelativeKP of NNUE evaluation function
+namespace Eval::NNUE::Features {
 
-namespace Features {
+    // Feature HalfRelativeKP: Relative position of each piece other than the ball based on own ball or enemy ball
+    template <Side AssociatedKing>
+    class HalfRelativeKP {
+    public:
+        // feature quantity name
+        static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
+            "HalfRelativeKP(Friend)" : "HalfRelativeKP(Enemy)";
 
-// Feature HalfRelativeKP: Relative position of each piece other than the ball based on own ball or enemy ball
-template <Side AssociatedKing>
-class HalfRelativeKP {
- public:
-  // feature quantity name
-  static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
-      "HalfRelativeKP(Friend)" : "HalfRelativeKP(Enemy)";
-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t kHashValue =
-      0xF9180919u ^ (AssociatedKing == Side::kFriend);
-  // Piece type excluding balls
-  static constexpr IndexType kNumPieceKinds = 5 * 2;
-  // width of the virtual board with the ball in the center
-  static constexpr IndexType kBoardWidth = FILE_NB * 2 - 1;
-  // height of a virtual board with balls in the center
-  static constexpr IndexType kBoardHeight = RANK_NB * 2 - 1;
-  // number of feature dimensions
-  static constexpr IndexType kDimensions =
-      kNumPieceKinds * kBoardHeight * kBoardWidth;
-  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-  static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
-  // Timing of full calculation instead of difference calculation
-  static constexpr TriggerEvent kRefreshTrigger =
-      (AssociatedKing == Side::kFriend) ?
-      TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue =
+            0xF9180919u ^ (AssociatedKing == Side::kFriend);
 
-  // Get a list of indices with a value of 1 among the features
-  static void AppendActiveIndices(const Position& pos, Color perspective,
-                                  IndexList* active);
+        // Piece type excluding balls
+        static constexpr IndexType kNumPieceKinds = 5 * 2;
 
-  // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-  static void AppendChangedIndices(const Position& pos, Color perspective,
-                                   IndexList* removed, IndexList* added);
+        // width of the virtual board with the ball in the center
+        static constexpr IndexType kBoardWidth = FILE_NB * 2 - 1;
 
-  // Find the index of the feature quantity from the ball position and PieceSquare
-  static IndexType MakeIndex(Square s, IndexType p);
-  // Find the index of the feature quantity from the ball position and PieceSquare
-  static IndexType MakeIndex(Color perspective, Square s, Piece pc, Square sq_k);
-};
+        // height of a virtual board with balls in the center
+        static constexpr IndexType kBoardHeight = RANK_NB * 2 - 1;
 
-}  // namespace Features
+        // number of feature dimensions
+        static constexpr IndexType kDimensions =
+            kNumPieceKinds * kBoardHeight * kBoardWidth;
 
-}  // namespace NNUE
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
 
-}  // namespace Eval
+        // Timing of full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger =
+            (AssociatedKing == Side::kFriend) ?
+            TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+
+        // Get a list of indices with a value of 1 among the features
+        static void AppendActiveIndices(const Position& pos, Color perspective,
+                                        IndexList* active);
+
+        // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+        static void AppendChangedIndices(const Position& pos, Color perspective,
+                                         IndexList* removed, IndexList* added);
+
+        // Find the index of the feature quantity from the ball position and PieceSquare
+        static IndexType MakeIndex(Square s, IndexType p);
+
+        // Find the index of the feature quantity from the ball position and PieceSquare
+        static IndexType MakeIndex(Color perspective, Square s, Piece pc, Square sq_k);
+    };
+
+}  // namespace Eval::NNUE::Features
 
 #endif
diff --git a/src/nnue/features/index_list.h b/src/nnue/features/index_list.h
index dd055fb3..6751b26c 100644
--- a/src/nnue/features/index_list.h
+++ b/src/nnue/features/index_list.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // Definition of index list of input features
@@ -21,43 +21,43 @@
 #ifndef NNUE_FEATURES_INDEX_LIST_H_INCLUDED
 #define NNUE_FEATURES_INDEX_LIST_H_INCLUDED
 
-#include "../../position.h"
-#include "../nnue_architecture.h"
+#include "position.h"
+
+#include "nnue/nnue_architecture.h"
 
 namespace Eval::NNUE::Features {
 
-  // Class template used for feature index list
-  template <typename T, std::size_t MaxSize>
-  class ValueList {
+    // Class template used for feature index list
+    template <typename T, std::size_t MaxSize>
+    class ValueList {
 
-   public:
-    std::size_t size() const { return size_; }
-    void resize(std::size_t size) { size_ = size; }
-    void push_back(const T& value) { values_[size_++] = value; }
-    T& operator[](std::size_t index) { return values_[index]; }
-    T* begin() { return values_; }
-    T* end() { return values_ + size_; }
-    const T& operator[](std::size_t index) const { return values_[index]; }
-    const T* begin() const { return values_; }
-    const T* end() const { return values_ + size_; }
+    public:
+        std::size_t size() const { return size_; }
+        void resize(std::size_t size) { size_ = size; }
+        void push_back(const T& value) { values_[size_++] = value; }
+        T& operator[](std::size_t index) { return values_[index]; }
+        T* begin() { return values_; }
+        T* end() { return values_ + size_; }
+        const T& operator[](std::size_t index) const { return values_[index]; }
+        const T* begin() const { return values_; }
+        const T* end() const { return values_ + size_; }
 
-    void swap(ValueList& other) {
-      const std::size_t max_size = std::max(size_, other.size_);
-      for (std::size_t i = 0; i < max_size; ++i) {
-        std::swap(values_[i], other.values_[i]);
-      }
-      std::swap(size_, other.size_);
-    }
+        void swap(ValueList& other) {
+            const std::size_t max_size = std::max(size_, other.size_);
+            for (std::size_t i = 0; i < max_size; ++i) {
+                std::swap(values_[i], other.values_[i]);
+            }
+            std::swap(size_, other.size_);
+        }
 
-   private:
-    T values_[MaxSize] = {};
-    std::size_t size_ = 0;
-  };
+    private:
+        T values_[MaxSize] = {};
+        std::size_t size_ = 0;
+    };
 
-  //Type of feature index list
-  class IndexList
-      : public ValueList<IndexType, RawFeatures::kMaxActiveDimensions> {
-  };
+    //Type of feature index list
+    class IndexList : public ValueList<IndexType, RawFeatures::kMaxActiveDimensions> {
+    };
 
 }  // namespace Eval::NNUE::Features
 
diff --git a/src/nnue/features/k.cpp b/src/nnue/features/k.cpp
index bd8d7dd0..8911abb7 100644
--- a/src/nnue/features/k.cpp
+++ b/src/nnue/features/k.cpp
@@ -1,46 +1,39 @@
-﻿//Definition of input feature quantity K of NNUE evaluation function
-
-#include "k.h"
+﻿#include "k.h"
 #include "index_list.h"
 
-namespace Eval {
+//Definition of input feature quantity K of NNUE evaluation function
+namespace Eval::NNUE::Features {
 
-namespace NNUE {
+    // Orient a square according to perspective (flip rank for black)
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+    }
 
-namespace Features {
+    // Index of a feature for a given king position.
+    IndexType K::MakeIndex(Color perspective, Square s, Color king_color) {
+        return IndexType(orient(perspective, s) + bool(perspective ^ king_color) * 64);
+    }
 
-// Orient a square according to perspective (flip rank for black)
-inline Square orient(Color perspective, Square s) {
-  return Square(int(s) ^ (bool(perspective) * SQ_A8));
-}
+    // Get a list of indices with a value of 1 among the features
+    void K::AppendActiveIndices(
+        const Position& pos, Color perspective, IndexList* active) {
 
-// Index of a feature for a given king position.
-IndexType K::MakeIndex(Color perspective, Square s, Color king_color) {
-  return IndexType(orient(perspective, s) + bool(perspective ^ king_color) * 64);
-}
+        for (auto color : Colors) {
+          active->push_back(MakeIndex(perspective, pos.square<KING>(color), color));
+        }
+    }
 
-// Get a list of indices with a value of 1 among the features
-void K::AppendActiveIndices(
-    const Position& pos, Color perspective, IndexList* active) {
-  for (auto color : Colors) {
-    active->push_back(MakeIndex(perspective, pos.square<KING>(color), color));
-  }
-}
+    // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+    void K::AppendChangedIndices(
+        const Position& pos, Color perspective,
+        IndexList* removed, IndexList* added) {
 
-// Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-void K::AppendChangedIndices(
-    const Position& pos, Color perspective,
-    IndexList* removed, IndexList* added) {
-  const auto& dp = pos.state()->dirtyPiece;
-  if (type_of(dp.piece[0]) == KING)
-  {
-    removed->push_back(MakeIndex(perspective, dp.from[0], color_of(dp.piece[0])));
-    added->push_back(MakeIndex(perspective, dp.to[0], color_of(dp.piece[0])));
-  }
-}
+        const auto& dp = pos.state()->dirtyPiece;
+        if (type_of(dp.piece[0]) == KING)
+        {
+            removed->push_back(MakeIndex(perspective, dp.from[0], color_of(dp.piece[0])));
+            added->push_back(MakeIndex(perspective, dp.to[0], color_of(dp.piece[0])));
+        }
+    }
 
-}  // namespace Features
-
-}  // namespace NNUE
-
-}  // namespace Eval
+}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/k.h b/src/nnue/features/k.h
index 9a0be4bb..c9726ab2 100644
--- a/src/nnue/features/k.h
+++ b/src/nnue/features/k.h
@@ -1,48 +1,44 @@
-﻿//Definition of input feature quantity K of NNUE evaluation function
-
-#ifndef _NNUE_FEATURES_K_H_
+﻿#ifndef _NNUE_FEATURES_K_H_
 #define _NNUE_FEATURES_K_H_
 
-#include "../../evaluate.h"
 #include "features_common.h"
 
-namespace Eval {
+#include "evaluate.h"
 
-namespace NNUE {
+//Definition of input feature quantity K of NNUE evaluation function
+namespace Eval::NNUE::Features {
 
-namespace Features {
+  // Feature K: Ball position
+  class K {
+  public:
+      // feature quantity name
+      static constexpr const char* kName = "K";
 
-// Feature K: Ball position
-class K {
- public:
-  // feature quantity name
-  static constexpr const char* kName = "K";
-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t kHashValue = 0xD3CEE169u;
-  // number of feature dimensions
-  static constexpr IndexType kDimensions = SQUARE_NB * 2;
-  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-  static constexpr IndexType kMaxActiveDimensions = 2;
-  // Timing of full calculation instead of difference calculation
-  static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+      // Hash value embedded in the evaluation function file
+      static constexpr std::uint32_t kHashValue = 0xD3CEE169u;
 
-  // Get a list of indices with a value of 1 among the features
-  static void AppendActiveIndices(const Position& pos, Color perspective,
-                                  IndexList* active);
+      // number of feature dimensions
+      static constexpr IndexType kDimensions = SQUARE_NB * 2;
 
-  // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-  static void AppendChangedIndices(const Position& pos, Color perspective,
-                                   IndexList* removed, IndexList* added);
+      // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+      static constexpr IndexType kMaxActiveDimensions = 2;
 
-private:
-  // Index of a feature for a given king position.
-  static IndexType MakeIndex(Color perspective, Square s, Color king_color);
-};
+      // Timing of full calculation instead of difference calculation
+      static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
-}  // namespace Features
+      // Get a list of indices with a value of 1 among the features
+      static void AppendActiveIndices(const Position& pos, Color perspective,
+                                      IndexList* active);
 
-}  // namespace NNUE
+      // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+      static void AppendChangedIndices(const Position& pos, Color perspective,
+                                       IndexList* removed, IndexList* added);
 
-}  // namespace Eval
+  private:
+      // Index of a feature for a given king position.
+      static IndexType MakeIndex(Color perspective, Square s, Color king_color);
+  };
+
+}  // namespace Eval::NNUE::Features
 
 #endif
diff --git a/src/nnue/features/p.cpp b/src/nnue/features/p.cpp
index 012311ac..b4757284 100644
--- a/src/nnue/features/p.cpp
+++ b/src/nnue/features/p.cpp
@@ -1,52 +1,49 @@
-﻿//Definition of input feature P of NNUE evaluation function
-
-#include "p.h"
+﻿#include "p.h"
 #include "index_list.h"
 
-namespace Eval {
+//Definition of input feature P of NNUE evaluation function
+namespace Eval::NNUE::Features {
 
-namespace NNUE {
+    // Orient a square according to perspective (flip rank for black)
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+    }
 
-namespace Features {
+    // Find the index of the feature quantity from the king position and PieceSquare
+    inline IndexType P::MakeIndex(
+        Color perspective, Square s, Piece pc) {
+        return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
+    }
 
-// Orient a square according to perspective (flip rank for black)
-inline Square orient(Color perspective, Square s) {
-  return Square(int(s) ^ (bool(perspective) * SQ_A8));
-}
+    // Get a list of indices with a value of 1 among the features
+    void P::AppendActiveIndices(
+        const Position& pos, Color perspective, IndexList* active) {
 
-// Find the index of the feature quantity from the king position and PieceSquare
-inline IndexType P::MakeIndex(
-  Color perspective, Square s, Piece pc) {
-  return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
-}
+        Bitboard bb = pos.pieces() & ~pos.pieces(KING);
+        while (bb) {
+            Square s = pop_lsb(&bb);
+            active->push_back(MakeIndex(perspective, s, pos.piece_on(s)));
+        }
+    }
 
-// Get a list of indices with a value of 1 among the features
-void P::AppendActiveIndices(
-    const Position& pos, Color perspective, IndexList* active) {
-  Bitboard bb = pos.pieces() & ~pos.pieces(KING);
-  while (bb) {
-    Square s = pop_lsb(&bb);
-    active->push_back(MakeIndex(perspective, s, pos.piece_on(s)));
-  }
-}
+    // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+    void P::AppendChangedIndices(
+        const Position& pos, Color perspective,
+        IndexList* removed, IndexList* added) {
 
-// Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-void P::AppendChangedIndices(
-    const Position& pos, Color perspective,
-    IndexList* removed, IndexList* added) {
-  const auto& dp = pos.state()->dirtyPiece;
-  for (int i = 0; i < dp.dirty_num; ++i) {
-    Piece pc = dp.piece[i];
-    if (type_of(pc) == KING) continue;
-    if (dp.from[i] != SQ_NONE)
-      removed->push_back(MakeIndex(perspective, dp.from[i], pc));
-    if (dp.to[i] != SQ_NONE)
-      added->push_back(MakeIndex(perspective, dp.to[i], pc));
-  }
-}
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];
 
-}  // namespace Features
+            if (type_of(pc) == KING)
+              continue;
 
-}  // namespace NNUE
+            if (dp.from[i] != SQ_NONE)
+              removed->push_back(MakeIndex(perspective, dp.from[i], pc));
 
-}  // namespace Eval
+            if (dp.to[i] != SQ_NONE)
+              added->push_back(MakeIndex(perspective, dp.to[i], pc));
+        }
+    }
+
+}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/p.h b/src/nnue/features/p.h
index 07d88952..6a8a5392 100644
--- a/src/nnue/features/p.h
+++ b/src/nnue/features/p.h
@@ -1,48 +1,44 @@
-﻿//Definition of input feature P of NNUE evaluation function
-
-#ifndef _NNUE_FEATURES_P_H_
+﻿#ifndef _NNUE_FEATURES_P_H_
 #define _NNUE_FEATURES_P_H_
 
-#include "../../evaluate.h"
 #include "features_common.h"
 
-namespace Eval {
+#include "evaluate.h"
 
-namespace NNUE {
+//Definition of input feature P of NNUE evaluation function
+namespace Eval::NNUE::Features {
 
-namespace Features {
+  // Feature P: PieceSquare of pieces other than balls
+  class P {
+  public:
+      // feature quantity name
+      static constexpr const char* kName = "P";
 
-// Feature P: PieceSquare of pieces other than balls
-class P {
- public:
-  // feature quantity name
-  static constexpr const char* kName = "P";
-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t kHashValue = 0x764CFB4Bu;
-  // number of feature dimensions
-  static constexpr IndexType kDimensions = PS_END;
-  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-  static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
-  // Timing of full calculation instead of difference calculation
-  static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+      // Hash value embedded in the evaluation function file
+      static constexpr std::uint32_t kHashValue = 0x764CFB4Bu;
 
-  // Get a list of indices with a value of 1 among the features
-  static void AppendActiveIndices(const Position& pos, Color perspective,
-                                  IndexList* active);
+      // number of feature dimensions
+      static constexpr IndexType kDimensions = PS_END;
 
-  // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-  static void AppendChangedIndices(const Position& pos, Color perspective,
-                                   IndexList* removed, IndexList* added);
+      // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+      static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
 
- private:
-  // Index of a feature for a given piece on some square
-  static IndexType MakeIndex(Color perspective, Square s, Piece pc);
-};
+      // Timing of full calculation instead of difference calculation
+      static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
-}  // namespace Features
+      // Get a list of indices with a value of 1 among the features
+      static void AppendActiveIndices(const Position& pos, Color perspective,
+                                      IndexList* active);
 
-}  // namespace NNUE
+      // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+      static void AppendChangedIndices(const Position& pos, Color perspective,
+                                       IndexList* removed, IndexList* added);
 
-}  // namespace Eval
+  private:
+      // Index of a feature for a given piece on some square
+      static IndexType MakeIndex(Color perspective, Square s, Piece pc);
+  };
+
+}  // namespace Eval::NNUE::Features
 
 #endif

From 3041adb080558700cf4b77833305d974a1ca82c2 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 20:55:35 +0200
Subject: [PATCH 209/398] Cleanup layers.

---
 src/nnue/layers/affine_transform.h | 523 +++++++++++++++--------------
 src/nnue/layers/clipped_relu.h     | 299 +++++++++--------
 src/nnue/layers/input_slice.h      | 110 +++---
 src/nnue/layers/sum.h              | 257 +++++++-------
 4 files changed, 616 insertions(+), 573 deletions(-)

diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index f24578a8..cc5e5eef 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // Definition of layer AffineTransform of NNUE evaluation function
@@ -21,267 +21,290 @@
 #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
 #define NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
 
-#include <iostream>
-#include "../nnue_common.h"
+#include "nnue/nnue_common.h"
+
+#include <string>
+#include <type_traits>
+#include <cstdint>
 
 namespace Eval::NNUE::Layers {
 
-  // Affine transformation layer
-  template <typename PreviousLayer, IndexType OutputDimensions>
-  class AffineTransform {
-   public:
-    // Input/output type
-    using InputType = typename PreviousLayer::OutputType;
-    using OutputType = std::int32_t;
-    static_assert(std::is_same<InputType, std::uint8_t>::value, "");
+    // Affine transformation layer
+    template <typename PreviousLayer, IndexType OutputDimensions>
+    class AffineTransform {
+    public:
+        // Input/output type
+        using InputType = typename PreviousLayer::OutputType;
 
-    // Number of input/output dimensions
-    static constexpr IndexType kInputDimensions =
-        PreviousLayer::kOutputDimensions;
-    static constexpr IndexType kOutputDimensions = OutputDimensions;
-    static constexpr IndexType kPaddedInputDimensions =
-        CeilToMultiple<IndexType>(kInputDimensions, kMaxSimdWidth);
+        using OutputType = std::int32_t;
 
-    // Size of forward propagation buffer used in this layer
-    static constexpr std::size_t kSelfBufferSize =
-        CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+        static_assert(std::is_same<InputType, std::uint8_t>::value, "");
 
-    // Size of the forward propagation buffer used from the input layer to this layer
-    static constexpr std::size_t kBufferSize =
-        PreviousLayer::kBufferSize + kSelfBufferSize;
+        // Number of input/output dimensions
+        static constexpr IndexType kInputDimensions =
+            PreviousLayer::kOutputDimensions;
 
-    // Hash value embedded in the evaluation file
-    static constexpr std::uint32_t GetHashValue() {
-      std::uint32_t hash_value = 0xCC03DAE4u;
-      hash_value += kOutputDimensions;
-      hash_value ^= PreviousLayer::GetHashValue() >> 1;
-      hash_value ^= PreviousLayer::GetHashValue() << 31;
-      return hash_value;
-    }
+        static constexpr IndexType kOutputDimensions = OutputDimensions;
 
-    // A string that represents the structure from the input layer to this layer
-    static std::string GetStructureString() {
-      return "AffineTransform[" +
-        std::to_string(kOutputDimensions) + "<-" +
-        std::to_string(kInputDimensions) + "](" +
-        PreviousLayer::GetStructureString() + ")";
-    }
-    
-   // Read network parameters
-    bool ReadParameters(std::istream& stream) {
-      if (!previous_layer_.ReadParameters(stream)) return false;
-      for (std::size_t i = 0; i < kOutputDimensions; ++i)
-        biases_[i] = read_little_endian<BiasType>(stream);
-      for (std::size_t i = 0; i < kOutputDimensions * kPaddedInputDimensions; ++i)
-        weights_[i] = read_little_endian<WeightType>(stream);
-      return !stream.fail();
-    }
+        static constexpr IndexType kPaddedInputDimensions =
+            CeilToMultiple<IndexType>(kInputDimensions, kMaxSimdWidth);
 
-    // write parameters
-    bool WriteParameters(std::ostream& stream) const {
-      if (!previous_layer_.WriteParameters(stream)) return false;
-      stream.write(reinterpret_cast<const char*>(biases_),
-        kOutputDimensions * sizeof(BiasType));
-      stream.write(reinterpret_cast<const char*>(weights_),
-        kOutputDimensions * kPaddedInputDimensions *
-        sizeof(WeightType));
-      return !stream.fail();
-    }
+        // Size of forward propagation buffer used in this layer
+        static constexpr std::size_t kSelfBufferSize =
+            CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
 
-    // Forward propagation
-    const OutputType* Propagate(
-        const TransformedFeatureType* transformed_features, char* buffer) const {
-      const auto input = previous_layer_.Propagate(
-          transformed_features, buffer + kSelfBufferSize);
-      const auto output = reinterpret_cast<OutputType*>(buffer);
+        // Size of the forward propagation buffer used from the input layer to this layer
+        static constexpr std::size_t kBufferSize =
+            PreviousLayer::kBufferSize + kSelfBufferSize;
 
-  #if defined(USE_AVX512)
-      constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2);
-      const auto input_vector = reinterpret_cast<const __m512i*>(input);
-  #if !defined(USE_VNNI)
-      const __m512i kOnes = _mm512_set1_epi16(1);
-  #endif
-
-  #elif defined(USE_AVX2)
-      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
-      const auto input_vector = reinterpret_cast<const __m256i*>(input);
-  #if !defined(USE_VNNI)
-      const __m256i kOnes = _mm256_set1_epi16(1);
-  #endif
-
-  #elif defined(USE_SSE2)
-      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
-  #ifndef USE_SSSE3
-      const __m128i kZeros = _mm_setzero_si128();
-  #else
-      const __m128i kOnes = _mm_set1_epi16(1);
-  #endif
-      const auto input_vector = reinterpret_cast<const __m128i*>(input);
-
-  #elif defined(USE_MMX)
-      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
-      const __m64 kZeros = _mm_setzero_si64();
-      const auto input_vector = reinterpret_cast<const __m64*>(input);
-
-  #elif defined(USE_NEON)
-      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
-      const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
-  #endif
-
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        const IndexType offset = i * kPaddedInputDimensions;
-
-  #if defined(USE_AVX512)
-        __m512i sum = _mm512_setzero_si512();
-        const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-  #if defined(USE_VNNI)
-            sum = _mm512_dpbusd_epi32(sum, _mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
-  #else
-            __m512i product = _mm512_maddubs_epi16(_mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
-            product = _mm512_madd_epi16(product, kOnes);
-            sum = _mm512_add_epi32(sum, product);
-  #endif
+        // Hash value embedded in the evaluation file
+        static constexpr std::uint32_t GetHashValue() {
+            std::uint32_t hash_value = 0xCC03DAE4u;
+            hash_value += kOutputDimensions;
+            hash_value ^= PreviousLayer::GetHashValue() >> 1;
+            hash_value ^= PreviousLayer::GetHashValue() << 31;
+            return hash_value;
         }
 
-        // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks.
-        // As a result kPaddedInputDimensions may not be an even multiple of 64(512bit)
-        // and we have to do one more 256bit chunk.
-        if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2)
-        {
-            const auto iv256  = reinterpret_cast<const __m256i*>(&input_vector[kNumChunks]);
-            const auto row256 = reinterpret_cast<const __m256i*>(&row[kNumChunks]);
-  #if defined(USE_VNNI)
-            __m256i product256 = _mm256_dpbusd_epi32(
-                _mm512_castsi512_si256(sum), _mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
-            sum = _mm512_inserti32x8(sum, product256, 0);
-  #else
-            __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
-            sum = _mm512_add_epi32(sum, _mm512_cvtepi16_epi32(product256));
-  #endif
+        // A string that represents the structure from the input layer to this layer
+        static std::string GetStructureString() {
+            return "AffineTransform[" +
+                std::to_string(kOutputDimensions) + "<-" +
+                std::to_string(kInputDimensions) + "](" +
+                PreviousLayer::GetStructureString() + ")";
         }
-        output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
 
-  #elif defined(USE_AVX2)
-        __m256i sum = _mm256_setzero_si256();
-        const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-  #if defined(USE_VNNI)
-          sum = _mm256_dpbusd_epi32(sum, _mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
-  #else
-          __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
-          product = _mm256_madd_epi16(product, kOnes);
-          sum = _mm256_add_epi32(sum, product);
-  #endif
+       // Read network parameters
+        bool ReadParameters(std::istream& stream) {
+            if (!previous_layer_.ReadParameters(stream))
+                return false;
+
+            for (std::size_t i = 0; i < kOutputDimensions; ++i)
+                biases_[i] = read_little_endian<BiasType>(stream);
+
+            for (std::size_t i = 0; i < kOutputDimensions * kPaddedInputDimensions; ++i)
+                weights_[i] = read_little_endian<WeightType>(stream);
+
+            return !stream.fail();
         }
-        __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
-        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
-        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
-        output[i] = _mm_cvtsi128_si32(sum128) + biases_[i];
 
-  #elif defined(USE_SSSE3)
-        __m128i sum = _mm_setzero_si128();
-        const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
-        for (int j = 0; j < (int)kNumChunks - 1; j += 2) {
-          __m128i product0 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
-          product0 = _mm_madd_epi16(product0, kOnes);
-          sum = _mm_add_epi32(sum, product0);
-          __m128i product1 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j+1]), _mm_load_si128(&row[j+1]));
-          product1 = _mm_madd_epi16(product1, kOnes);
-          sum = _mm_add_epi32(sum, product1);
+        // write parameters
+        bool WriteParameters(std::ostream& stream) const {
+            if (!previous_layer_.WriteParameters(stream))
+                return false;
+
+            stream.write(reinterpret_cast<const char*>(biases_),
+                kOutputDimensions * sizeof(BiasType));
+
+            stream.write(reinterpret_cast<const char*>(weights_),
+                kOutputDimensions * kPaddedInputDimensions *
+                sizeof(WeightType));
+
+            return !stream.fail();
         }
-        if (kNumChunks & 0x1) {
-          __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[kNumChunks-1]), _mm_load_si128(&row[kNumChunks-1]));
-          product = _mm_madd_epi16(product, kOnes);
-          sum = _mm_add_epi32(sum, product);
+
+        // Forward propagation
+        const OutputType* Propagate(
+            const TransformedFeatureType* transformed_features, char* buffer) const {
+
+            const auto input = previous_layer_.Propagate(
+                transformed_features, buffer + kSelfBufferSize);
+            const auto output = reinterpret_cast<OutputType*>(buffer);
+
+#if defined(USE_AVX512)
+            constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2);
+            const auto input_vector = reinterpret_cast<const __m512i*>(input);
+#if !defined(USE_VNNI)
+            const __m512i kOnes = _mm512_set1_epi16(1);
+#endif
+
+#elif defined(USE_AVX2)
+            constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+            const auto input_vector = reinterpret_cast<const __m256i*>(input);
+#if !defined(USE_VNNI)
+            const __m256i kOnes = _mm256_set1_epi16(1);
+#endif
+
+#elif defined(USE_SSE2)
+            constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+#ifndef USE_SSSE3
+            const __m128i kZeros = _mm_setzero_si128();
+#else
+            const __m128i kOnes = _mm_set1_epi16(1);
+#endif
+            const auto input_vector = reinterpret_cast<const __m128i*>(input);
+
+#elif defined(USE_MMX)
+            constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+            const __m64 kZeros = _mm_setzero_si64();
+            const auto input_vector = reinterpret_cast<const __m64*>(input);
+
+#elif defined(USE_NEON)
+            constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+            const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
+#endif
+
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                const IndexType offset = i * kPaddedInputDimensions;
+
+#if defined(USE_AVX512)
+                __m512i sum = _mm512_setzero_si512();
+                const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+#if defined(USE_VNNI)
+                    sum = _mm512_dpbusd_epi32(sum, _mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+#else
+                    __m512i product = _mm512_maddubs_epi16(_mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+                    product = _mm512_madd_epi16(product, kOnes);
+                    sum = _mm512_add_epi32(sum, product);
+#endif
+                }
+
+                // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks.
+                // As a result kPaddedInputDimensions may not be an even multiple of 64(512bit)
+                // and we have to do one more 256bit chunk.
+                if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2)
+                {
+                    const auto iv256  = reinterpret_cast<const __m256i*>(&input_vector[kNumChunks]);
+                    const auto row256 = reinterpret_cast<const __m256i*>(&row[kNumChunks]);
+#if defined(USE_VNNI)
+                    __m256i product256 = _mm256_dpbusd_epi32(
+                        _mm512_castsi512_si256(sum), _mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
+                    sum = _mm512_inserti32x8(sum, product256, 0);
+#else
+                    __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
+                    sum = _mm512_add_epi32(sum, _mm512_cvtepi16_epi32(product256));
+#endif
+                }
+
+                output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
+
+#elif defined(USE_AVX2)
+                __m256i sum = _mm256_setzero_si256();
+                const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+#if defined(USE_VNNI)
+                    sum = _mm256_dpbusd_epi32(sum, _mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
+#else
+                    __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
+                    product = _mm256_madd_epi16(product, kOnes);
+                    sum = _mm256_add_epi32(sum, product);
+#endif
+                }
+
+                __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
+                sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
+                sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
+                output[i] = _mm_cvtsi128_si32(sum128) + biases_[i];
+
+#elif defined(USE_SSSE3)
+                __m128i sum = _mm_setzero_si128();
+                const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
+                for (int j = 0; j < (int)kNumChunks - 1; j += 2) {
+                    __m128i product0 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
+                    product0 = _mm_madd_epi16(product0, kOnes);
+                    sum = _mm_add_epi32(sum, product0);
+                    __m128i product1 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j+1]), _mm_load_si128(&row[j+1]));
+                    product1 = _mm_madd_epi16(product1, kOnes);
+                    sum = _mm_add_epi32(sum, product1);
+                }
+
+                if (kNumChunks & 0x1) {
+                    __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[kNumChunks-1]), _mm_load_si128(&row[kNumChunks-1]));
+                    product = _mm_madd_epi16(product, kOnes);
+                    sum = _mm_add_epi32(sum, product);
+                }
+
+                sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
+                sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
+                output[i] = _mm_cvtsi128_si32(sum) + biases_[i];
+
+#elif defined(USE_SSE2)
+                __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]);
+                __m128i sum_hi = kZeros;
+                const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+                    __m128i row_j = _mm_load_si128(&row[j]);
+                    __m128i input_j = _mm_load_si128(&input_vector[j]);
+                    __m128i row_signs = _mm_cmpgt_epi8(kZeros, row_j);
+                    __m128i extended_row_lo = _mm_unpacklo_epi8(row_j, row_signs);
+                    __m128i extended_row_hi = _mm_unpackhi_epi8(row_j, row_signs);
+                    __m128i extended_input_lo = _mm_unpacklo_epi8(input_j, kZeros);
+                    __m128i extended_input_hi = _mm_unpackhi_epi8(input_j, kZeros);
+                    __m128i product_lo = _mm_madd_epi16(extended_row_lo, extended_input_lo);
+                    __m128i product_hi = _mm_madd_epi16(extended_row_hi, extended_input_hi);
+                    sum_lo = _mm_add_epi32(sum_lo, product_lo);
+                    sum_hi = _mm_add_epi32(sum_hi, product_hi);
+                }
+
+                __m128i sum = _mm_add_epi32(sum_lo, sum_hi);
+                __m128i sum_high_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2));
+                sum = _mm_add_epi32(sum, sum_high_64);
+                __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2));
+                sum = _mm_add_epi32(sum, sum_second_32);
+                output[i] = _mm_cvtsi128_si32(sum);
+
+#elif defined(USE_MMX)
+                __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]);
+                __m64 sum_hi = kZeros;
+                const auto row = reinterpret_cast<const __m64*>(&weights_[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+                    __m64 row_j = row[j];
+                    __m64 input_j = input_vector[j];
+                    __m64 row_signs = _mm_cmpgt_pi8(kZeros, row_j);
+                    __m64 extended_row_lo = _mm_unpacklo_pi8(row_j, row_signs);
+                    __m64 extended_row_hi = _mm_unpackhi_pi8(row_j, row_signs);
+                    __m64 extended_input_lo = _mm_unpacklo_pi8(input_j, kZeros);
+                    __m64 extended_input_hi = _mm_unpackhi_pi8(input_j, kZeros);
+                    __m64 product_lo = _mm_madd_pi16(extended_row_lo, extended_input_lo);
+                    __m64 product_hi = _mm_madd_pi16(extended_row_hi, extended_input_hi);
+                    sum_lo = _mm_add_pi32(sum_lo, product_lo);
+                    sum_hi = _mm_add_pi32(sum_hi, product_hi);
+                }
+
+                __m64 sum = _mm_add_pi32(sum_lo, sum_hi);
+                sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
+                output[i] = _mm_cvtsi64_si32(sum);
+
+#elif defined(USE_NEON)
+                int32x4_t sum = {biases_[i]};
+                const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+                    int16x8_t product = vmull_s8(input_vector[j * 2], row[j * 2]);
+                    product = vmlal_s8(product, input_vector[j * 2 + 1], row[j * 2 + 1]);
+                    sum = vpadalq_s16(sum, product);
+                }
+
+                output[i] = sum[0] + sum[1] + sum[2] + sum[3];
+
+#else
+                OutputType sum = biases_[i];
+                for (IndexType j = 0; j < kInputDimensions; ++j) {
+                    sum += weights_[offset + j] * input[j];
+                }
+
+                output[i] = sum;
+#endif
+
+            }
+#if defined(USE_MMX)
+            _mm_empty();
+#endif
+            return output;
         }
-        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
-        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
-        output[i] = _mm_cvtsi128_si32(sum) + biases_[i];
 
-  #elif defined(USE_SSE2)
-        __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]);
-        __m128i sum_hi = kZeros;
-        const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m128i row_j = _mm_load_si128(&row[j]);
-          __m128i input_j = _mm_load_si128(&input_vector[j]);
-          __m128i row_signs = _mm_cmpgt_epi8(kZeros, row_j);
-          __m128i extended_row_lo = _mm_unpacklo_epi8(row_j, row_signs);
-          __m128i extended_row_hi = _mm_unpackhi_epi8(row_j, row_signs);
-          __m128i extended_input_lo = _mm_unpacklo_epi8(input_j, kZeros);
-          __m128i extended_input_hi = _mm_unpackhi_epi8(input_j, kZeros);
-          __m128i product_lo = _mm_madd_epi16(extended_row_lo, extended_input_lo);
-          __m128i product_hi = _mm_madd_epi16(extended_row_hi, extended_input_hi);
-          sum_lo = _mm_add_epi32(sum_lo, product_lo);
-          sum_hi = _mm_add_epi32(sum_hi, product_hi);
-        }
-        __m128i sum = _mm_add_epi32(sum_lo, sum_hi);
-        __m128i sum_high_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2));
-        sum = _mm_add_epi32(sum, sum_high_64);
-        __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2));
-        sum = _mm_add_epi32(sum, sum_second_32);
-        output[i] = _mm_cvtsi128_si32(sum);
+    private:
+        using BiasType = OutputType;
+        using WeightType = std::int8_t;
 
-  #elif defined(USE_MMX)
-        __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]);
-        __m64 sum_hi = kZeros;
-        const auto row = reinterpret_cast<const __m64*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m64 row_j = row[j];
-          __m64 input_j = input_vector[j];
-          __m64 row_signs = _mm_cmpgt_pi8(kZeros, row_j);
-          __m64 extended_row_lo = _mm_unpacklo_pi8(row_j, row_signs);
-          __m64 extended_row_hi = _mm_unpackhi_pi8(row_j, row_signs);
-          __m64 extended_input_lo = _mm_unpacklo_pi8(input_j, kZeros);
-          __m64 extended_input_hi = _mm_unpackhi_pi8(input_j, kZeros);
-          __m64 product_lo = _mm_madd_pi16(extended_row_lo, extended_input_lo);
-          __m64 product_hi = _mm_madd_pi16(extended_row_hi, extended_input_hi);
-          sum_lo = _mm_add_pi32(sum_lo, product_lo);
-          sum_hi = _mm_add_pi32(sum_hi, product_hi);
-        }
-        __m64 sum = _mm_add_pi32(sum_lo, sum_hi);
-        sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
-        output[i] = _mm_cvtsi64_si32(sum);
+        // Make the learning class a friend
+        friend class Trainer<AffineTransform>;
 
-  #elif defined(USE_NEON)
-        int32x4_t sum = {biases_[i]};
-        const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-          int16x8_t product = vmull_s8(input_vector[j * 2], row[j * 2]);
-          product = vmlal_s8(product, input_vector[j * 2 + 1], row[j * 2 + 1]);
-          sum = vpadalq_s16(sum, product);
-        }
-        output[i] = sum[0] + sum[1] + sum[2] + sum[3];
+        PreviousLayer previous_layer_;
 
-  #else
-        OutputType sum = biases_[i];
-        for (IndexType j = 0; j < kInputDimensions; ++j) {
-          sum += weights_[offset + j] * input[j];
-        }
-        output[i] = sum;
-  #endif
-
-      }
-  #if defined(USE_MMX)
-      _mm_empty();
-  #endif
-      return output;
-    }
-
-   private:
-    using BiasType = OutputType;
-    using WeightType = std::int8_t;
-
-    // Make the learning class a friend
-    friend class Trainer<AffineTransform>;
-
-    PreviousLayer previous_layer_;
-
-    alignas(kCacheLineSize) BiasType biases_[kOutputDimensions];
-    alignas(kCacheLineSize)
-        WeightType weights_[kOutputDimensions * kPaddedInputDimensions];
-  };
+        alignas(kCacheLineSize) BiasType biases_[kOutputDimensions];
+        alignas(kCacheLineSize) WeightType weights_[kOutputDimensions * kPaddedInputDimensions];
+    };
 
 }  // namespace Eval::NNUE::Layers
 
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index d923986e..0846f3df 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // Definition of layer ClippedReLU of NNUE evaluation function
@@ -21,160 +21,169 @@
 #ifndef NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
 #define NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
 
-#include "../nnue_common.h"
+#include "nnue/nnue_common.h"
+
+#include <string>
+#include <cstdint>
+#include <type_traits>
 
 namespace Eval::NNUE::Layers {
 
-  // Clipped ReLU
-  template <typename PreviousLayer>
-  class ClippedReLU {
-   public:
-    // Input/output type
-    using InputType = typename PreviousLayer::OutputType;
-    using OutputType = std::uint8_t;
-    static_assert(std::is_same<InputType, std::int32_t>::value, "");
+    // Clipped ReLU
+    template <typename PreviousLayer>
+    class ClippedReLU {
+    public:
+        // Input/output type
+        using InputType = typename PreviousLayer::OutputType;
 
-    // Number of input/output dimensions
-    static constexpr IndexType kInputDimensions =
-        PreviousLayer::kOutputDimensions;
-    static constexpr IndexType kOutputDimensions = kInputDimensions;
+        using OutputType = std::uint8_t;
 
-    // Size of forward propagation buffer used in this layer
-    static constexpr std::size_t kSelfBufferSize =
-        CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+        static_assert(std::is_same<InputType, std::int32_t>::value, "");
 
-    // Size of the forward propagation buffer used from the input layer to this layer
-    static constexpr std::size_t kBufferSize =
-        PreviousLayer::kBufferSize + kSelfBufferSize;
+        // Number of input/output dimensions
+        static constexpr IndexType kInputDimensions =
+            PreviousLayer::kOutputDimensions;
 
-    // Hash value embedded in the evaluation file
-    static constexpr std::uint32_t GetHashValue() {
-      std::uint32_t hash_value = 0x538D24C7u;
-      hash_value += PreviousLayer::GetHashValue();
-      return hash_value;
-    }
+        static constexpr IndexType kOutputDimensions = kInputDimensions;
 
-    // A string that represents the structure from the input layer to this layer
-    static std::string GetStructureString() {
-      return "ClippedReLU[" +
-        std::to_string(kOutputDimensions) + "](" +
-        PreviousLayer::GetStructureString() + ")";
-    }
+        // Size of forward propagation buffer used in this layer
+        static constexpr std::size_t kSelfBufferSize =
+            CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
 
-    // Read network parameters
-    bool ReadParameters(std::istream& stream) {
-      return previous_layer_.ReadParameters(stream);
-    }
+        // Size of the forward propagation buffer used from the input layer to this layer
+        static constexpr std::size_t kBufferSize =
+            PreviousLayer::kBufferSize + kSelfBufferSize;
 
-    // write parameters
-    bool WriteParameters(std::ostream& stream) const {
-      return previous_layer_.WriteParameters(stream);
-    }
+        // Hash value embedded in the evaluation file
+        static constexpr std::uint32_t GetHashValue() {
+            std::uint32_t hash_value = 0x538D24C7u;
+            hash_value += PreviousLayer::GetHashValue();
+            return hash_value;
+        }
 
-    // Forward propagation
-    const OutputType* Propagate(
-        const TransformedFeatureType* transformed_features, char* buffer) const {
-      const auto input = previous_layer_.Propagate(
-          transformed_features, buffer + kSelfBufferSize);
-      const auto output = reinterpret_cast<OutputType*>(buffer);
+        // A string that represents the structure from the input layer to this layer
+        static std::string GetStructureString() {
+            return "ClippedReLU[" +
+                std::to_string(kOutputDimensions) + "](" +
+                PreviousLayer::GetStructureString() + ")";
+        }
 
-  #if defined(USE_AVX2)
-      constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
-      const __m256i kZero = _mm256_setzero_si256();
-      const __m256i kOffsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
-      const auto in = reinterpret_cast<const __m256i*>(input);
-      const auto out = reinterpret_cast<__m256i*>(output);
-      for (IndexType i = 0; i < kNumChunks; ++i) {
-        const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
-            _mm256_loadA_si256(&in[i * 4 + 0]),
-            _mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits);
-        const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
-            _mm256_loadA_si256(&in[i * 4 + 2]),
-            _mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits);
-        _mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
-            _mm256_packs_epi16(words0, words1), kZero), kOffsets));
-      }
-      constexpr IndexType kStart = kNumChunks * kSimdWidth;
+        // Read network parameters
+        bool ReadParameters(std::istream& stream) {
+            return previous_layer_.ReadParameters(stream);
+        }
 
-  #elif defined(USE_SSE2)
-      constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+        // write parameters
+        bool WriteParameters(std::ostream& stream) const {
+            return previous_layer_.WriteParameters(stream);
+        }
 
-  #ifdef USE_SSE41
-      const __m128i kZero = _mm_setzero_si128();
-  #else
-      const __m128i k0x80s = _mm_set1_epi8(-128);
-  #endif
+        // Forward propagation
+        const OutputType* Propagate(
+            const TransformedFeatureType* transformed_features, char* buffer) const {
 
-      const auto in = reinterpret_cast<const __m128i*>(input);
-      const auto out = reinterpret_cast<__m128i*>(output);
-      for (IndexType i = 0; i < kNumChunks; ++i) {
-        const __m128i words0 = _mm_srai_epi16(_mm_packs_epi32(
-            _mm_load_si128(&in[i * 4 + 0]),
-            _mm_load_si128(&in[i * 4 + 1])), kWeightScaleBits);
-        const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32(
-            _mm_load_si128(&in[i * 4 + 2]),
-            _mm_load_si128(&in[i * 4 + 3])), kWeightScaleBits);
-        const __m128i packedbytes = _mm_packs_epi16(words0, words1);
-        _mm_store_si128(&out[i],
+            const auto input = previous_layer_.Propagate(
+                transformed_features, buffer + kSelfBufferSize);
+            const auto output = reinterpret_cast<OutputType*>(buffer);
 
-  #ifdef USE_SSE41
-          _mm_max_epi8(packedbytes, kZero)
-  #else
-          _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
-  #endif
+#if defined(USE_AVX2)
+            constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+            const __m256i kZero = _mm256_setzero_si256();
+            const __m256i kOffsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+            const auto in = reinterpret_cast<const __m256i*>(input);
+            const auto out = reinterpret_cast<__m256i*>(output);
+            for (IndexType i = 0; i < kNumChunks; ++i) {
+                const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
+                    _mm256_loadA_si256(&in[i * 4 + 0]),
+                    _mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits);
+                const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
+                    _mm256_loadA_si256(&in[i * 4 + 2]),
+                    _mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits);
+                _mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+                    _mm256_packs_epi16(words0, words1), kZero), kOffsets));
+            }
 
-        );
-      }
-      constexpr IndexType kStart = kNumChunks * kSimdWidth;
+            constexpr IndexType kStart = kNumChunks * kSimdWidth;
 
-  #elif defined(USE_MMX)
-      constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
-      const __m64 k0x80s = _mm_set1_pi8(-128);
-      const auto in = reinterpret_cast<const __m64*>(input);
-      const auto out = reinterpret_cast<__m64*>(output);
-      for (IndexType i = 0; i < kNumChunks; ++i) {
-        const __m64 words0 = _mm_srai_pi16(
-            _mm_packs_pi32(in[i * 4 + 0], in[i * 4 + 1]),
-            kWeightScaleBits);
-        const __m64 words1 = _mm_srai_pi16(
-            _mm_packs_pi32(in[i * 4 + 2], in[i * 4 + 3]),
-            kWeightScaleBits);
-        const __m64 packedbytes = _mm_packs_pi16(words0, words1);
-        out[i] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
-      }
-      _mm_empty();
-      constexpr IndexType kStart = kNumChunks * kSimdWidth;
+#elif defined(USE_SSE2)
+            constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
 
-  #elif defined(USE_NEON)
-      constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2);
-      const int8x8_t kZero = {0};
-      const auto in = reinterpret_cast<const int32x4_t*>(input);
-      const auto out = reinterpret_cast<int8x8_t*>(output);
-      for (IndexType i = 0; i < kNumChunks; ++i) {
-        int16x8_t shifted;
-        const auto pack = reinterpret_cast<int16x4_t*>(&shifted);
-        pack[0] = vqshrn_n_s32(in[i * 2 + 0], kWeightScaleBits);
-        pack[1] = vqshrn_n_s32(in[i * 2 + 1], kWeightScaleBits);
-        out[i] = vmax_s8(vqmovn_s16(shifted), kZero);
-      }
-      constexpr IndexType kStart = kNumChunks * (kSimdWidth / 2);
-  #else
-      constexpr IndexType kStart = 0;
-  #endif
+#if defined(USE_SSE41)
+            const __m128i kZero = _mm_setzero_si128();
+#else
+            const __m128i k0x80s = _mm_set1_epi8(-128);
+#endif
 
-      for (IndexType i = kStart; i < kInputDimensions; ++i) {
-        output[i] = static_cast<OutputType>(
-            std::max(0, std::min(127, input[i] >> kWeightScaleBits)));
-      }
-      return output;
-    }
+            const auto in = reinterpret_cast<const __m128i*>(input);
+            const auto out = reinterpret_cast<__m128i*>(output);
+            for (IndexType i = 0; i < kNumChunks; ++i) {
+                const __m128i words0 = _mm_srai_epi16(_mm_packs_epi32(
+                    _mm_load_si128(&in[i * 4 + 0]),
+                    _mm_load_si128(&in[i * 4 + 1])), kWeightScaleBits);
+                const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32(
+                    _mm_load_si128(&in[i * 4 + 2]),
+                    _mm_load_si128(&in[i * 4 + 3])), kWeightScaleBits);
+                const __m128i packedbytes = _mm_packs_epi16(words0, words1);
+                _mm_store_si128(&out[i],
 
-   private:
-     // Make the learning class a friend
-     friend class Trainer<ClippedReLU>;
-     
-    PreviousLayer previous_layer_;
-  };
+#if defined(USE_SSE41)
+                    _mm_max_epi8(packedbytes, kZero)
+            #else
+                    _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
+#endif
+
+                );
+            }
+            constexpr IndexType kStart = kNumChunks * kSimdWidth;
+
+#elif defined(USE_MMX)
+            constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+            const __m64 k0x80s = _mm_set1_pi8(-128);
+            const auto in = reinterpret_cast<const __m64*>(input);
+            const auto out = reinterpret_cast<__m64*>(output);
+            for (IndexType i = 0; i < kNumChunks; ++i) {
+                const __m64 words0 = _mm_srai_pi16(
+                    _mm_packs_pi32(in[i * 4 + 0], in[i * 4 + 1]),
+                    kWeightScaleBits);
+                const __m64 words1 = _mm_srai_pi16(
+                    _mm_packs_pi32(in[i * 4 + 2], in[i * 4 + 3]),
+                    kWeightScaleBits);
+                const __m64 packedbytes = _mm_packs_pi16(words0, words1);
+                out[i] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+            }
+            _mm_empty();
+            constexpr IndexType kStart = kNumChunks * kSimdWidth;
+
+#elif defined(USE_NEON)
+            constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2);
+            const int8x8_t kZero = {0};
+            const auto in = reinterpret_cast<const int32x4_t*>(input);
+            const auto out = reinterpret_cast<int8x8_t*>(output);
+            for (IndexType i = 0; i < kNumChunks; ++i) {
+                int16x8_t shifted;
+                const auto pack = reinterpret_cast<int16x4_t*>(&shifted);
+                pack[0] = vqshrn_n_s32(in[i * 2 + 0], kWeightScaleBits);
+                pack[1] = vqshrn_n_s32(in[i * 2 + 1], kWeightScaleBits);
+                out[i] = vmax_s8(vqmovn_s16(shifted), kZero);
+            }
+            constexpr IndexType kStart = kNumChunks * (kSimdWidth / 2);
+#else
+            constexpr IndexType kStart = 0;
+#endif
+
+            for (IndexType i = kStart; i < kInputDimensions; ++i) {
+                output[i] = static_cast<OutputType>(
+                    std::max(0, std::min(127, input[i] >> kWeightScaleBits)));
+            }
+            return output;
+        }
+
+    private:
+        // Make the learning class a friend
+        friend class Trainer<ClippedReLU>;
+
+        PreviousLayer previous_layer_;
+    };
 
 }  // namespace Eval::NNUE::Layers
 
diff --git a/src/nnue/layers/input_slice.h b/src/nnue/layers/input_slice.h
index 78756a39..9d9476a5 100644
--- a/src/nnue/layers/input_slice.h
+++ b/src/nnue/layers/input_slice.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // NNUE evaluation function layer InputSlice definition
@@ -21,59 +21,63 @@
 #ifndef NNUE_LAYERS_INPUT_SLICE_H_INCLUDED
 #define NNUE_LAYERS_INPUT_SLICE_H_INCLUDED
 
-#include "../nnue_common.h"
+#include "nnue/nnue_common.h"
+
+#include <string>
+#include <cstdint>
 
 namespace Eval::NNUE::Layers {
 
-// Input layer
-template <IndexType OutputDimensions, IndexType Offset = 0>
-class InputSlice {
- public:
-  // Need to maintain alignment
-  static_assert(Offset % kMaxSimdWidth == 0, "");
+  // Input layer
+  template <IndexType OutputDimensions, IndexType Offset = 0>
+  class InputSlice {
+  public:
+      // Need to maintain alignment
+      static_assert(Offset % kMaxSimdWidth == 0, "");
 
-  // Output type
-  using OutputType = TransformedFeatureType;
+      // Output type
+      using OutputType = TransformedFeatureType;
 
-  // Output dimensionality
-  static constexpr IndexType kOutputDimensions = OutputDimensions;
+      // Output dimensionality
+      static constexpr IndexType kOutputDimensions = OutputDimensions;
 
-  // Size of forward propagation buffer used from the input layer to this layer
-  static constexpr std::size_t kBufferSize = 0;
+      // Size of forward propagation buffer used from the input layer to this layer
+      static constexpr std::size_t kBufferSize = 0;
 
-  // Hash value embedded in the evaluation file
-  static constexpr std::uint32_t GetHashValue() {
-    std::uint32_t hash_value = 0xEC42E90Du;
-    hash_value ^= kOutputDimensions ^ (Offset << 10);
-    return hash_value;
-  }
+      // Hash value embedded in the evaluation file
+      static constexpr std::uint32_t GetHashValue() {
+          std::uint32_t hash_value = 0xEC42E90Du;
+          hash_value ^= kOutputDimensions ^ (Offset << 10);
+          return hash_value;
+      }
 
-  // A string that represents the structure from the input layer to this layer
-  static std::string GetStructureString() {
-    return "InputSlice[" + std::to_string(kOutputDimensions) + "(" +
-      std::to_string(Offset) + ":" +
-      std::to_string(Offset + kOutputDimensions) + ")]";
-  }
+      // A string that represents the structure from the input layer to this layer
+      static std::string GetStructureString() {
+          return "InputSlice[" + std::to_string(kOutputDimensions) + "(" +
+              std::to_string(Offset) + ":" +
+              std::to_string(Offset + kOutputDimensions) + ")]";
+      }
 
-  // Read network parameters
-  bool ReadParameters(std::istream& /*stream*/) {
-    return true;
-  }
+      // Read network parameters
+      bool ReadParameters(std::istream& /*stream*/) {
+          return true;
+      }
 
-  // write parameters
-  bool WriteParameters(std::ostream& /*stream*/) const {
-    return true;
-  }
+      // write parameters
+      bool WriteParameters(std::ostream& /*stream*/) const {
+          return true;
+      }
 
-  // Forward propagation
-  const OutputType* Propagate(
-      const TransformedFeatureType* transformed_features,
-      char* /*buffer*/) const {
-    return transformed_features + Offset;
-  }
+      // Forward propagation
+      const OutputType* Propagate(
+          const TransformedFeatureType* transformed_features,
+          char* /*buffer*/) const {
 
- private:
-};
+          return transformed_features + Offset;
+      }
+
+  private:
+  };
 
 }  // namespace Layers
 
diff --git a/src/nnue/layers/sum.h b/src/nnue/layers/sum.h
index 419ced89..c81f5850 100644
--- a/src/nnue/layers/sum.h
+++ b/src/nnue/layers/sum.h
@@ -1,159 +1,166 @@
-﻿// Definition of layer Sum of NNUE evaluation function
-
-#ifndef _NNUE_LAYERS_SUM_H_
+﻿#ifndef _NNUE_LAYERS_SUM_H_
 #define _NNUE_LAYERS_SUM_H_
 
-#include "../nnue_common.h"
+#include "nnue/nnue_common.h"
 
-namespace Eval {
+// Definition of layer Sum of NNUE evaluation function
+namespace Eval::NNUE::Layers {
 
-namespace NNUE {
+    // Layer that sums the output of multiple layers
+    template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
+    class Sum : public Sum<RemainingPreviousLayers...> {
+    private:
+        using Head = FirstPreviousLayer;
+        using Tail = Sum<RemainingPreviousLayers...>;
 
-namespace Layers {
+     public:
+        // Input/output type
+        using InputType = typename Head::OutputType;
 
-// Layer that sums the output of multiple layers
-template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
-class Sum : public Sum<RemainingPreviousLayers...> {
- private:
-  using Head = FirstPreviousLayer;
-  using Tail = Sum<RemainingPreviousLayers...>;
+        using OutputType = InputType;
 
- public:
-  // Input/output type
-  using InputType = typename Head::OutputType;
-  using OutputType = InputType;
-  static_assert(std::is_same<InputType, typename Tail::InputType>::value, "");
+        static_assert(std::is_same<InputType, typename Tail::InputType>::value, "");
 
-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions = Head::kOutputDimensions;
-  static constexpr IndexType kOutputDimensions = kInputDimensions;
-  static_assert(kInputDimensions == Tail::kInputDimensions ,"");
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions = Head::kOutputDimensions;
 
-  // Size of forward propagation buffer used in this layer
-  static constexpr std::size_t kSelfBufferSize =
-      CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+        static constexpr IndexType kOutputDimensions = kInputDimensions;
 
-  // Size of the forward propagation buffer used from the input layer to this layer
-  static constexpr std::size_t kBufferSize =
-      std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize);
+        static_assert(kInputDimensions == Tail::kInputDimensions ,"");
 
-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t GetHashValue() {
-    std::uint32_t hash_value = 0xBCE400B4u;
-    hash_value ^= Head::GetHashValue() >> 1;
-    hash_value ^= Head::GetHashValue() << 31;
-    hash_value ^= Tail::GetHashValue() >> 2;
-    hash_value ^= Tail::GetHashValue() << 30;
-    return hash_value;
-  }
+        // Size of forward propagation buffer used in this layer
+        static constexpr std::size_t kSelfBufferSize =
+            CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
 
-  // A string that represents the structure from the input layer to this layer
-  static std::string GetStructureString() {
-    return "Sum[" +
-        std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
-  }
+        // Size of the forward propagation buffer used from the input layer to this layer
+        static constexpr std::size_t kBufferSize =
+            std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize);
 
-  // read parameters
-  bool ReadParameters(std::istream& stream) {
-    if (!Tail::ReadParameters(stream)) return false;
-    return previous_layer_.ReadParameters(stream);
-  }
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t GetHashValue() {
+            std::uint32_t hash_value = 0xBCE400B4u;
+            hash_value ^= Head::GetHashValue() >> 1;
+            hash_value ^= Head::GetHashValue() << 31;
+            hash_value ^= Tail::GetHashValue() >> 2;
+            hash_value ^= Tail::GetHashValue() << 30;
+            return hash_value;
+        }
 
-  // write parameters
-  bool WriteParameters(std::ostream& stream) const {
-    if (!Tail::WriteParameters(stream)) return false;
-    return previous_layer_.WriteParameters(stream);
-  }
+        // A string that represents the structure from the input layer to this layer
+        static std::string GetStructureString() {
+            return "Sum[" +
+                std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
+        }
 
-  // forward propagation
-  const OutputType* Propagate(
-      const TransformedFeatureType* transformed_features, char* buffer) const {
-    Tail::Propagate(transformed_features, buffer);
-    const auto head_output = previous_layer_.Propagate(
-        transformed_features, buffer + kSelfBufferSize);
-    const auto output = reinterpret_cast<OutputType*>(buffer);
-    for (IndexType i = 0; i <kOutputDimensions; ++i) {
-      output[i] += head_output[i];
-    }
-    return output;
-  }
+        // read parameters
+        bool ReadParameters(std::istream& stream) {
+            if (!Tail::ReadParameters(stream))
+                return false;
 
- protected:
-  // A string that represents the list of layers to be summed
-  static std::string GetSummandsString() {
-    return Head::GetStructureString() + "," + Tail::GetSummandsString();
-  }
+            return previous_layer_.ReadParameters(stream);
+        }
 
-  // Make the learning class a friend
-  friend class Trainer<Sum>;
+        // write parameters
+        bool WriteParameters(std::ostream& stream) const {
+            if (!Tail::WriteParameters(stream))
+                return false;
 
-  // the layer immediately before this layer
-  FirstPreviousLayer previous_layer_;
-};
+            return previous_layer_.WriteParameters(stream);
+        }
 
-// Layer that sums the output of multiple layers (when there is one template argument)
-template <typename PreviousLayer>
-class Sum<PreviousLayer> {
- public:
-  // Input/output type
-  using InputType = typename PreviousLayer::OutputType;
-  using OutputType = InputType;
+        // forward propagation
+        const OutputType* Propagate(
+            const TransformedFeatureType* transformed_features, char* buffer) const {
 
-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions =
-      PreviousLayer::kOutputDimensions;
-  static constexpr IndexType kOutputDimensions = kInputDimensions;
+            Tail::Propagate(transformed_features, buffer);
 
-  // Size of the forward propagation buffer used from the input layer to this layer
-  static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize;
+            const auto head_output = previous_layer_.Propagate(
+                transformed_features, buffer + kSelfBufferSize);
 
-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t GetHashValue() {
-    std::uint32_t hash_value = 0xBCE400B4u;
-    hash_value ^= PreviousLayer::GetHashValue() >> 1;
-    hash_value ^= PreviousLayer::GetHashValue() << 31;
-    return hash_value;
-  }
+            const auto output = reinterpret_cast<OutputType*>(buffer);
 
-  // A string that represents the structure from the input layer to this layer
-  static std::string GetStructureString() {
-    return "Sum[" +
-        std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
-  }
+            for (IndexType i = 0; i <kOutputDimensions; ++i) {
+                output[i] += head_output[i];
+            }
 
-  // read parameters
-  bool ReadParameters(std::istream& stream) {
-    return previous_layer_.ReadParameters(stream);
-  }
+            return output;
+        }
 
-  // write parameters
-  bool WriteParameters(std::ostream& stream) const {
-    return previous_layer_.WriteParameters(stream);
-  }
+    protected:
+        // A string that represents the list of layers to be summed
+        static std::string GetSummandsString() {
+            return Head::GetStructureString() + "," + Tail::GetSummandsString();
+        }
 
-  // forward propagation
-  const OutputType* Propagate(
-      const TransformedFeatureType* transformed_features, char* buffer) const {
-    return previous_layer_.Propagate(transformed_features, buffer);
-  }
+        // Make the learning class a friend
+        friend class Trainer<Sum>;
 
- protected:
-  // A string that represents the list of layers to be summed
-  static std::string GetSummandsString() {
-    return PreviousLayer::GetStructureString();
-  }
+        // the layer immediately before this layer
+        FirstPreviousLayer previous_layer_;
+    };
 
-  // Make the learning class a friend
-  friend class Trainer<Sum>;
+    // Layer that sums the output of multiple layers (when there is one template argument)
+    template <typename PreviousLayer>
+    class Sum<PreviousLayer> {
+    public:
+        // Input/output type
+        using InputType = typename PreviousLayer::OutputType;
 
-  // the layer immediately before this layer
-  PreviousLayer previous_layer_;
-};
+        using OutputType = InputType;
 
-}  // namespace Layers
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions =
+            PreviousLayer::kOutputDimensions;
 
-}  // namespace NNUE
+        static constexpr IndexType kOutputDimensions = kInputDimensions;
 
-}  // namespace Eval
+        // Size of the forward propagation buffer used from the input layer to this layer
+        static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize;
+
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t GetHashValue() {
+            std::uint32_t hash_value = 0xBCE400B4u;
+            hash_value ^= PreviousLayer::GetHashValue() >> 1;
+            hash_value ^= PreviousLayer::GetHashValue() << 31;
+            return hash_value;
+        }
+
+        // A string that represents the structure from the input layer to this layer
+        static std::string GetStructureString() {
+            return "Sum[" +
+                std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
+        }
+
+        // read parameters
+        bool ReadParameters(std::istream& stream) {
+            return previous_layer_.ReadParameters(stream);
+        }
+
+        // write parameters
+        bool WriteParameters(std::ostream& stream) const {
+            return previous_layer_.WriteParameters(stream);
+        }
+
+        // forward propagation
+        const OutputType* Propagate(
+            const TransformedFeatureType* transformed_features, char* buffer) const {
+
+            return previous_layer_.Propagate(transformed_features, buffer);
+        }
+
+    protected:
+        // A string that represents the list of layers to be summed
+        static std::string GetSummandsString() {
+            return PreviousLayer::GetStructureString();
+        }
+
+        // Make the learning class a friend
+        friend class Trainer<Sum>;
+
+        // the layer immediately before this layer
+        PreviousLayer previous_layer_;
+    };
+
+}  // namespace Eval::NNUE::Layers
 
 #endif

From 281d520cc2bb0123efd230fce45119b57f0bae0d Mon Sep 17 00:00:00 2001
From: mstembera <MissingEmail@email>
Date: Sun, 18 Oct 2020 04:23:28 -0700
Subject: [PATCH 210/398] Update default net to nn-eba324f53044.nnue

The new net is based on the previous net 04cf2b4ed1da but with the biases
for the 1st hidden layer tuned SPSA, see the SPSA session on fishtest there:
https://tests.stockfishchess.org/tests/view/5f875213dcdad978fe8c5211

Thanks to @vondele for writing out the net, see discussion in this thread:
https://github.com/mstembera/Stockfish/commit/432da86721647dff1d9426a7cdcfd2dbada8155e

Passed STC:
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 15000 W: 1640 L: 1483 D: 11877
Ptnml(0-2): 50, 1183, 4908, 1278, 81
https://tests.stockfishchess.org/tests/view/5f8955e20fea1a44ec4f0a5d

Passed LTC:
LLR: 2.96 (-2.94,2.94) {0.25,1.25}
Total: 81272 W: 3948 L: 3682 D: 73642
Ptnml(0-2): 64, 3194, 33856, 3456, 66
https://tests.stockfishchess.org/tests/view/5f89e8efeae8a6e60644d6e7

closes https://github.com/official-stockfish/Stockfish/pull/3187

Bench: 3762411
---
 src/evaluate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/evaluate.h b/src/evaluate.h
index 6a17f284..6a8603ad 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -36,7 +36,7 @@ namespace Eval {
   // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work. Do not change the
   // name of the macro, as it is used in the Makefile.
-  #define EvalFileDefaultName   "nn-04cf2b4ed1da.nnue"
+  #define EvalFileDefaultName   "nn-eba324f53044.nnue"
 
   namespace NNUE {
 

From 560c776397483feaaa0deb5b666f46ff3f5b655f Mon Sep 17 00:00:00 2001
From: Vizvezdenec <Vizvezdenec@gmail.com>
Date: Sat, 17 Oct 2020 13:40:10 +0200
Subject: [PATCH 211/398] Do more reductions for late quiet moves in case of
 consecutive fail highs.

Idea of this patch can be described as following - in case we have consecutive fail highs and we reach late enough moves at root node probability of remaining quiet moves being able to produce even bigger value than moves that produced previous cutoff (so ones that should be high in move ordering but now they fail to produce beta cutoff because we actually reached high move count) should be quiet small so we can reduce them more.

passed STC
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 53392 W: 5681 L: 5474 D: 42237
Ptnml(0-2): 214, 4104, 17894, 4229, 255
https://tests.stockfishchess.org/tests/view/5f88501adcdad978fe8c527e

passed LTC
LLR: 2.94 (-2.94,2.94) {0.25,1.25}
Total: 59136 W: 2773 L: 2564 D: 53799
Ptnml(0-2): 30, 2117, 25078, 2300, 43
https://tests.stockfishchess.org/tests/view/5f884dbfdcdad978fe8c527a

closes https://github.com/official-stockfish/Stockfish/pull/3184

Bench: 4066972
---
 src/search.cpp | 5 ++++-
 src/thread.h   | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/search.cpp b/src/search.cpp
index eaa79fb9..ab58ca64 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -417,7 +417,7 @@ void Thread::search() {
           // Start with a small aspiration window and, in the case of a fail
           // high/low, re-search with a bigger window until we don't fail
           // high/low anymore.
-          int failedHighCnt = 0;
+          failedHighCnt = 0;
           while (true)
           {
               Depth adjustedDepth = std::max(1, rootDepth - failedHighCnt - searchAgainCounter);
@@ -1177,6 +1177,9 @@ moves_loop: // When in check, search starts from here
               if (ttCapture)
                   r++;
 
+              // Increase reduction at root if failing high
+              r += rootNode ? thisThread->failedHighCnt * thisThread->failedHighCnt * moveCount / 512 : 0;
+
               // Increase reduction for cut nodes (~10 Elo)
               if (cutNode)
                   r += 2;
diff --git a/src/thread.h b/src/thread.h
index 34b99015..6a73423b 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -73,6 +73,7 @@ public:
   CapturePieceToHistory captureHistory;
   ContinuationHistory continuationHistory[2][2];
   Score contempt;
+  int failedHighCnt;
 };
 
 
From ea8eb415de3bea3f6943e9257b747721741e4197 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 21:00:05 +0200
Subject: [PATCH 212/398] Cleanup trainer features.

---
 src/nnue/trainer/features/factorizer.h        | 181 +++++++++---------
 .../trainer/features/factorizer_feature_set.h | 171 +++++++++--------
 .../trainer/features/factorizer_half_kp.h     | 155 ++++++++-------
 3 files changed, 256 insertions(+), 251 deletions(-)

diff --git a/src/nnue/trainer/features/factorizer.h b/src/nnue/trainer/features/factorizer.h
index 43950de2..784fe047 100644
--- a/src/nnue/trainer/features/factorizer.h
+++ b/src/nnue/trainer/features/factorizer.h
@@ -1,106 +1,109 @@
-﻿// NNUE evaluation function feature conversion class template
-
-#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_H_
+﻿#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_H_
 
-#include "../../nnue_common.h"
-#include "../trainer.h"
+#include "nnue/nnue_common.h"
 
-namespace Eval {
+#include "nnue/trainer/trainer.h"
 
-namespace NNUE {
+// NNUE evaluation function feature conversion class template
+namespace Eval::NNUE::Features {
 
-namespace Features {
+    // Class template that converts input features into learning features
+    // By default, the learning feature is the same as the original input feature, and specialized as necessary
+    template <typename FeatureType>
+    class Factorizer {
+    public:
+        // Get the dimensionality of the learning feature
+        static constexpr IndexType GetDimensions() {
+            return FeatureType::kDimensions;
+        }
 
-// Class template that converts input features into learning features
-// By default, the learning feature is the same as the original input feature, and specialized as necessary
-template <typename FeatureType>
-class Factorizer {
- public:
-  // Get the dimensionality of the learning feature
-  static constexpr IndexType GetDimensions() {
-    return FeatureType::kDimensions;
-  }
+        // Get index of learning feature and scale of learning rate
+        static void AppendTrainingFeatures(
+            IndexType base_index, std::vector<TrainingFeature>* training_features) {
 
-  // Get index of learning feature and scale of learning rate
-  static void AppendTrainingFeatures(
-      IndexType base_index, std::vector<TrainingFeature>* training_features) {
-    assert(base_index <FeatureType::kDimensions);
-    training_features->emplace_back(base_index);
-  }
-};
+            assert(base_index <FeatureType::kDimensions);
+            training_features->emplace_back(base_index);
+        }
+    };
 
-// Learning feature information
-struct FeatureProperties {
-  bool active;
-  IndexType dimensions;
-};
+    // Learning feature information
+    struct FeatureProperties {
+        bool active;
+        IndexType dimensions;
+    };
 
-// Add the original input features to the learning features
-template <typename FeatureType>
-IndexType AppendBaseFeature(
-    FeatureProperties properties, IndexType base_index,
-    std::vector<TrainingFeature>* training_features) {
-  assert(properties.dimensions == FeatureType::kDimensions);
-  assert(base_index < FeatureType::kDimensions);
-  training_features->emplace_back(base_index);
-  return properties.dimensions;
-}
+    // Add the original input features to the learning features
+    template <typename FeatureType>
+    IndexType AppendBaseFeature(
+        FeatureProperties properties, IndexType base_index,
+        std::vector<TrainingFeature>* training_features) {
 
-// If the learning rate scale is not 0, inherit other types of learning features
-template <typename FeatureType>
-IndexType InheritFeaturesIfRequired(
-    IndexType index_offset, FeatureProperties properties, IndexType base_index,
-    std::vector<TrainingFeature>* training_features) {
-  if (!properties.active) {
-    return 0;
-  }
-  assert(properties.dimensions == Factorizer<FeatureType>::GetDimensions());
-  assert(base_index < FeatureType::kDimensions);
-  const auto start = training_features->size();
-  Factorizer<FeatureType>::AppendTrainingFeatures(
-      base_index, training_features);
-  for (auto i = start; i < training_features->size(); ++i) {
-    auto& feature = (*training_features)[i];
-    assert(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
-    feature.ShiftIndex(index_offset);
-  }
-  return properties.dimensions;
-}
-
-// Return the index difference as needed, without adding learning features
-// Call instead of InheritFeaturesIfRequired() if there are no corresponding features
-IndexType SkipFeatures(FeatureProperties properties) {
-  if (!properties.active) {
-    return 0;
-  }
-  return properties.dimensions;
-}
-
-// Get the dimensionality of the learning feature
-template <std::size_t N>
-constexpr IndexType GetActiveDimensions(
-    const FeatureProperties (&properties)[N]) {
-  static_assert(N > 0, "");
-  IndexType dimensions = properties[0].dimensions;
-  for (std::size_t i = 1; i < N; ++i) {
-    if (properties[i].active) {
-      dimensions += properties[i].dimensions;
+        assert(properties.dimensions == FeatureType::kDimensions);
+        assert(base_index < FeatureType::kDimensions);
+        training_features->emplace_back(base_index);
+        return properties.dimensions;
     }
-  }
-  return dimensions;
-}
 
-// get the number of elements in the array
-template <typename T, std::size_t N>
-constexpr std::size_t GetArrayLength(const T (&/*array*/)[N]) {
-  return N;
-}
+    // If the learning rate scale is not 0, inherit other types of learning features
+    template <typename FeatureType>
+    IndexType InheritFeaturesIfRequired(
+        IndexType index_offset, FeatureProperties properties, IndexType base_index,
+        std::vector<TrainingFeature>* training_features) {
 
-}  // namespace Features
+        if (!properties.active) {
+            return 0;
+        }
 
-}  // namespace NNUE
+        assert(properties.dimensions == Factorizer<FeatureType>::GetDimensions());
+        assert(base_index < FeatureType::kDimensions);
 
-}  // namespace Eval
+        const auto start = training_features->size();
+        Factorizer<FeatureType>::AppendTrainingFeatures(
+            base_index, training_features);
+
+        for (auto i = start; i < training_features->size(); ++i) {
+            auto& feature = (*training_features)[i];
+            assert(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
+            feature.ShiftIndex(index_offset);
+        }
+
+        return properties.dimensions;
+    }
+
+    // Return the index difference as needed, without adding learning features
+    // Call instead of InheritFeaturesIfRequired() if there are no corresponding features
+    IndexType SkipFeatures(FeatureProperties properties) {
+        if (!properties.active)
+            return 0;
+
+        return properties.dimensions;
+    }
+
+    // Get the dimensionality of the learning feature
+    template <std::size_t N>
+    constexpr IndexType GetActiveDimensions(
+        const FeatureProperties (&properties)[N]) {
+
+        static_assert(N > 0, "");
+
+        IndexType dimensions = properties[0].dimensions;
+
+        for (std::size_t i = 1; i < N; ++i) {
+            if (properties[i].active) {
+                dimensions += properties[i].dimensions;
+            }
+        }
+
+        return dimensions;
+    }
+
+    // get the number of elements in the array
+    template <typename T, std::size_t N>
+    constexpr std::size_t GetArrayLength(const T (&/*array*/)[N]) {
+        return N;
+    }
+
+}  // namespace Eval::NNUE::Features
 
 #endif
diff --git a/src/nnue/trainer/features/factorizer_feature_set.h b/src/nnue/trainer/features/factorizer_feature_set.h
index caf6608b..d272a453 100644
--- a/src/nnue/trainer/features/factorizer_feature_set.h
+++ b/src/nnue/trainer/features/factorizer_feature_set.h
@@ -1,100 +1,105 @@
-﻿// Specialization for feature set of feature conversion class template of NNUE evaluation function
-
-#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
+﻿#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
 
-#include "../../features/feature_set.h"
 #include "factorizer.h"
 
-namespace Eval {
+#include "nnue/features/feature_set.h"
 
-namespace NNUE {
+// Specialization for feature set of feature conversion class template of NNUE evaluation function
+namespace Eval::NNUE::Features {
 
-namespace Features {
+    // Class template that converts input features into learning features
+    // Specialization for FeatureSet
+    template <typename FirstFeatureType, typename... RemainingFeatureTypes>
+    class Factorizer<FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
+    private:
+        using Head = Factorizer<FeatureSet<FirstFeatureType>>;
+        using Tail = Factorizer<FeatureSet<RemainingFeatureTypes...>>;
 
-// Class template that converts input features into learning features
-// Specialization for FeatureSet
-template <typename FirstFeatureType, typename... RemainingFeatureTypes>
-class Factorizer<FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
- private:
-  using Head = Factorizer<FeatureSet<FirstFeatureType>>;
-  using Tail = Factorizer<FeatureSet<RemainingFeatureTypes...>>;
+    public:
+        // number of dimensions of original input features
+        static constexpr IndexType kBaseDimensions =
+            FeatureSet<FirstFeatureType, RemainingFeatureTypes...>::kDimensions;
 
- public:
-  // number of dimensions of original input features
-  static constexpr IndexType kBaseDimensions =
-      FeatureSet<FirstFeatureType, RemainingFeatureTypes...>::kDimensions;
-
-  // Get the dimensionality of the learning feature
-  static constexpr IndexType GetDimensions() {
-    return Head::GetDimensions() + Tail::GetDimensions();
-  }
-
-  // Get index of learning feature and scale of learning rate
-  static void AppendTrainingFeatures(
-      IndexType base_index, std::vector<TrainingFeature>* training_features,
-      IndexType base_dimensions = kBaseDimensions) {
-    assert(base_index < kBaseDimensions);
-    constexpr auto boundary = FeatureSet<RemainingFeatureTypes...>::kDimensions;
-    if (base_index < boundary) {
-      Tail::AppendTrainingFeatures(
-          base_index, training_features, base_dimensions);
-    } else {
-      const auto start = training_features->size();
-      Head::AppendTrainingFeatures(
-          base_index - boundary, training_features, base_dimensions);
-      for (auto i = start; i < training_features->size(); ++i) {
-        auto& feature = (*training_features)[i];
-        const auto index = feature.GetIndex();
-        assert(index < Head::GetDimensions() ||
-                   (index >= base_dimensions &&
-                    index < base_dimensions +
-                            Head::GetDimensions() - Head::kBaseDimensions));
-        if (index < Head::kBaseDimensions) {
-          feature.ShiftIndex(Tail::kBaseDimensions);
-        } else {
-          feature.ShiftIndex(Tail::GetDimensions() - Tail::kBaseDimensions);
+        // Get the dimensionality of the learning feature
+        static constexpr IndexType GetDimensions() {
+            return Head::GetDimensions() + Tail::GetDimensions();
         }
-      }
-    }
-  }
-};
 
-// Class template that converts input features into learning features
-// Specialization when FeatureSet has one template argument
-template <typename FeatureType>
-class Factorizer<FeatureSet<FeatureType>> {
-public:
-  // number of dimensions of original input features
-  static constexpr IndexType kBaseDimensions = FeatureType::kDimensions;
+        // Get index of learning feature and scale of learning rate
+        static void AppendTrainingFeatures(
+            IndexType base_index, std::vector<TrainingFeature>* training_features,
+            IndexType base_dimensions = kBaseDimensions) {
 
-  // Get the dimensionality of the learning feature
-  static constexpr IndexType GetDimensions() {
-    return Factorizer<FeatureType>::GetDimensions();
-  }
+            assert(base_index < kBaseDimensions);
 
-  // Get index of learning feature and scale of learning rate
-  static void AppendTrainingFeatures(
-      IndexType base_index, std::vector<TrainingFeature>* training_features,
-      IndexType base_dimensions = kBaseDimensions) {
-    assert(base_index < kBaseDimensions);
-    const auto start = training_features->size();
-    Factorizer<FeatureType>::AppendTrainingFeatures(
-        base_index, training_features);
-    for (auto i = start; i < training_features->size(); ++i) {
-      auto& feature = (*training_features)[i];
-      assert(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
-      if (feature.GetIndex() >= kBaseDimensions) {
-        feature.ShiftIndex(base_dimensions - kBaseDimensions);
-      }
-    }
-  }
-};
+            constexpr auto boundary = FeatureSet<RemainingFeatureTypes...>::kDimensions;
 
-}  // namespace Features
+            if (base_index < boundary) {
+                Tail::AppendTrainingFeatures(
+                    base_index, training_features, base_dimensions);
+            }
+            else {
+                const auto start = training_features->size();
 
-}  // namespace NNUE
+                Head::AppendTrainingFeatures(
+                    base_index - boundary, training_features, base_dimensions);
 
-}  // namespace Eval
+                for (auto i = start; i < training_features->size(); ++i) {
+                    auto& feature = (*training_features)[i];
+                    const auto index = feature.GetIndex();
+
+                    assert(index < Head::GetDimensions() ||
+                               (index >= base_dimensions &&
+                                index < base_dimensions +
+                                        Head::GetDimensions() - Head::kBaseDimensions));
+
+                    if (index < Head::kBaseDimensions) {
+                        feature.ShiftIndex(Tail::kBaseDimensions);
+                    }
+                    else {
+                        feature.ShiftIndex(Tail::GetDimensions() - Tail::kBaseDimensions);
+                    }
+                }
+            }
+        }
+    };
+
+    // Class template that converts input features into learning features
+    // Specialization when FeatureSet has one template argument
+    template <typename FeatureType>
+    class Factorizer<FeatureSet<FeatureType>> {
+    public:
+        // number of dimensions of original input features
+        static constexpr IndexType kBaseDimensions = FeatureType::kDimensions;
+
+        // Get the dimensionality of the learning feature
+        static constexpr IndexType GetDimensions() {
+            return Factorizer<FeatureType>::GetDimensions();
+        }
+
+        // Get index of learning feature and scale of learning rate
+        static void AppendTrainingFeatures(
+            IndexType base_index, std::vector<TrainingFeature>* training_features,
+            IndexType base_dimensions = kBaseDimensions) {
+
+            assert(base_index < kBaseDimensions);
+
+            const auto start = training_features->size();
+
+            Factorizer<FeatureType>::AppendTrainingFeatures(
+                base_index, training_features);
+
+            for (auto i = start; i < training_features->size(); ++i) {
+                auto& feature = (*training_features)[i];
+                assert(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
+                if (feature.GetIndex() >= kBaseDimensions) {
+                    feature.ShiftIndex(base_dimensions - kBaseDimensions);
+                }
+            }
+        }
+    };
+
+}  // namespace Eval::NNUE::Features
 
 #endif
diff --git a/src/nnue/trainer/features/factorizer_half_kp.h b/src/nnue/trainer/features/factorizer_half_kp.h
index 70a6acca..1ed5bdd3 100644
--- a/src/nnue/trainer/features/factorizer_half_kp.h
+++ b/src/nnue/trainer/features/factorizer_half_kp.h
@@ -1,99 +1,96 @@
-﻿// Specialization of NNUE evaluation function feature conversion class template for HalfKP
-
-#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
+﻿#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
 
-#include "../../features/half_kp.h"
-#include "../../features/p.h"
-#include "../../features/half_relative_kp.h"
 #include "factorizer.h"
 
-namespace Eval {
+#include "nnue/features/half_kp.h"
+#include "nnue/features/p.h"
+#include "nnue/features/half_relative_kp.h"
 
-namespace NNUE {
+// Specialization of NNUE evaluation function feature conversion class template for HalfKP
+namespace Eval::NNUE::Features {
 
-namespace Features {
+    // Class template that converts input features into learning features
+    // Specialization for HalfKP
+    template <Side AssociatedKing>
+    class Factorizer<HalfKP<AssociatedKing>> {
+    private:
+        using FeatureType = HalfKP<AssociatedKing>;
 
-// Class template that converts input features into learning features
-// Specialization for HalfKP
-template <Side AssociatedKing>
-class Factorizer<HalfKP<AssociatedKing>> {
- private:
-  using FeatureType = HalfKP<AssociatedKing>;
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions =
+            FeatureType::kMaxActiveDimensions;
 
-  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-  static constexpr IndexType kMaxActiveDimensions =
-      FeatureType::kMaxActiveDimensions;
+        // Type of learning feature
+        enum TrainingFeatureType {
+            kFeaturesHalfKP,
+            kFeaturesHalfK,
+            kFeaturesP,
+            kFeaturesHalfRelativeKP,
+            kNumTrainingFeatureTypes,
+        };
 
-  // Type of learning feature
-  enum TrainingFeatureType {
-    kFeaturesHalfKP,
-    kFeaturesHalfK,
-    kFeaturesP,
-    kFeaturesHalfRelativeKP,
-    kNumTrainingFeatureTypes,
-  };
+        // Learning feature information
+        static constexpr FeatureProperties kProperties[] = {
+            // kFeaturesHalfKP
+            {true, FeatureType::kDimensions},
+            // kFeaturesHalfK
+            {true, SQUARE_NB},
+            // kFeaturesP
+            {true, Factorizer<P>::GetDimensions()},
+            // kFeaturesHalfRelativeKP
+            {true, Factorizer<HalfRelativeKP<AssociatedKing>>::GetDimensions()},
+        };
 
-  // Learning feature information
-  static constexpr FeatureProperties kProperties[] = {
-    // kFeaturesHalfKP
-    {true, FeatureType::kDimensions},
-    // kFeaturesHalfK
-    {true, SQUARE_NB},
-    // kFeaturesP
-    {true, Factorizer<P>::GetDimensions()},
-    // kFeaturesHalfRelativeKP
-    {true, Factorizer<HalfRelativeKP<AssociatedKing>>::GetDimensions()},
-  };
-  static_assert(GetArrayLength(kProperties) == kNumTrainingFeatureTypes, "");
+        static_assert(GetArrayLength(kProperties) == kNumTrainingFeatureTypes, "");
 
- public:
-  // Get the dimensionality of the learning feature
-  static constexpr IndexType GetDimensions() {
-    return GetActiveDimensions(kProperties);
-  }
+    public:
+        // Get the dimensionality of the learning feature
+        static constexpr IndexType GetDimensions() {
+            return GetActiveDimensions(kProperties);
+        }
 
-  // Get index of learning feature and scale of learning rate
-  static void AppendTrainingFeatures(
-      IndexType base_index, std::vector<TrainingFeature>* training_features) {
-    // kFeaturesHalfKP
-    IndexType index_offset = AppendBaseFeature<FeatureType>(
-        kProperties[kFeaturesHalfKP], base_index, training_features);
+        // Get index of learning feature and scale of learning rate
+        static void AppendTrainingFeatures(
+            IndexType base_index, std::vector<TrainingFeature>* training_features) {
 
-    const auto sq_k = static_cast<Square>(base_index / PS_END);
-    const auto p = static_cast<IndexType>(base_index % PS_END);
-    // kFeaturesHalfK
-    {
-      const auto& properties = kProperties[kFeaturesHalfK];
-      if (properties.active) {
-        training_features->emplace_back(index_offset + sq_k);
-        index_offset += properties.dimensions;
-      }
-    }
-    // kFeaturesP
-    index_offset += InheritFeaturesIfRequired<P>(
-        index_offset, kProperties[kFeaturesP], p, training_features);
-    // kFeaturesHalfRelativeKP
-    if (p >= PS_W_PAWN) {
-      index_offset += InheritFeaturesIfRequired<HalfRelativeKP<AssociatedKing>>(
-          index_offset, kProperties[kFeaturesHalfRelativeKP],
-          HalfRelativeKP<AssociatedKing>::MakeIndex(sq_k, p),
-          training_features);
-    } else {
-      index_offset += SkipFeatures(kProperties[kFeaturesHalfRelativeKP]);
-    }
+            // kFeaturesHalfKP
+            IndexType index_offset = AppendBaseFeature<FeatureType>(
+                kProperties[kFeaturesHalfKP], base_index, training_features);
 
-    assert(index_offset == GetDimensions());
-  }
-};
+            const auto sq_k = static_cast<Square>(base_index / PS_END);
+            const auto p = static_cast<IndexType>(base_index % PS_END);
 
-template <Side AssociatedKing>
-constexpr FeatureProperties Factorizer<HalfKP<AssociatedKing>>::kProperties[];
+            // kFeaturesHalfK
+            {
+                const auto& properties = kProperties[kFeaturesHalfK];
+                if (properties.active) {
+                    training_features->emplace_back(index_offset + sq_k);
+                    index_offset += properties.dimensions;
+                }
+            }
 
-}  // namespace Features
+            // kFeaturesP
+            index_offset += InheritFeaturesIfRequired<P>(
+                index_offset, kProperties[kFeaturesP], p, training_features);
+            // kFeaturesHalfRelativeKP
+            if (p >= PS_W_PAWN) {
+                index_offset += InheritFeaturesIfRequired<HalfRelativeKP<AssociatedKing>>(
+                    index_offset, kProperties[kFeaturesHalfRelativeKP],
+                    HalfRelativeKP<AssociatedKing>::MakeIndex(sq_k, p),
+                    training_features);
+            }
+            else {
+                index_offset += SkipFeatures(kProperties[kFeaturesHalfRelativeKP]);
+            }
 
-}  // namespace NNUE
+            assert(index_offset == GetDimensions());
+        }
+    };
 
-}  // namespace Eval
+    template <Side AssociatedKing>
+    constexpr FeatureProperties Factorizer<HalfKP<AssociatedKing>>::kProperties[];
+
+}  // namespace Eval::NNUE::Features
 
 #endif

From c286f9cd7d875aa4bf61ae12998e68f2e8fcb1a4 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 21:26:03 +0200
Subject: [PATCH 213/398] Cleanup trainer.

---
 src/nnue/trainer/trainer.h                    | 205 +++---
 src/nnue/trainer/trainer_affine_transform.h   | 578 +++++++--------
 src/nnue/trainer/trainer_clipped_relu.h       | 228 +++---
 .../trainer/trainer_feature_transformer.h     | 661 +++++++++---------
 src/nnue/trainer/trainer_input_slice.h        | 432 ++++++------
 src/nnue/trainer/trainer_sum.h                | 312 +++++----
 6 files changed, 1263 insertions(+), 1153 deletions(-)

diff --git a/src/nnue/trainer/trainer.h b/src/nnue/trainer/trainer.h
index 659863ad..7d9b66ee 100644
--- a/src/nnue/trainer/trainer.h
+++ b/src/nnue/trainer/trainer.h
@@ -1,121 +1,134 @@
-﻿// Common header of class template for learning NNUE evaluation function
-
-#ifndef _NNUE_TRAINER_H_
+﻿#ifndef _NNUE_TRAINER_H_
 #define _NNUE_TRAINER_H_
 
-#include "../nnue_common.h"
-#include "../features/index_list.h"
+#include "nnue/nnue_common.h"
+#include "nnue/features/index_list.h"
 
 #include <sstream>
+
 #if defined(USE_BLAS)
 static_assert(std::is_same<LearnFloatType, float>::value, "");
 #include <cblas.h>
 #endif
 
-namespace Eval {
+// Common header of class template for learning NNUE evaluation function
+namespace Eval::NNUE {
 
-namespace NNUE {
+    // Ponanza constant used in the relation between evaluation value and winning percentage
+    constexpr double kPonanzaConstant = 600.0;
 
-// Ponanza constant used in the relation between evaluation value and winning percentage
-constexpr double kPonanzaConstant = 600.0;
+    // Class that represents one index of learning feature
+    class TrainingFeature {
+        using StorageType = std::uint32_t;
+        static_assert(std::is_unsigned<StorageType>::value, "");
 
-// Class that represents one index of learning feature
-class TrainingFeature {
-  using StorageType = std::uint32_t;
-  static_assert(std::is_unsigned<StorageType>::value, "");
+    public:
+        static constexpr std::uint32_t kIndexBits = 24;
 
- public:
-  static constexpr std::uint32_t kIndexBits = 24;
-  static_assert(kIndexBits < std::numeric_limits<StorageType>::digits, "");
-  static constexpr std::uint32_t kCountBits =
-      std::numeric_limits<StorageType>::digits - kIndexBits;
+        static_assert(kIndexBits < std::numeric_limits<StorageType>::digits, "");
 
-  explicit TrainingFeature(IndexType index) :
-      index_and_count_((index << kCountBits) | 1) {
-    assert(index < (1 << kIndexBits));
-  }
-  TrainingFeature& operator+=(const TrainingFeature& other) {
-    assert(other.GetIndex() == GetIndex());
-    assert(other.GetCount() + GetCount() < (1 << kCountBits));
-    index_and_count_ += other.GetCount();
-    return *this;
-  }
-  IndexType GetIndex() const {
-    return static_cast<IndexType>(index_and_count_ >> kCountBits);
-  }
-  void ShiftIndex(IndexType offset) {
-    assert(GetIndex() + offset < (1 << kIndexBits));
-    index_and_count_ += offset << kCountBits;
-  }
-  IndexType GetCount() const {
-    return static_cast<IndexType>(index_and_count_ & ((1 << kCountBits) - 1));
-  }
-  bool operator<(const TrainingFeature& other) const {
-    return index_and_count_ < other.index_and_count_;
-  }
+        static constexpr std::uint32_t kCountBits =
+            std::numeric_limits<StorageType>::digits - kIndexBits;
 
- private:
-  StorageType index_and_count_;
-};
+        explicit TrainingFeature(IndexType index) :
+            index_and_count_((index << kCountBits) | 1) {
 
-// Structure that represents one sample of training data
-struct Example {
-  std::vector<TrainingFeature> training_features[2];
-  Learner::PackedSfenValue psv;
-  int sign;
-  double weight;
-};
+            assert(index < (1 << kIndexBits));
+        }
 
-// Message used for setting hyperparameters
-struct Message {
-  Message(const std::string& message_name, const std::string& message_value = ""):
-      name(message_name), value(message_value), num_peekers(0), num_receivers(0) {}
-  const std::string name;
-  const std::string value;
-  std::uint32_t num_peekers;
-  std::uint32_t num_receivers;
-};
+        TrainingFeature& operator+=(const TrainingFeature& other) {
+            assert(other.GetIndex() == GetIndex());
+            assert(other.GetCount() + GetCount() < (1 << kCountBits));
+            index_and_count_ += other.GetCount();
+            return *this;
+        }
 
-// determine whether to accept the message
-bool ReceiveMessage(const std::string& name, Message* message) {
-  const auto subscript = "[" + std::to_string(message->num_peekers) + "]";
-  if (message->name.substr(0, name.size() + 1) == name + "[") {
-    ++message->num_peekers;
-  }
-  if (message->name == name || message->name == name + subscript) {
-    ++message->num_receivers;
-    return true;
-  }
-  return false;
-}
+        IndexType GetIndex() const {
+            return static_cast<IndexType>(index_and_count_ >> kCountBits);
+        }
 
-// split the string
-std::vector<std::string> Split(const std::string& input, char delimiter) {
-  std::istringstream stream(input);
-  std::string field;
-  std::vector<std::string> fields;
-  while (std::getline(stream, field, delimiter)) {
-    fields.push_back(field);
-  }
-  return fields;
-}
+        void ShiftIndex(IndexType offset) {
+            assert(GetIndex() + offset < (1 << kIndexBits));
+            index_and_count_ += offset << kCountBits;
+        }
 
-// round a floating point number to an integer
-template <typename IntType>
-IntType Round(double value) {
-  return static_cast<IntType>(std::floor(value + 0.5));
-}
+        IndexType GetCount() const {
+            return static_cast<IndexType>(index_and_count_ & ((1 << kCountBits) - 1));
+        }
 
-// make_shared with alignment
-template <typename T, typename... ArgumentTypes>
-std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments) {
-  const auto ptr = new(std_aligned_alloc(alignof(T), sizeof(T)))
-      T(std::forward<ArgumentTypes>(arguments)...);
-  return std::shared_ptr<T>(ptr, AlignedDeleter<T>());
-}
+        bool operator<(const TrainingFeature& other) const {
+            return index_and_count_ < other.index_and_count_;
+        }
 
-}  // namespace NNUE
+    private:
+        StorageType index_and_count_;
+    };
 
-}  // namespace Eval
+    // Structure that represents one sample of training data
+    struct Example {
+        std::vector<TrainingFeature> training_features[2];
+        Learner::PackedSfenValue psv;
+        int sign;
+        double weight;
+    };
+
+    // Message used for setting hyperparameters
+    struct Message {
+        Message(const std::string& message_name, const std::string& message_value = "") :
+            name(message_name), value(message_value), num_peekers(0), num_receivers(0)
+        {
+        }
+
+        const std::string name;
+        const std::string value;
+        std::uint32_t num_peekers;
+        std::uint32_t num_receivers;
+    };
+
+    // determine whether to accept the message
+    bool ReceiveMessage(const std::string& name, Message* message) {
+        const auto subscript = "[" + std::to_string(message->num_peekers) + "]";
+
+        if (message->name.substr(0, name.size() + 1) == name + "[") {
+            ++message->num_peekers;
+        }
+
+        if (message->name == name || message->name == name + subscript) {
+            ++message->num_receivers;
+            return true;
+        }
+
+        return false;
+    }
+
+    // split the string
+    std::vector<std::string> Split(const std::string& input, char delimiter) {
+        std::istringstream stream(input);
+        std::string field;
+        std::vector<std::string> fields;
+
+        while (std::getline(stream, field, delimiter)) {
+            fields.push_back(field);
+        }
+
+        return fields;
+    }
+
+    // round a floating point number to an integer
+    template <typename IntType>
+    IntType Round(double value) {
+        return static_cast<IntType>(std::floor(value + 0.5));
+    }
+
+    // make_shared with alignment
+    template <typename T, typename... ArgumentTypes>
+    std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments) {
+        const auto ptr = new(std_aligned_alloc(alignof(T), sizeof(T)))
+            T(std::forward<ArgumentTypes>(arguments)...);
+
+        return std::shared_ptr<T>(ptr, AlignedDeleter<T>());
+    }
+
+}  // namespace Eval::NNUE
 
 #endif
diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index 415b7dc8..dd70b8fb 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -1,297 +1,329 @@
-﻿// Specialization of NNUE evaluation function learning class template for AffineTransform
-
-#ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_
+﻿#ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_
 #define _NNUE_TRAINER_AFFINE_TRANSFORM_H_
 
-#include "../../learn/learn.h"
-#include "../layers/affine_transform.h"
 #include "trainer.h"
 
+#include "learn/learn.h"
+
+#include "nnue/layers/affine_transform.h"
+
 #include <random>
 
-namespace Eval {
+// Specialization of NNUE evaluation function learning class template for AffineTransform
+namespace Eval::NNUE {
 
-namespace NNUE {
+    // Learning: Affine transformation layer
+    template <typename PreviousLayer, IndexType OutputDimensions>
+    class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
+    private:
+        // Type of layer to learn
+        using LayerType = Layers::AffineTransform<PreviousLayer, OutputDimensions>;
 
-// Learning: Affine transformation layer
-template <typename PreviousLayer, IndexType OutputDimensions>
-class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
- private:
-  // Type of layer to learn
-  using LayerType = Layers::AffineTransform<PreviousLayer, OutputDimensions>;
+    public:
+        // factory function
+        static std::shared_ptr<Trainer> Create(
+            LayerType* target_layer, FeatureTransformer* ft) {
 
- public:
-  // factory function
-  static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* ft) {
-    return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, ft));
-  }
-
-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    previous_layer_trainer_->SendMessage(message);
-    if (ReceiveMessage("momentum", message)) {
-      momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
-    }
-    if (ReceiveMessage("learning_rate_scale", message)) {
-      learning_rate_scale_ =
-          static_cast<LearnFloatType>(std::stod(message->value));
-    }
-    if (ReceiveMessage("reset", message)) {
-      DequantizeParameters();
-    }
-    if (ReceiveMessage("quantize_parameters", message)) {
-      QuantizeParameters();
-    }
-  }
-
-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    previous_layer_trainer_->Initialize(rng);
-    if (kIsOutputLayer) {
-      // Initialize output layer with 0
-      std::fill(std::begin(biases_), std::end(biases_),
-                static_cast<LearnFloatType>(0.0));
-      std::fill(std::begin(weights_), std::end(weights_),
-                static_cast<LearnFloatType>(0.0));
-    } else {
-      // Assuming that the input distribution is unit-mean 0.5, equal variance,
-      // Initialize the output distribution so that each unit has a mean of 0.5 and the same variance as the input
-      const double kSigma = 1.0 / std::sqrt(kInputDimensions);
-      auto distribution = std::normal_distribution<double>(0.0, kSigma);
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        double sum = 0.0;
-        for (IndexType j = 0; j < kInputDimensions; ++j) {
-          const auto weight = static_cast<LearnFloatType>(distribution(rng));
-          weights_[kInputDimensions * i + j] = weight;
-          sum += weight;
+            return std::shared_ptr<Trainer>(
+                new Trainer(target_layer, ft));
         }
-        biases_[i] = static_cast<LearnFloatType>(0.5 - 0.5 * sum);
-      }
-    }
-    QuantizeParameters();
-  }
 
-  // forward propagation
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    if (output_.size() < kOutputDimensions * batch.size()) {
-      output_.resize(kOutputDimensions * batch.size());
-      gradients_.resize(kInputDimensions * batch.size());
-    }
-    batch_size_ = static_cast<IndexType>(batch.size());
-    batch_input_ = previous_layer_trainer_->Propagate(batch);
+        // Set options such as hyperparameters
+        void SendMessage(Message* message) {
+            previous_layer_trainer_->SendMessage(message);
+
+            if (ReceiveMessage("momentum", message)) {
+                momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
+            }
+
+            if (ReceiveMessage("learning_rate_scale", message)) {
+                learning_rate_scale_ =
+                    static_cast<LearnFloatType>(std::stod(message->value));
+            }
+
+            if (ReceiveMessage("reset", message)) {
+                DequantizeParameters();
+            }
+
+            if (ReceiveMessage("quantize_parameters", message)) {
+                QuantizeParameters();
+            }
+        }
+
+        // Initialize the parameters with random numbers
+        template <typename RNG>
+        void Initialize(RNG& rng) {
+            previous_layer_trainer_->Initialize(rng);
+
+            if (kIsOutputLayer) {
+                // Initialize output layer with 0
+                std::fill(std::begin(biases_), std::end(biases_),
+                          static_cast<LearnFloatType>(0.0));
+                std::fill(std::begin(weights_), std::end(weights_),
+                          static_cast<LearnFloatType>(0.0));
+            }
+            else {
+                // Assuming that the input distribution is unit-mean 0.5, equal variance,
+                // Initialize the output distribution so that each unit has a mean of 0.5 and the same variance as the input
+                const double kSigma = 1.0 / std::sqrt(kInputDimensions);
+                auto distribution = std::normal_distribution<double>(0.0, kSigma);
+
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    double sum = 0.0;
+                      for (IndexType j = 0; j < kInputDimensions; ++j) {
+                          const auto weight = static_cast<LearnFloatType>(distribution(rng));
+                          weights_[kInputDimensions * i + j] = weight;
+                          sum += weight;
+                      }
+
+                    biases_[i] = static_cast<LearnFloatType>(0.5 - 0.5 * sum);
+                }
+            }
+
+            QuantizeParameters();
+        }
+
+        // forward propagation
+        const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+            if (output_.size() < kOutputDimensions * batch.size()) {
+                output_.resize(kOutputDimensions * batch.size());
+                gradients_.resize(kInputDimensions * batch.size());
+            }
+
+            batch_size_ = static_cast<IndexType>(batch.size());
+            batch_input_ = previous_layer_trainer_->Propagate(batch);
 #if defined(USE_BLAS)
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      cblas_scopy(kOutputDimensions, biases_, 1, &output_[batch_offset], 1);
-    }
-    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-                kOutputDimensions, batch_size_, kInputDimensions, 1.0,
-                weights_, kInputDimensions,
-                batch_input_, kInputDimensions,
-                1.0, &output_[0], kOutputDimensions);
-#else
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType input_batch_offset = kInputDimensions * b;
-      const IndexType output_batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        double sum = biases_[i];
-        for (IndexType j = 0; j < kInputDimensions; ++j) {
-          const IndexType index = kInputDimensions * i + j;
-          sum += weights_[index] * batch_input_[input_batch_offset + j];
-        }
-        output_[output_batch_offset + i] = static_cast<LearnFloatType>(sum);
-      }
-    }
-#endif
-    return output_.data();
-  }
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                cblas_scopy(kOutputDimensions, biases_, 1, &output_[batch_offset], 1);
+            }
+
+            cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
+                        kOutputDimensions, batch_size_, kInputDimensions, 1.0,
+                        weights_, kInputDimensions,
+                        batch_input_, kInputDimensions,
+                        1.0, &output_[0], kOutputDimensions);
+#else
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType input_batch_offset = kInputDimensions * b;
+                const IndexType output_batch_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    double sum = biases_[i];
+                    for (IndexType j = 0; j < kInputDimensions; ++j) {
+                        const IndexType index = kInputDimensions * i + j;
+                        sum += weights_[index] * batch_input_[input_batch_offset + j];
+                    }
+
+                    output_[output_batch_offset + i] = static_cast<LearnFloatType>(sum);
+                }
+            }
+
+#endif
+            return output_.data();
+        }
+
+        // backpropagation
+        void Backpropagate(const LearnFloatType* gradients,
+                           LearnFloatType learning_rate) {
+
+            const LearnFloatType local_learning_rate =
+                learning_rate * learning_rate_scale_;
 
-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    const LearnFloatType local_learning_rate =
-        learning_rate * learning_rate_scale_;
 #if defined(USE_BLAS)
-    // backpropagate
-    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
-                kInputDimensions, batch_size_, kOutputDimensions, 1.0,
-                weights_, kInputDimensions,
-                gradients, kOutputDimensions,
-                0.0, &gradients_[0], kInputDimensions);
-    // update
-    cblas_sscal(kOutputDimensions, momentum_, biases_diff_, 1);
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      cblas_saxpy(kOutputDimensions, 1.0,
-                  &gradients[batch_offset], 1, biases_diff_, 1);
-    }
-    cblas_saxpy(kOutputDimensions, -local_learning_rate,
-                biases_diff_, 1, biases_, 1);
-    cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
-                kOutputDimensions, kInputDimensions, batch_size_, 1.0,
-                gradients, kOutputDimensions,
-                batch_input_, kInputDimensions,
-                momentum_, weights_diff_, kInputDimensions);
-    cblas_saxpy(kOutputDimensions * kInputDimensions, -local_learning_rate,
-                weights_diff_, 1, weights_, 1);
+            // backpropagate
+            cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+                        kInputDimensions, batch_size_, kOutputDimensions, 1.0,
+                        weights_, kInputDimensions,
+                        gradients, kOutputDimensions,
+                        0.0, &gradients_[0], kInputDimensions);
+
+            // update
+            cblas_sscal(kOutputDimensions, momentum_, biases_diff_, 1);
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                cblas_saxpy(kOutputDimensions, 1.0,
+                          &gradients[batch_offset], 1, biases_diff_, 1);
+            }
+
+            cblas_saxpy(kOutputDimensions, -local_learning_rate,
+                        biases_diff_, 1, biases_, 1);
+
+            cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
+                        kOutputDimensions, kInputDimensions, batch_size_, 1.0,
+                        gradients, kOutputDimensions,
+                        batch_input_, kInputDimensions,
+                        momentum_, weights_diff_, kInputDimensions);
+            cblas_saxpy(kOutputDimensions * kInputDimensions, -local_learning_rate,
+                        weights_diff_, 1, weights_, 1);
+
 #else
-    // backpropagate
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType input_batch_offset = kInputDimensions * b;
-      const IndexType output_batch_offset = kOutputDimensions * b;
-      for (IndexType j = 0; j < kInputDimensions; ++j) {
-        double sum = 0.0;
-        for (IndexType i = 0; i < kOutputDimensions; ++i) {
-          const IndexType index = kInputDimensions * i + j;
-          sum += weights_[index] * gradients[output_batch_offset + i];
-        }
-        gradients_[input_batch_offset + j] = static_cast<LearnFloatType>(sum);
-      }
-    }
-    // update
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      biases_diff_[i] *= momentum_;
-    }
-    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-      weights_diff_[i] *= momentum_;
-    }
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType input_batch_offset = kInputDimensions * b;
-      const IndexType output_batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        biases_diff_[i] += gradients[output_batch_offset + i];
-      }
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        for (IndexType j = 0; j < kInputDimensions; ++j) {
-          const IndexType index = kInputDimensions * i + j;
-          weights_diff_[index] += gradients[output_batch_offset + i] *
-              batch_input_[input_batch_offset + j];
-        }
-      }
-    }
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      biases_[i] -= local_learning_rate * biases_diff_[i];
-    }
-    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-      weights_[i] -= local_learning_rate * weights_diff_[i];
-    }
-#endif
-    previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
-  }
+            // backpropagate
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType input_batch_offset = kInputDimensions * b;
+                const IndexType output_batch_offset = kOutputDimensions * b;
+                for (IndexType j = 0; j < kInputDimensions; ++j) {
+                    double sum = 0.0;
+                    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                        const IndexType index = kInputDimensions * i + j;
+                        sum += weights_[index] * gradients[output_batch_offset + i];
+                    }
+                    gradients_[input_batch_offset + j] = static_cast<LearnFloatType>(sum);
+                }
+            }
 
- private:
-  // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* ft) :
-      batch_size_(0),
-      batch_input_(nullptr),
-      previous_layer_trainer_(Trainer<PreviousLayer>::Create(
-          &target_layer->previous_layer_, ft)),
-      target_layer_(target_layer),
-      biases_(),
-      weights_(),
-      biases_diff_(),
-      weights_diff_(),
-      momentum_(0.2),
-      learning_rate_scale_(1.0) {
-    DequantizeParameters();
-  }
+            // update
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                biases_diff_[i] *= momentum_;
+            }
 
-  // Weight saturation and parameterization
-  void QuantizeParameters() {
-    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-      weights_[i] = std::max(-kMaxWeightMagnitude,
-                             std::min(+kMaxWeightMagnitude, weights_[i]));
-    }
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      target_layer_->biases_[i] =
-          Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
-    }
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      const auto offset = kInputDimensions * i;
-      const auto padded_offset = LayerType::kPaddedInputDimensions * i;
-      for (IndexType j = 0; j < kInputDimensions; ++j) {
-        target_layer_->weights_[padded_offset + j] =
-            Round<typename LayerType::WeightType>(
-                weights_[offset + j] * kWeightScale);
-      }
-    }
-  }
+            for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
+                weights_diff_[i] *= momentum_;
+            }
 
-  // read parameterized integer
-  void DequantizeParameters() {
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      biases_[i] = static_cast<LearnFloatType>(
-          target_layer_->biases_[i] / kBiasScale);
-    }
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      const auto offset = kInputDimensions * i;
-      const auto padded_offset = LayerType::kPaddedInputDimensions * i;
-      for (IndexType j = 0; j < kInputDimensions; ++j) {
-        weights_[offset + j] = static_cast<LearnFloatType>(
-            target_layer_->weights_[padded_offset + j] / kWeightScale);
-      }
-    }
-    std::fill(std::begin(biases_diff_), std::end(biases_diff_),
-              static_cast<LearnFloatType>(0.0));
-    std::fill(std::begin(weights_diff_), std::end(weights_diff_),
-              static_cast<LearnFloatType>(0.0));
-  }
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType input_batch_offset = kInputDimensions * b;
+                const IndexType output_batch_offset = kOutputDimensions * b;
 
-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions = LayerType::kInputDimensions;
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    biases_diff_[i] += gradients[output_batch_offset + i];
+                }
 
-  // If the output dimensionality is 1, the output layer
-  static constexpr bool kIsOutputLayer = kOutputDimensions == 1;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    for (IndexType j = 0; j < kInputDimensions; ++j) {
+                        const IndexType index = kInputDimensions * i + j;
+                        weights_diff_[index] += gradients[output_batch_offset + i] *
+                            batch_input_[input_batch_offset + j];
+                    }
+                }
+            }
 
-  // Coefficient used for parameterization
-  static constexpr LearnFloatType kActivationScale =
-      std::numeric_limits<std::int8_t>::max();
-  static constexpr LearnFloatType kBiasScale = kIsOutputLayer ?
-      (kPonanzaConstant * FV_SCALE) :
-      ((1 << kWeightScaleBits) * kActivationScale);
-  static constexpr LearnFloatType kWeightScale = kBiasScale / kActivationScale;
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                biases_[i] -= local_learning_rate * biases_diff_[i];
+            }
 
-  // Upper limit of absolute value of weight used to prevent overflow when parameterizing integers
-  static constexpr LearnFloatType kMaxWeightMagnitude =
-      std::numeric_limits<typename LayerType::WeightType>::max() / kWeightScale;
-
-  // number of samples in mini-batch
-  IndexType batch_size_;
-
-  // Input mini batch
-  const LearnFloatType* batch_input_;
-
-  // Trainer of the previous layer
-  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
-
-  // layer to learn
-  LayerType* const target_layer_;
-
-  // parameter
-  LearnFloatType biases_[kOutputDimensions];
-  LearnFloatType weights_[kOutputDimensions * kInputDimensions];
-
-  // Buffer used for updating parameters
-  LearnFloatType biases_diff_[kOutputDimensions];
-  LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
-
-  // Forward propagation buffer
-  std::vector<LearnFloatType> output_;
-
-  // buffer for back propagation
-  std::vector<LearnFloatType> gradients_;
-
-  // hyper parameter
-  LearnFloatType momentum_;
-  LearnFloatType learning_rate_scale_;
-};
-
-}  // namespace NNUE
-
-}  // namespace Eval
+            for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
+                weights_[i] -= local_learning_rate * weights_diff_[i];
+            }
+
+#endif
+            previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
+        }
+
+    private:
+        // constructor
+        Trainer(LayerType* target_layer, FeatureTransformer* ft) :
+            batch_size_(0),
+            batch_input_(nullptr),
+            previous_layer_trainer_(Trainer<PreviousLayer>::Create(
+                &target_layer->previous_layer_, ft)),
+            target_layer_(target_layer),
+            biases_(),
+            weights_(),
+            biases_diff_(),
+            weights_diff_(),
+            momentum_(0.2),
+            learning_rate_scale_(1.0) {
+
+            DequantizeParameters();
+        }
+
+        // Weight saturation and parameterization
+        void QuantizeParameters() {
+            for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
+                weights_[i] = std::max(-kMaxWeightMagnitude,
+                                       std::min(+kMaxWeightMagnitude, weights_[i]));
+            }
+
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                target_layer_->biases_[i] =
+                    Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
+            }
+
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                const auto offset = kInputDimensions * i;
+                const auto padded_offset = LayerType::kPaddedInputDimensions * i;
+                for (IndexType j = 0; j < kInputDimensions; ++j) {
+                    target_layer_->weights_[padded_offset + j] =
+                        Round<typename LayerType::WeightType>(
+                            weights_[offset + j] * kWeightScale);
+                }
+            }
+        }
+
+        // read parameterized integer
+        void DequantizeParameters() {
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                biases_[i] = static_cast<LearnFloatType>(
+                    target_layer_->biases_[i] / kBiasScale);
+            }
+
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                const auto offset = kInputDimensions * i;
+                const auto padded_offset = LayerType::kPaddedInputDimensions * i;
+                for (IndexType j = 0; j < kInputDimensions; ++j) {
+                    weights_[offset + j] = static_cast<LearnFloatType>(
+                        target_layer_->weights_[padded_offset + j] / kWeightScale);
+                }
+            }
+
+            std::fill(std::begin(biases_diff_), std::end(biases_diff_),
+                      static_cast<LearnFloatType>(0.0));
+            std::fill(std::begin(weights_diff_), std::end(weights_diff_),
+                      static_cast<LearnFloatType>(0.0));
+        }
+
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions = LayerType::kInputDimensions;
+        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+
+        // If the output dimensionality is 1, the output layer
+        static constexpr bool kIsOutputLayer = kOutputDimensions == 1;
+
+        // Coefficient used for parameterization
+        static constexpr LearnFloatType kActivationScale =
+            std::numeric_limits<std::int8_t>::max();
+
+        static constexpr LearnFloatType kBiasScale = kIsOutputLayer ?
+            (kPonanzaConstant * FV_SCALE) :
+            ((1 << kWeightScaleBits) * kActivationScale);
+
+        static constexpr LearnFloatType kWeightScale = kBiasScale / kActivationScale;
+
+        // Upper limit of absolute value of weight used to prevent overflow when parameterizing integers
+        static constexpr LearnFloatType kMaxWeightMagnitude =
+            std::numeric_limits<typename LayerType::WeightType>::max() / kWeightScale;
+
+        // number of samples in mini-batch
+        IndexType batch_size_;
+
+        // Input mini batch
+        const LearnFloatType* batch_input_;
+
+        // Trainer of the previous layer
+        const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+
+        // layer to learn
+        LayerType* const target_layer_;
+
+        // parameter
+        LearnFloatType biases_[kOutputDimensions];
+        LearnFloatType weights_[kOutputDimensions * kInputDimensions];
+
+        // Buffer used for updating parameters
+        LearnFloatType biases_diff_[kOutputDimensions];
+        LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
+
+        // Forward propagation buffer
+        std::vector<LearnFloatType> output_;
+
+        // buffer for back propagation
+        std::vector<LearnFloatType> gradients_;
+
+        // hyper parameter
+        LearnFloatType momentum_;
+        LearnFloatType learning_rate_scale_;
+    };
+
+}  // namespace Eval::NNUE
 
 #endif
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index cf7a2447..902c2747 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -1,138 +1,142 @@
-﻿// Specialization of NNUE evaluation function learning class template for ClippedReLU
-
-#ifndef _NNUE_TRAINER_CLIPPED_RELU_H_
+﻿#ifndef _NNUE_TRAINER_CLIPPED_RELU_H_
 #define _NNUE_TRAINER_CLIPPED_RELU_H_
 
-#include "../../learn/learn.h"
-#include "../layers/clipped_relu.h"
 #include "trainer.h"
 
-namespace Eval {
+#include "learn/learn.h"
 
-namespace NNUE {
+#include "nnue/layers/clipped_relu.h"
 
-// Learning: Affine transformation layer
-template <typename PreviousLayer>
-class Trainer<Layers::ClippedReLU<PreviousLayer>> {
- private:
-  // Type of layer to learn
-  using LayerType = Layers::ClippedReLU<PreviousLayer>;
+// Specialization of NNUE evaluation function learning class template for ClippedReLU
+namespace Eval::NNUE {
 
- public:
-  // factory function
-  static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* ft) {
-    return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, ft));
-  }
+    // Learning: Affine transformation layer
+    template <typename PreviousLayer>
+    class Trainer<Layers::ClippedReLU<PreviousLayer>> {
+    private:
+        // Type of layer to learn
+        using LayerType = Layers::ClippedReLU<PreviousLayer>;
 
-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    previous_layer_trainer_->SendMessage(message);
-    if (ReceiveMessage("check_health", message)) {
-      CheckHealth();
-    }
-  }
+    public:
+        // factory function
+        static std::shared_ptr<Trainer> Create(
+            LayerType* target_layer, FeatureTransformer* ft) {
 
-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    previous_layer_trainer_->Initialize(rng);
-  }
+            return std::shared_ptr<Trainer>(
+                new Trainer(target_layer, ft));
+        }
 
-  // forward propagation
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    if (output_.size() < kOutputDimensions * batch.size()) {
-      output_.resize(kOutputDimensions * batch.size());
-      gradients_.resize(kInputDimensions * batch.size());
-    }
-    const auto input = previous_layer_trainer_->Propagate(batch);
-    batch_size_ = static_cast<IndexType>(batch.size());
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        const IndexType index = batch_offset + i;
-        output_[index] = std::max(+kZero, std::min(+kOne, input[index]));
-        min_activations_[i] = std::min(min_activations_[i], output_[index]);
-        max_activations_[i] = std::max(max_activations_[i], output_[index]);
-      }
-    }
-    return output_.data();
-  }
+        // Set options such as hyperparameters
+        void SendMessage(Message* message) {
+            previous_layer_trainer_->SendMessage(message);
+            if (ReceiveMessage("check_health", message)) {
+                CheckHealth();
+            }
+        }
 
-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        const IndexType index = batch_offset + i;
-        gradients_[index] = gradients[index] *
-            (output_[index] > kZero) * (output_[index] < kOne);
-      }
-    }
-    previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
-  }
+        // Initialize the parameters with random numbers
+        template <typename RNG>
+        void Initialize(RNG& rng) {
+            previous_layer_trainer_->Initialize(rng);
+        }
 
- private:
-  // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* ft) :
-      batch_size_(0),
-      previous_layer_trainer_(Trainer<PreviousLayer>::Create(
-          &target_layer->previous_layer_, ft)),
-      target_layer_(target_layer) {
-    std::fill(std::begin(min_activations_), std::end(min_activations_),
-              std::numeric_limits<LearnFloatType>::max());
-    std::fill(std::begin(max_activations_), std::end(max_activations_),
-              std::numeric_limits<LearnFloatType>::lowest());
-  }
+        // forward propagation
+        const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+            if (output_.size() < kOutputDimensions * batch.size()) {
+              output_.resize(kOutputDimensions * batch.size());
+              gradients_.resize(kInputDimensions * batch.size());
+            }
 
-  // Check if there are any problems with learning
-  void CheckHealth() {
-    const auto largest_min_activation = *std::max_element(
-        std::begin(min_activations_), std::end(min_activations_));
-    const auto smallest_max_activation = *std::min_element(
-        std::begin(max_activations_), std::end(max_activations_));
-    std::cout << "INFO: largest min activation = " << largest_min_activation
-              << ", smallest max activation = " << smallest_max_activation
-              << std::endl;
+            const auto input = previous_layer_trainer_->Propagate(batch);
+            batch_size_ = static_cast<IndexType>(batch.size());
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    const IndexType index = batch_offset + i;
+                    output_[index] = std::max(+kZero, std::min(+kOne, input[index]));
+                    min_activations_[i] = std::min(min_activations_[i], output_[index]);
+                    max_activations_[i] = std::max(max_activations_[i], output_[index]);
+                }
+            }
 
-    std::fill(std::begin(min_activations_), std::end(min_activations_),
-              std::numeric_limits<LearnFloatType>::max());
-    std::fill(std::begin(max_activations_), std::end(max_activations_),
-              std::numeric_limits<LearnFloatType>::lowest());
-  }
+            return output_.data();
+        }
 
-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions = LayerType::kOutputDimensions;
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+        // backpropagation
+        void Backpropagate(const LearnFloatType* gradients,
+                           LearnFloatType learning_rate) {
 
-  // LearnFloatType constant
-  static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
-  static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    const IndexType index = batch_offset + i;
+                    gradients_[index] = gradients[index] *
+                        (output_[index] > kZero) * (output_[index] < kOne);
+                }
+            }
 
-  // number of samples in mini-batch
-  IndexType batch_size_;
+            previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
+        }
 
-  // Trainer of the previous layer
-  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+    private:
+        // constructor
+        Trainer(LayerType* target_layer, FeatureTransformer* ft) :
+            batch_size_(0),
+            previous_layer_trainer_(Trainer<PreviousLayer>::Create(
+                &target_layer->previous_layer_, ft)),
+            target_layer_(target_layer) {
 
-  // layer to learn
-  LayerType* const target_layer_;
+            std::fill(std::begin(min_activations_), std::end(min_activations_),
+                      std::numeric_limits<LearnFloatType>::max());
+            std::fill(std::begin(max_activations_), std::end(max_activations_),
+                      std::numeric_limits<LearnFloatType>::lowest());
+        }
 
-  // Forward propagation buffer
-  std::vector<LearnFloatType> output_;
+        // Check if there are any problems with learning
+        void CheckHealth() {
+            const auto largest_min_activation = *std::max_element(
+                std::begin(min_activations_), std::end(min_activations_));
+            const auto smallest_max_activation = *std::min_element(
+                std::begin(max_activations_), std::end(max_activations_));
 
-  // buffer for back propagation
-  std::vector<LearnFloatType> gradients_;
+            std::cout << "INFO: largest min activation = " << largest_min_activation
+                      << ", smallest max activation = " << smallest_max_activation
+                      << std::endl;
 
-  // Health check statistics
-  LearnFloatType min_activations_[kOutputDimensions];
-  LearnFloatType max_activations_[kOutputDimensions];
-};
+            std::fill(std::begin(min_activations_), std::end(min_activations_),
+                      std::numeric_limits<LearnFloatType>::max());
+            std::fill(std::begin(max_activations_), std::end(max_activations_),
+                      std::numeric_limits<LearnFloatType>::lowest());
+        }
 
-}  // namespace NNUE
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions = LayerType::kOutputDimensions;
+        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
 
-}  // namespace Eval
+        // LearnFloatType constant
+        static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
+        static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
+
+        // number of samples in mini-batch
+        IndexType batch_size_;
+
+        // Trainer of the previous layer
+        const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+
+        // layer to learn
+        LayerType* const target_layer_;
+
+        // Forward propagation buffer
+        std::vector<LearnFloatType> output_;
+
+        // buffer for back propagation
+        std::vector<LearnFloatType> gradients_;
+
+        // Health check statistics
+        LearnFloatType min_activations_[kOutputDimensions];
+        LearnFloatType max_activations_[kOutputDimensions];
+    };
+
+}  // namespace Eval::NNUE
 
 #endif
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 225c91fc..f403e413 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -1,13 +1,14 @@
-﻿// Specialization for feature transformer of learning class template of NNUE evaluation function
-
-#ifndef _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
+﻿#ifndef _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
 #define _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
 
-#include "../../learn/learn.h"
-#include "../nnue_feature_transformer.h"
 #include "trainer.h"
+
 #include "features/factorizer_feature_set.h"
 
+#include "learn/learn.h"
+
+#include "nnue/nnue_feature_transformer.h"
+
 #include <array>
 #include <bitset>
 #include <numeric>
@@ -18,356 +19,392 @@
 #include <omp.h>
 #endif
 
-namespace Eval {
+// Specialization for feature transformer of learning class template of NNUE evaluation function
+namespace Eval::NNUE {
 
-namespace NNUE {
+    // Learning: Input feature converter
+    template <>
+    class Trainer<FeatureTransformer> {
+    private:
+        // Type of layer to learn
+        using LayerType = FeatureTransformer;
 
-// Learning: Input feature converter
-template <>
-class Trainer<FeatureTransformer> {
- private:
-  // Type of layer to learn
-  using LayerType = FeatureTransformer;
+    public:
+        template <typename T>
+        friend struct AlignedDeleter;
 
- public:
-  template <typename T>
-  friend struct AlignedDeleter;
-  template <typename T, typename... ArgumentTypes>
-  friend std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments);
+        template <typename T, typename... ArgumentTypes>
+        friend std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments);
 
-  // factory function
-  static std::shared_ptr<Trainer> Create(LayerType* target_layer) {
-    return MakeAlignedSharedPtr<Trainer>(target_layer);
-  }
+        // factory function
+        static std::shared_ptr<Trainer> Create(LayerType* target_layer) {
+            return MakeAlignedSharedPtr<Trainer>(target_layer);
+        }
 
-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    if (ReceiveMessage("momentum", message)) {
-      momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
-    }
-    if (ReceiveMessage("learning_rate_scale", message)) {
-      learning_rate_scale_ =
-          static_cast<LearnFloatType>(std::stod(message->value));
-    }
-    if (ReceiveMessage("reset", message)) {
-      DequantizeParameters();
-    }
-    if (ReceiveMessage("quantize_parameters", message)) {
-      QuantizeParameters();
-    }
-    if (ReceiveMessage("clear_unobserved_feature_weights", message)) {
-      ClearUnobservedFeatureWeights();
-    }
-    if (ReceiveMessage("check_health", message)) {
-      CheckHealth();
-    }
-  }
+        // Set options such as hyperparameters
+        void SendMessage(Message* message) {
+            if (ReceiveMessage("momentum", message)) {
+                momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
+            }
 
-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    std::fill(std::begin(weights_), std::end(weights_), +kZero);
-    const double kSigma = 0.1 / std::sqrt(RawFeatures::kMaxActiveDimensions);
-    auto distribution = std::normal_distribution<double>(0.0, kSigma);
-    for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {
-      const auto weight = static_cast<LearnFloatType>(distribution(rng));
-      weights_[i] = weight;
-    }
-    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-      biases_[i] = static_cast<LearnFloatType>(0.5);
-    }
-    QuantizeParameters();
-  }
+            if (ReceiveMessage("learning_rate_scale", message)) {
+                learning_rate_scale_ =
+                    static_cast<LearnFloatType>(std::stod(message->value));
+            }
 
-  // forward propagation
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    if (output_.size() < kOutputDimensions * batch.size()) {
-      output_.resize(kOutputDimensions * batch.size());
-      gradients_.resize(kOutputDimensions * batch.size());
-    }
-    batch_ = &batch;
-    // affine transform
+            if (ReceiveMessage("reset", message)) {
+                DequantizeParameters();
+            }
+
+            if (ReceiveMessage("quantize_parameters", message)) {
+                QuantizeParameters();
+            }
+
+            if (ReceiveMessage("clear_unobserved_feature_weights", message)) {
+                ClearUnobservedFeatureWeights();
+            }
+
+            if (ReceiveMessage("check_health", message)) {
+                CheckHealth();
+            }
+        }
+
+        // Initialize the parameters with random numbers
+        template <typename RNG>
+        void Initialize(RNG& rng) {
+            std::fill(std::begin(weights_), std::end(weights_), +kZero);
+
+            const double kSigma = 0.1 / std::sqrt(RawFeatures::kMaxActiveDimensions);
+            auto distribution = std::normal_distribution<double>(0.0, kSigma);
+
+            for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {
+                const auto weight = static_cast<LearnFloatType>(distribution(rng));
+                weights_[i] = weight;
+            }
+
+            for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                biases_[i] = static_cast<LearnFloatType>(0.5);
+            }
+
+            QuantizeParameters();
+        }
+
+        // forward propagation
+        const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+            if (output_.size() < kOutputDimensions * batch.size()) {
+                output_.resize(kOutputDimensions * batch.size());
+                gradients_.resize(kOutputDimensions * batch.size());
+            }
+
+            batch_ = &batch;
+            // affine transform
 #pragma omp parallel for
-    for (IndexType b = 0; b < batch.size(); ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType c = 0; c < 2; ++c) {
-        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+            for (IndexType b = 0; b < batch.size(); ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType c = 0; c < 2; ++c) {
+                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
 #if defined(USE_BLAS)
-        cblas_scopy(kHalfDimensions, biases_, 1, &output_[output_offset], 1);
-        for (const auto& feature : batch[b].training_features[c]) {
-          const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
-          cblas_saxpy(kHalfDimensions, (float)feature.GetCount(),
-                      &weights_[weights_offset], 1, &output_[output_offset], 1);
-        }
+                    cblas_scopy(kHalfDimensions, biases_, 1, &output_[output_offset], 1);
+                    for (const auto& feature : batch[b].training_features[c]) {
+                        const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
+                        cblas_saxpy(kHalfDimensions, (float)feature.GetCount(),
+                                    &weights_[weights_offset], 1, &output_[output_offset], 1);
+                    }
 #else
-        for (IndexType i = 0; i < kHalfDimensions; ++i) {
-          output_[output_offset + i] = biases_[i];
-        }
-        for (const auto& feature : batch[b].training_features[c]) {
-          const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
-          for (IndexType i = 0; i < kHalfDimensions; ++i) {
-            output_[output_offset + i] +=
-                feature.GetCount() * weights_[weights_offset + i];
-          }
-        }
+                    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                        output_[output_offset + i] = biases_[i];
+                    }
+                    for (const auto& feature : batch[b].training_features[c]) {
+                        const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
+                        for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                            output_[output_offset + i] +=
+                                feature.GetCount() * weights_[weights_offset + i];
+                        }
+                    }
 #endif
-      }
-    }
-    // clipped ReLU
-    for (IndexType b = 0; b < batch.size(); ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        const IndexType index = batch_offset + i;
-        min_pre_activation_ = std::min(min_pre_activation_, output_[index]);
-        max_pre_activation_ = std::max(max_pre_activation_, output_[index]);
-        output_[index] = std::max(+kZero, std::min(+kOne, output_[index]));
-        const IndexType t = i % kHalfDimensions;
-        min_activations_[t] = std::min(min_activations_[t], output_[index]);
-        max_activations_[t] = std::max(max_activations_[t], output_[index]);
-      }
-    }
-    return output_.data();
-  }
+                }
+            }
 
-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    const LearnFloatType local_learning_rate =
-        learning_rate * learning_rate_scale_;
-    for (IndexType b = 0; b < batch_->size(); ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        const IndexType index = batch_offset + i;
-        gradients_[index] = gradients[index] *
-            ((output_[index] > kZero) * (output_[index] < kOne));
-      }
-    }
-    // Since the weight matrix updates only the columns corresponding to the features that appeared in the input,
-    // Correct the learning rate and adjust the scale without using momentum
-    const LearnFloatType effective_learning_rate =
-        static_cast<LearnFloatType>(local_learning_rate / (1.0 - momentum_));
+            // clipped ReLU
+            for (IndexType b = 0; b < batch.size(); ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    const IndexType index = batch_offset + i;
+                    min_pre_activation_ = std::min(min_pre_activation_, output_[index]);
+                    max_pre_activation_ = std::max(max_pre_activation_, output_[index]);
+                    output_[index] = std::max(+kZero, std::min(+kOne, output_[index]));
+                    const IndexType t = i % kHalfDimensions;
+                    min_activations_[t] = std::min(min_activations_[t], output_[index]);
+                    max_activations_[t] = std::max(max_activations_[t], output_[index]);
+                }
+            }
+
+            return output_.data();
+        }
+
+        // backpropagation
+        void Backpropagate(const LearnFloatType* gradients,
+                           LearnFloatType learning_rate) {
+
+            const LearnFloatType local_learning_rate =
+                learning_rate * learning_rate_scale_;
+
+            for (IndexType b = 0; b < batch_->size(); ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    const IndexType index = batch_offset + i;
+                    gradients_[index] = gradients[index] *
+                        ((output_[index] > kZero) * (output_[index] < kOne));
+                }
+            }
+
+            // Since the weight matrix updates only the columns corresponding to the features that appeared in the input,
+            // Correct the learning rate and adjust the scale without using momentum
+            const LearnFloatType effective_learning_rate =
+                static_cast<LearnFloatType>(local_learning_rate / (1.0 - momentum_));
 #if defined(USE_BLAS)
-    cblas_sscal(kHalfDimensions, momentum_, biases_diff_, 1);
-    for (IndexType b = 0; b < batch_->size(); ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType c = 0; c < 2; ++c) {
-        const IndexType output_offset = batch_offset + kHalfDimensions * c;
-        cblas_saxpy(kHalfDimensions, 1.0,
-                    &gradients_[output_offset], 1, biases_diff_, 1);
-      }
-    }
-    cblas_saxpy(kHalfDimensions, -local_learning_rate,
-                biases_diff_, 1, biases_, 1);
+            cblas_sscal(kHalfDimensions, momentum_, biases_diff_, 1);
+            for (IndexType b = 0; b < batch_->size(); ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType c = 0; c < 2; ++c) {
+                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                    cblas_saxpy(kHalfDimensions, 1.0,
+                                &gradients_[output_offset], 1, biases_diff_, 1);
+                }
+            }
+
+            cblas_saxpy(kHalfDimensions, -local_learning_rate,
+                        biases_diff_, 1, biases_, 1);
+
 #pragma omp parallel
-    {
+            {
 #if defined(_OPENMP)
-      const IndexType num_threads = omp_get_num_threads();
-      const IndexType thread_index = omp_get_thread_num();
+                const IndexType num_threads = omp_get_num_threads();
+                const IndexType thread_index = omp_get_thread_num();
 #endif
-      for (IndexType b = 0; b < batch_->size(); ++b) {
-        const IndexType batch_offset = kOutputDimensions * b;
-        for (IndexType c = 0; c < 2; ++c) {
-          const IndexType output_offset = batch_offset + kHalfDimensions * c;
-          for (const auto& feature : (*batch_)[b].training_features[c]) {
+                for (IndexType b = 0; b < batch_->size(); ++b) {
+                    const IndexType batch_offset = kOutputDimensions * b;
+                    for (IndexType c = 0; c < 2; ++c) {
+                        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                        for (const auto& feature : (*batch_)[b].training_features[c]) {
 #if defined(_OPENMP)
-            if (feature.GetIndex() % num_threads != thread_index) continue;
+                            if (feature.GetIndex() % num_threads != thread_index)
+                                continue;
 #endif
-            const IndexType weights_offset =
-                kHalfDimensions * feature.GetIndex();
-            const auto scale = static_cast<LearnFloatType>(
-                effective_learning_rate / feature.GetCount());
-            cblas_saxpy(kHalfDimensions, -scale,
-                        &gradients_[output_offset], 1,
-                        &weights_[weights_offset], 1);
-          }
-        }
-      }
-    }
+                            const IndexType weights_offset =
+                                kHalfDimensions * feature.GetIndex();
+                            const auto scale = static_cast<LearnFloatType>(
+                                effective_learning_rate / feature.GetCount());
+
+                            cblas_saxpy(kHalfDimensions, -scale,
+                                        &gradients_[output_offset], 1,
+                                        &weights_[weights_offset], 1);
+                        }
+                    }
+                }
+            }
+
 #else
-    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-      biases_diff_[i] *= momentum_;
-    }
-    for (IndexType b = 0; b < batch_->size(); ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType c = 0; c < 2; ++c) {
-        const IndexType output_offset = batch_offset + kHalfDimensions * c;
-        for (IndexType i = 0; i < kHalfDimensions; ++i) {
-          biases_diff_[i] += gradients_[output_offset + i];
-        }
-      }
-    }
-    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-      biases_[i] -= local_learning_rate * biases_diff_[i];
-    }
-    for (IndexType b = 0; b < batch_->size(); ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType c = 0; c < 2; ++c) {
-        const IndexType output_offset = batch_offset + kHalfDimensions * c;
-        for (const auto& feature : (*batch_)[b].training_features[c]) {
-          const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
-          const auto scale = static_cast<LearnFloatType>(
-              effective_learning_rate / feature.GetCount());
-          for (IndexType i = 0; i < kHalfDimensions; ++i) {
-            weights_[weights_offset + i] -=
-                scale * gradients_[output_offset + i];
-          }
-        }
-      }
-    }
+            for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                biases_diff_[i] *= momentum_;
+            }
+
+            for (IndexType b = 0; b < batch_->size(); ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType c = 0; c < 2; ++c) {
+                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                        biases_diff_[i] += gradients_[output_offset + i];
+                    }
+                }
+            }
+
+            for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                biases_[i] -= local_learning_rate * biases_diff_[i];
+            }
+
+            for (IndexType b = 0; b < batch_->size(); ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType c = 0; c < 2; ++c) {
+                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                    for (const auto& feature : (*batch_)[b].training_features[c]) {
+                        const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
+                        const auto scale = static_cast<LearnFloatType>(
+                            effective_learning_rate / feature.GetCount());
+
+                        for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                            weights_[weights_offset + i] -=
+                                scale * gradients_[output_offset + i];
+                        }
+                    }
+                }
+            }
+
 #endif
-    for (IndexType b = 0; b < batch_->size(); ++b) {
-      for (IndexType c = 0; c < 2; ++c) {
-        for (const auto& feature : (*batch_)[b].training_features[c]) {
-          observed_features.set(feature.GetIndex());
+            for (IndexType b = 0; b < batch_->size(); ++b) {
+                for (IndexType c = 0; c < 2; ++c) {
+                    for (const auto& feature : (*batch_)[b].training_features[c]) {
+                        observed_features.set(feature.GetIndex());
+                    }
+                }
+            }
         }
-      }
-    }
-  }
 
- private:
-  // constructor
-  Trainer(LayerType* target_layer) :
-      batch_(nullptr),
-      target_layer_(target_layer),
-      biases_(),
-      weights_(),
-      biases_diff_(),
-      momentum_(0.2),
-      learning_rate_scale_(1.0) {
-    min_pre_activation_ = std::numeric_limits<LearnFloatType>::max();
-    max_pre_activation_ = std::numeric_limits<LearnFloatType>::lowest();
-    std::fill(std::begin(min_activations_), std::end(min_activations_),
-              std::numeric_limits<LearnFloatType>::max());
-    std::fill(std::begin(max_activations_), std::end(max_activations_),
-              std::numeric_limits<LearnFloatType>::lowest());
-    DequantizeParameters();
-  }
+    private:
+        // constructor
+        Trainer(LayerType* target_layer) :
+            batch_(nullptr),
+            target_layer_(target_layer),
+            biases_(),
+            weights_(),
+            biases_diff_(),
+            momentum_(0.2),
+            learning_rate_scale_(1.0) {
+
+            min_pre_activation_ = std::numeric_limits<LearnFloatType>::max();
+            max_pre_activation_ = std::numeric_limits<LearnFloatType>::lowest();
+
+            std::fill(std::begin(min_activations_), std::end(min_activations_),
+                      std::numeric_limits<LearnFloatType>::max());
+            std::fill(std::begin(max_activations_), std::end(max_activations_),
+                      std::numeric_limits<LearnFloatType>::lowest());
+
+            DequantizeParameters();
+        }
+
+        // Weight saturation and parameterization
+        void QuantizeParameters() {
+            for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                target_layer_->biases_[i] =
+                    Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
+            }
+
+            std::vector<TrainingFeature> training_features;
 
-  // Weight saturation and parameterization
-  void QuantizeParameters() {
-    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-      target_layer_->biases_[i] =
-          Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
-    }
-    std::vector<TrainingFeature> training_features;
 #pragma omp parallel for private(training_features)
-    for (IndexType j = 0; j < RawFeatures::kDimensions; ++j) {
-      training_features.clear();
-      Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
-          j, &training_features);
-      for (IndexType i = 0; i < kHalfDimensions; ++i) {
-        double sum = 0.0;
-        for (const auto& feature : training_features) {
-          sum += weights_[kHalfDimensions * feature.GetIndex() + i];
+            for (IndexType j = 0; j < RawFeatures::kDimensions; ++j) {
+                training_features.clear();
+                Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
+                    j, &training_features);
+
+                for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                    double sum = 0.0;
+                    for (const auto& feature : training_features) {
+                        sum += weights_[kHalfDimensions * feature.GetIndex() + i];
+                    }
+
+                    target_layer_->weights_[kHalfDimensions * j + i] =
+                        Round<typename LayerType::WeightType>(sum * kWeightScale);
+                }
+            }
         }
-        target_layer_->weights_[kHalfDimensions * j + i] =
-            Round<typename LayerType::WeightType>(sum * kWeightScale);
-      }
-    }
-  }
 
-  // read parameterized integer
-  void DequantizeParameters() {
-    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-      biases_[i] = static_cast<LearnFloatType>(
-          target_layer_->biases_[i] / kBiasScale);
-    }
-    std::fill(std::begin(weights_), std::end(weights_), +kZero);
-    for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {
-      weights_[i] = static_cast<LearnFloatType>(
-          target_layer_->weights_[i] / kWeightScale);
-    }
-    std::fill(std::begin(biases_diff_), std::end(biases_diff_), +kZero);
-  }
+        // read parameterized integer
+        void DequantizeParameters() {
+            for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                biases_[i] = static_cast<LearnFloatType>(
+                    target_layer_->biases_[i] / kBiasScale);
+            }
 
-  // Set the weight corresponding to the feature that does not appear in the learning data to 0
-  void ClearUnobservedFeatureWeights() {
-    for (IndexType i = 0; i < kInputDimensions; ++i) {
-      if (!observed_features.test(i)) {
-        std::fill(std::begin(weights_) + kHalfDimensions * i,
-                  std::begin(weights_) + kHalfDimensions * (i + 1), +kZero);
-      }
-    }
-    QuantizeParameters();
-  }
+            std::fill(std::begin(weights_), std::end(weights_), +kZero);
 
-  // Check if there are any problems with learning
-  void CheckHealth() {
-    std::cout << "INFO: observed " << observed_features.count()
-              << " (out of " << kInputDimensions << ") features" << std::endl;
+            for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {
+                weights_[i] = static_cast<LearnFloatType>(
+                    target_layer_->weights_[i] / kWeightScale);
+            }
 
-    constexpr LearnFloatType kPreActivationLimit =
-        std::numeric_limits<typename LayerType::WeightType>::max() /
-        kWeightScale;
-    std::cout << "INFO: (min, max) of pre-activations = "
-              << min_pre_activation_ << ", "
-              << max_pre_activation_ << " (limit = "
-              << kPreActivationLimit << ")" << std::endl;
+            std::fill(std::begin(biases_diff_), std::end(biases_diff_), +kZero);
+        }
 
-    const auto largest_min_activation = *std::max_element(
-        std::begin(min_activations_), std::end(min_activations_));
-    const auto smallest_max_activation = *std::min_element(
-        std::begin(max_activations_), std::end(max_activations_));
-    std::cout << "INFO: largest min activation = " << largest_min_activation
-              << ", smallest max activation = " << smallest_max_activation
-              << std::endl;
+        // Set the weight corresponding to the feature that does not appear in the learning data to 0
+        void ClearUnobservedFeatureWeights() {
+            for (IndexType i = 0; i < kInputDimensions; ++i) {
+                if (!observed_features.test(i)) {
+                    std::fill(std::begin(weights_) + kHalfDimensions * i,
+                              std::begin(weights_) + kHalfDimensions * (i + 1), +kZero);
+                }
+            }
 
-    std::fill(std::begin(min_activations_), std::end(min_activations_),
-              std::numeric_limits<LearnFloatType>::max());
-    std::fill(std::begin(max_activations_), std::end(max_activations_),
-              std::numeric_limits<LearnFloatType>::lowest());
-  }
+            QuantizeParameters();
+        }
 
-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions =
-      Features::Factorizer<RawFeatures>::GetDimensions();
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
-  static constexpr IndexType kHalfDimensions = LayerType::kHalfDimensions;
+        // Check if there are any problems with learning
+        void CheckHealth() {
+            std::cout << "INFO: observed " << observed_features.count()
+                      << " (out of " << kInputDimensions << ") features" << std::endl;
 
-  // Coefficient used for parameterization
-  static constexpr LearnFloatType kActivationScale =
-      std::numeric_limits<std::int8_t>::max();
-  static constexpr LearnFloatType kBiasScale = kActivationScale;
-  static constexpr LearnFloatType kWeightScale = kActivationScale;
+            constexpr LearnFloatType kPreActivationLimit =
+                std::numeric_limits<typename LayerType::WeightType>::max() /
+                kWeightScale;
 
-  // LearnFloatType constant
-  static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
-  static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
+            std::cout << "INFO: (min, max) of pre-activations = "
+                      << min_pre_activation_ << ", "
+                      << max_pre_activation_ << " (limit = "
+                      << kPreActivationLimit << ")" << std::endl;
 
-  // mini batch
-  const std::vector<Example>* batch_;
+            const auto largest_min_activation = *std::max_element(
+                std::begin(min_activations_), std::end(min_activations_));
+            const auto smallest_max_activation = *std::min_element(
+                std::begin(max_activations_), std::end(max_activations_));
 
-  // layer to learn
-  LayerType* const target_layer_;
+            std::cout << "INFO: largest min activation = " << largest_min_activation
+                      << ", smallest max activation = " << smallest_max_activation
+                      << std::endl;
 
-  // parameter
-  alignas(kCacheLineSize) LearnFloatType biases_[kHalfDimensions];
-  alignas(kCacheLineSize)
-      LearnFloatType weights_[kHalfDimensions * kInputDimensions];
+            std::fill(std::begin(min_activations_), std::end(min_activations_),
+                      std::numeric_limits<LearnFloatType>::max());
+            std::fill(std::begin(max_activations_), std::end(max_activations_),
+                      std::numeric_limits<LearnFloatType>::lowest());
+        }
 
-  // Buffer used for updating parameters
-  LearnFloatType biases_diff_[kHalfDimensions];
-  std::vector<LearnFloatType> gradients_;
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions =
+            Features::Factorizer<RawFeatures>::GetDimensions();
+        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+        static constexpr IndexType kHalfDimensions = LayerType::kHalfDimensions;
 
-  // Forward propagation buffer
-  std::vector<LearnFloatType> output_;
+        // Coefficient used for parameterization
+        static constexpr LearnFloatType kActivationScale =
+            std::numeric_limits<std::int8_t>::max();
+        static constexpr LearnFloatType kBiasScale = kActivationScale;
+        static constexpr LearnFloatType kWeightScale = kActivationScale;
 
-  // Features that appeared in the training data
-  std::bitset<kInputDimensions> observed_features;
+        // LearnFloatType constant
+        static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
+        static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
 
-  // hyper parameter
-  LearnFloatType momentum_;
-  LearnFloatType learning_rate_scale_;
+        // mini batch
+        const std::vector<Example>* batch_;
 
-  // Health check statistics
-  LearnFloatType min_pre_activation_;
-  LearnFloatType max_pre_activation_;
-  LearnFloatType min_activations_[kHalfDimensions];
-  LearnFloatType max_activations_[kHalfDimensions];
-};
+        // layer to learn
+        LayerType* const target_layer_;
 
-}  // namespace NNUE
+        // parameter
+        alignas(kCacheLineSize) LearnFloatType biases_[kHalfDimensions];
+        alignas(kCacheLineSize)
+            LearnFloatType weights_[kHalfDimensions * kInputDimensions];
 
-}  // namespace Eval
+        // Buffer used for updating parameters
+        LearnFloatType biases_diff_[kHalfDimensions];
+        std::vector<LearnFloatType> gradients_;
+
+        // Forward propagation buffer
+        std::vector<LearnFloatType> output_;
+
+        // Features that appeared in the training data
+        std::bitset<kInputDimensions> observed_features;
+
+        // hyper parameter
+        LearnFloatType momentum_;
+        LearnFloatType learning_rate_scale_;
+
+        // Health check statistics
+        LearnFloatType min_pre_activation_;
+        LearnFloatType max_pre_activation_;
+        LearnFloatType min_activations_[kHalfDimensions];
+        LearnFloatType max_activations_[kHalfDimensions];
+    };
+
+}  // namespace Eval::NNUE
 
 #endif
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index e2cd0c25..45dcbacc 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -1,247 +1,267 @@
-﻿// Specialization of NNUE evaluation function learning class template for InputSlice
-
-#ifndef _NNUE_TRAINER_INPUT_SLICE_H_
+﻿#ifndef _NNUE_TRAINER_INPUT_SLICE_H_
 #define _NNUE_TRAINER_INPUT_SLICE_H_
 
-#include "../../learn/learn.h"
-#include "../layers/input_slice.h"
 #include "trainer.h"
 
-namespace Eval {
+#include "learn/learn.h"
 
-namespace NNUE {
+#include "nnue/layers/input_slice.h"
 
-// Learning: Input layer
-class SharedInputTrainer {
- public:
-  // factory function
-  static std::shared_ptr<SharedInputTrainer> Create(
-      FeatureTransformer* ft) {
-    static std::shared_ptr<SharedInputTrainer> instance;
-    if (!instance) {
-      instance.reset(new SharedInputTrainer(ft));
-    }
-    ++instance->num_referrers_;
-    return instance;
-  }
+// Specialization of NNUE evaluation function learning class template for InputSlice
+namespace Eval::NNUE {
 
-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    if (num_calls_ == 0) {
-      current_operation_ = Operation::kSendMessage;
-      feature_transformer_trainer_->SendMessage(message);
-    }
-    assert(current_operation_ == Operation::kSendMessage);
-    if (++num_calls_ == num_referrers_) {
-      num_calls_ = 0;
-      current_operation_ = Operation::kNone;
-    }
-  }
+    // Learning: Input layer
+    class SharedInputTrainer {
+    public:
+        // factory function
+        static std::shared_ptr<SharedInputTrainer> Create(
+            FeatureTransformer* ft) {
 
-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    if (num_calls_ == 0) {
-      current_operation_ = Operation::kInitialize;
-      feature_transformer_trainer_->Initialize(rng);
-    }
-    assert(current_operation_ == Operation::kInitialize);
-    if (++num_calls_ == num_referrers_) {
-      num_calls_ = 0;
-      current_operation_ = Operation::kNone;
-    }
-  }
+            static std::shared_ptr<SharedInputTrainer> instance;
 
-  // forward propagation
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    if (gradients_.size() < kInputDimensions * batch.size()) {
-      gradients_.resize(kInputDimensions * batch.size());
-    }
-    batch_size_ = static_cast<IndexType>(batch.size());
-    if (num_calls_ == 0) {
-      current_operation_ = Operation::kPropagate;
-      output_ = feature_transformer_trainer_->Propagate(batch);
-    }
-    assert(current_operation_ == Operation::kPropagate);
-    if (++num_calls_ == num_referrers_) {
-      num_calls_ = 0;
-      current_operation_ = Operation::kNone;
-    }
-    return output_;
-  }
+            if (!instance) {
+                instance.reset(new SharedInputTrainer(ft));
+            }
 
-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    if (num_referrers_ == 1) {
-      feature_transformer_trainer_->Backpropagate(gradients, learning_rate);
-      return;
-    }
-    if (num_calls_ == 0) {
-      current_operation_ = Operation::kBackPropagate;
-      for (IndexType b = 0; b < batch_size_; ++b) {
-        const IndexType batch_offset = kInputDimensions * b;
-        for (IndexType i = 0; i < kInputDimensions; ++i) {
-          gradients_[batch_offset + i] = static_cast<LearnFloatType>(0.0);
+            ++instance->num_referrers_;
+
+            return instance;
         }
-      }
-    }
-    assert(current_operation_ == Operation::kBackPropagate);
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kInputDimensions * b;
-      for (IndexType i = 0; i < kInputDimensions; ++i) {
-        gradients_[batch_offset + i] += gradients[batch_offset + i];
-      }
-    }
-    if (++num_calls_ == num_referrers_) {
-      feature_transformer_trainer_->Backpropagate(
-          gradients_.data(), learning_rate);
-      num_calls_ = 0;
-      current_operation_ = Operation::kNone;
-    }
-  }
 
- private:
-  // constructor
-  SharedInputTrainer(FeatureTransformer* ft) :
-      batch_size_(0),
-      num_referrers_(0),
-      num_calls_(0),
-      current_operation_(Operation::kNone),
-      feature_transformer_trainer_(Trainer<FeatureTransformer>::Create(
-          ft)),
-      output_(nullptr) {
-  }
+        // Set options such as hyperparameters
+        void SendMessage(Message* message) {
+            if (num_calls_ == 0) {
+                current_operation_ = Operation::kSendMessage;
+                feature_transformer_trainer_->SendMessage(message);
+            }
 
-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions =
-      FeatureTransformer::kOutputDimensions;
+            assert(current_operation_ == Operation::kSendMessage);
 
-  // type of processing
-  enum class Operation {
-    kNone,
-    kSendMessage,
-    kInitialize,
-    kPropagate,
-    kBackPropagate,
-  };
+            if (++num_calls_ == num_referrers_) {
+                num_calls_ = 0;
+                current_operation_ = Operation::kNone;
+            }
+        }
 
-  // number of samples in mini-batch
-  IndexType batch_size_;
+        // Initialize the parameters with random numbers
+        template <typename RNG>
+        void Initialize(RNG& rng) {
+            if (num_calls_ == 0) {
+                current_operation_ = Operation::kInitialize;
+                feature_transformer_trainer_->Initialize(rng);
+            }
 
-  // number of layers sharing this layer as input
-  std::uint32_t num_referrers_;
+            assert(current_operation_ == Operation::kInitialize);
 
-  // Number of times the current process has been called
-  std::uint32_t num_calls_;
+            if (++num_calls_ == num_referrers_) {
+                num_calls_ = 0;
+                current_operation_ = Operation::kNone;
+            }
+        }
 
-  // current processing type
-  Operation current_operation_;
+        // forward propagation
+        const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+            if (gradients_.size() < kInputDimensions * batch.size()) {
+                gradients_.resize(kInputDimensions * batch.size());
+            }
 
-  // Trainer of input feature converter
-  const std::shared_ptr<Trainer<FeatureTransformer>>
-      feature_transformer_trainer_;
+            batch_size_ = static_cast<IndexType>(batch.size());
 
-  // pointer to output shared for forward propagation
-  const LearnFloatType* output_;
+            if (num_calls_ == 0) {
+                current_operation_ = Operation::kPropagate;
+                output_ = feature_transformer_trainer_->Propagate(batch);
+            }
 
-  // buffer for back propagation
-  std::vector<LearnFloatType> gradients_;
-};
+            assert(current_operation_ == Operation::kPropagate);
 
-// Learning: Input layer
-template <IndexType OutputDimensions, IndexType Offset>
-class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
- private:
-  // Type of layer to learn
-  using LayerType = Layers::InputSlice<OutputDimensions, Offset>;
+            if (++num_calls_ == num_referrers_) {
+                num_calls_ = 0;
+                current_operation_ = Operation::kNone;
+            }
 
- public:
-  // factory function
-  static std::shared_ptr<Trainer> Create(
-      LayerType* /*target_layer*/, FeatureTransformer* ft) {
-    return std::shared_ptr<Trainer>(new Trainer(ft));
-  }
+            return output_;
+        }
 
-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    shared_input_trainer_->SendMessage(message);
-  }
+        // backpropagation
+        void Backpropagate(const LearnFloatType* gradients,
+                           LearnFloatType learning_rate) {
 
-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    shared_input_trainer_->Initialize(rng);
-  }
+            if (num_referrers_ == 1) {
+                feature_transformer_trainer_->Backpropagate(gradients, learning_rate);
+                return;
+            }
 
-  // forward propagation
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    if (output_.size() < kOutputDimensions * batch.size()) {
-      output_.resize(kOutputDimensions * batch.size());
-      gradients_.resize(kInputDimensions * batch.size());
-    }
-    batch_size_ = static_cast<IndexType>(batch.size());
-    const auto input = shared_input_trainer_->Propagate(batch);
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType input_offset = kInputDimensions * b;
-      const IndexType output_offset = kOutputDimensions * b;
+            if (num_calls_ == 0) {
+                current_operation_ = Operation::kBackPropagate;
+                for (IndexType b = 0; b < batch_size_; ++b) {
+                    const IndexType batch_offset = kInputDimensions * b;
+                    for (IndexType i = 0; i < kInputDimensions; ++i) {
+                        gradients_[batch_offset + i] = static_cast<LearnFloatType>(0.0);
+                    }
+                }
+            }
+
+            assert(current_operation_ == Operation::kBackPropagate);
+
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType batch_offset = kInputDimensions * b;
+                for (IndexType i = 0; i < kInputDimensions; ++i) {
+                    gradients_[batch_offset + i] += gradients[batch_offset + i];
+                }
+            }
+
+            if (++num_calls_ == num_referrers_) {
+                feature_transformer_trainer_->Backpropagate(
+                    gradients_.data(), learning_rate);
+                num_calls_ = 0;
+                current_operation_ = Operation::kNone;
+            }
+        }
+
+    private:
+        // constructor
+        SharedInputTrainer(FeatureTransformer* ft) :
+            batch_size_(0),
+            num_referrers_(0),
+            num_calls_(0),
+            current_operation_(Operation::kNone),
+            feature_transformer_trainer_(Trainer<FeatureTransformer>::Create(
+                ft)),
+            output_(nullptr) {
+        }
+
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions =
+            FeatureTransformer::kOutputDimensions;
+
+        // type of processing
+        enum class Operation {
+            kNone,
+            kSendMessage,
+            kInitialize,
+            kPropagate,
+            kBackPropagate,
+        };
+
+        // number of samples in mini-batch
+        IndexType batch_size_;
+
+        // number of layers sharing this layer as input
+        std::uint32_t num_referrers_;
+
+        // Number of times the current process has been called
+        std::uint32_t num_calls_;
+
+        // current processing type
+        Operation current_operation_;
+
+        // Trainer of input feature converter
+        const std::shared_ptr<Trainer<FeatureTransformer>>
+            feature_transformer_trainer_;
+
+        // pointer to output shared for forward propagation
+        const LearnFloatType* output_;
+
+        // buffer for back propagation
+        std::vector<LearnFloatType> gradients_;
+    };
+
+    // Learning: Input layer
+    template <IndexType OutputDimensions, IndexType Offset>
+    class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
+    private:
+        // Type of layer to learn
+        using LayerType = Layers::InputSlice<OutputDimensions, Offset>;
+
+    public:
+        // factory function
+        static std::shared_ptr<Trainer> Create(
+            LayerType* /*target_layer*/, FeatureTransformer* ft) {
+
+            return std::shared_ptr<Trainer>(new Trainer(ft));
+        }
+
+        // Set options such as hyperparameters
+        void SendMessage(Message* message) {
+            shared_input_trainer_->SendMessage(message);
+        }
+
+        // Initialize the parameters with random numbers
+        template <typename RNG>
+        void Initialize(RNG& rng) {
+            shared_input_trainer_->Initialize(rng);
+        }
+
+        // forward propagation
+        const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+            if (output_.size() < kOutputDimensions * batch.size()) {
+              output_.resize(kOutputDimensions * batch.size());
+              gradients_.resize(kInputDimensions * batch.size());
+            }
+
+            batch_size_ = static_cast<IndexType>(batch.size());
+
+            const auto input = shared_input_trainer_->Propagate(batch);
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType input_offset = kInputDimensions * b;
+                const IndexType output_offset = kOutputDimensions * b;
 #if defined(USE_BLAS)
-      cblas_scopy(kOutputDimensions, &input[input_offset + Offset], 1,
-                  &output_[output_offset], 1);
+                cblas_scopy(kOutputDimensions, &input[input_offset + Offset], 1,
+                            &output_[output_offset], 1);
 #else
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        output_[output_offset + i] = input[input_offset + Offset + i];
-      }
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    output_[output_offset + i] = input[input_offset + Offset + i];
+                }
 #endif
-    }
-    return output_.data();
-  }
+            }
 
-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType input_offset = kInputDimensions * b;
-      const IndexType output_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kInputDimensions; ++i) {
-        if ((int)i < (int)Offset || i >= Offset + kOutputDimensions) {
-          gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
-        } else {
-          gradients_[input_offset + i] = gradients[output_offset + i - Offset];
+            return output_.data();
         }
-      }
-    }
-    shared_input_trainer_->Backpropagate(gradients_.data(), learning_rate);
-  }
 
- private:
-  // constructor
-  Trainer(FeatureTransformer* ft):
-      batch_size_(0),
-      shared_input_trainer_(SharedInputTrainer::Create(ft)) {
-  }
+        // backpropagation
+        void Backpropagate(const LearnFloatType* gradients,
+                           LearnFloatType learning_rate) {
 
-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions =
-      FeatureTransformer::kOutputDimensions;
-  static constexpr IndexType kOutputDimensions = OutputDimensions;
-  static_assert(Offset + kOutputDimensions <= kInputDimensions, "");
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType input_offset = kInputDimensions * b;
+                const IndexType output_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kInputDimensions; ++i) {
+                    if ((int)i < (int)Offset || i >= Offset + kOutputDimensions) {
+                        gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
+                    } else {
+                        gradients_[input_offset + i] = gradients[output_offset + i - Offset];
+                    }
+                }
+            }
+            shared_input_trainer_->Backpropagate(gradients_.data(), learning_rate);
+        }
 
-  // number of samples in mini-batch
-  IndexType batch_size_;
+    private:
+        // constructor
+        Trainer(FeatureTransformer* ft):
+            batch_size_(0),
+            shared_input_trainer_(SharedInputTrainer::Create(ft)) {
+        }
 
-  // Trainer of shared input layer
-  const std::shared_ptr<SharedInputTrainer> shared_input_trainer_;
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions =
+            FeatureTransformer::kOutputDimensions;
+        static constexpr IndexType kOutputDimensions = OutputDimensions;
+        static_assert(Offset + kOutputDimensions <= kInputDimensions, "");
 
-  // Forward propagation buffer
-  std::vector<LearnFloatType> output_;
+        // number of samples in mini-batch
+        IndexType batch_size_;
 
-  // buffer for back propagation
-  std::vector<LearnFloatType> gradients_;
-};
+        // Trainer of shared input layer
+        const std::shared_ptr<SharedInputTrainer> shared_input_trainer_;
 
-}  // namespace NNUE
+        // Forward propagation buffer
+        std::vector<LearnFloatType> output_;
 
-}  // namespace Eval
+        // buffer for back propagation
+        std::vector<LearnFloatType> gradients_;
+    };
+
+}  // namespace Eval::NNUE
 
 #endif
diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h
index 65a0b681..9904704b 100644
--- a/src/nnue/trainer/trainer_sum.h
+++ b/src/nnue/trainer/trainer_sum.h
@@ -1,186 +1,190 @@
-﻿// Specialization of NNUE evaluation function learning class template for Sum
-
-#ifndef _NNUE_TRAINER_SUM_H_
+﻿#ifndef _NNUE_TRAINER_SUM_H_
 #define _NNUE_TRAINER_SUM_H_
 
 #include "../../learn/learn.h"
 #include "../layers/sum.h"
 #include "trainer.h"
 
-namespace Eval {
+// Specialization of NNUE evaluation function learning class template for Sum
+namespace Eval::NNUE {
 
-namespace NNUE {
+    // Learning: A layer that sums the outputs of multiple layers
+    template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
+    class Trainer<Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>> :
+          Trainer<Layers::Sum<RemainingPreviousLayers...>> {
+    private:
+        // Type of layer to learn
+        using LayerType = Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>;
+        using Tail = Trainer<Layers::Sum<RemainingPreviousLayers...>>;
 
-// Learning: A layer that sums the outputs of multiple layers
-template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
-class Trainer<Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>> :
-      Trainer<Layers::Sum<RemainingPreviousLayers...>> {
- private:
-  // Type of layer to learn
-  using LayerType = Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>;
-  using Tail = Trainer<Layers::Sum<RemainingPreviousLayers...>>;
+    public:
+        // factory function
+        static std::shared_ptr<Trainer> Create(
+            LayerType* target_layer, FeatureTransformer* ft) {
 
- public:
-  // factory function
-  static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* ft) {
-    return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, ft));
-  }
+            return std::shared_ptr<Trainer>(
+                new Trainer(target_layer, ft));
+        }
 
-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    // The results of other member functions do not depend on the processing order, so
-    // Tail is processed first for the purpose of simplifying the implementation, but
-    // SendMessage processes Head first to make it easier to understand subscript correspondence
-    previous_layer_trainer_->SendMessage(message);
-    Tail::SendMessage(message);
-  }
+        // Set options such as hyperparameters
+        void SendMessage(Message* message) {
+            // The results of other member functions do not depend on the processing order, so
+            // Tail is processed first for the purpose of simplifying the implementation, but
+            // SendMessage processes Head first to make it easier to understand subscript correspondence
+            previous_layer_trainer_->SendMessage(message);
+            Tail::SendMessage(message);
+        }
 
-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    Tail::Initialize(rng);
-    previous_layer_trainer_->Initialize(rng);
-  }
+        // Initialize the parameters with random numbers
+        template <typename RNG>
+        void Initialize(RNG& rng) {
+            Tail::Initialize(rng);
+            previous_layer_trainer_->Initialize(rng);
+        }
+
+        // forward propagation
+        /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
+            batch_size_ = static_cast<IndexType>(batch.size());
+            auto output = Tail::Propagate(batch);
+            const auto head_output = previous_layer_trainer_->Propagate(batch);
 
-  // forward propagation
-  /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    batch_size_ = static_cast<IndexType>(batch.size());
-    auto output = Tail::Propagate(batch);
-    const auto head_output = previous_layer_trainer_->Propagate(batch);
 #if defined(USE_BLAS)
-    cblas_saxpy(kOutputDimensions * batch_size_, 1.0,
-                head_output, 1, output, 1);
+            cblas_saxpy(kOutputDimensions * batch_size_, 1.0,
+                        head_output, 1, output, 1);
 #else
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        output[batch_offset + i] += head_output[batch_offset + i];
-      }
-    }
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    output[batch_offset + i] += head_output[batch_offset + i];
+                }
+            }
+
 #endif
-    return output;
-  }
+            return output;
+        }
 
-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    Tail::Backpropagate(gradients, learning_rate);
-    previous_layer_trainer_->Backpropagate(gradients, learning_rate);
-  }
+        // backpropagation
+        void Backpropagate(const LearnFloatType* gradients,
+                           LearnFloatType learning_rate) {
 
- private:
-  // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* ft):
-      Tail(target_layer, ft),
-      batch_size_(0),
-      previous_layer_trainer_(Trainer<FirstPreviousLayer>::Create(
-          &target_layer->previous_layer_, ft)),
-      target_layer_(target_layer) {
-  }
+            Tail::Backpropagate(gradients, learning_rate);
+            previous_layer_trainer_->Backpropagate(gradients, learning_rate);
+        }
 
-  // number of input/output dimensions
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+    private:
+        // constructor
+        Trainer(LayerType* target_layer, FeatureTransformer* ft):
+            Tail(target_layer, ft),
+            batch_size_(0),
+            previous_layer_trainer_(Trainer<FirstPreviousLayer>::Create(
+                &target_layer->previous_layer_, ft)),
+            target_layer_(target_layer) {
+        }
 
-  // make subclass friend
-  template <typename SumLayer>
-  friend class Trainer;
+        // number of input/output dimensions
+        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
 
-  // number of samples in mini-batch
-  IndexType batch_size_;
+        // make subclass friend
+        template <typename SumLayer>
+        friend class Trainer;
 
-  // Trainer of the previous layer
-  const std::shared_ptr<Trainer<FirstPreviousLayer>> previous_layer_trainer_;
+        // number of samples in mini-batch
+        IndexType batch_size_;
 
-  // layer to learn
-  LayerType* const target_layer_;
-};
+        // Trainer of the previous layer
+        const std::shared_ptr<Trainer<FirstPreviousLayer>> previous_layer_trainer_;
+
+        // layer to learn
+        LayerType* const target_layer_;
+    };
 
 
-// Learning: Layer that takes the sum of the outputs of multiple layers (when there is one template argument)
-template <typename PreviousLayer>
-class Trainer<Layers::Sum<PreviousLayer>> {
- private:
-  // Type of layer to learn
-  using LayerType = Layers::Sum<PreviousLayer>;
+    // Learning: Layer that takes the sum of the outputs of multiple layers (when there is one template argument)
+    template <typename PreviousLayer>
+    class Trainer<Layers::Sum<PreviousLayer>> {
+    private:
+        // Type of layer to learn
+        using LayerType = Layers::Sum<PreviousLayer>;
 
- public:
-  // factory function
-  static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* ft) {
-    return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, ft));
-  }
+    public:
+        // factory function
+        static std::shared_ptr<Trainer> Create(
+            LayerType* target_layer, FeatureTransformer* ft) {
 
-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    previous_layer_trainer_->SendMessage(message);
-  }
+            return std::shared_ptr<Trainer>(
+                new Trainer(target_layer, ft));
+        }
 
-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    previous_layer_trainer_->Initialize(rng);
-  }
+        // Set options such as hyperparameters
+        void SendMessage(Message* message) {
+            previous_layer_trainer_->SendMessage(message);
+        }
+
+        // Initialize the parameters with random numbers
+        template <typename RNG>
+        void Initialize(RNG& rng) {
+            previous_layer_trainer_->Initialize(rng);
+        }
+
+        // forward propagation
+        /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
+            if (output_.size() < kOutputDimensions * batch.size()) {
+                output_.resize(kOutputDimensions * batch.size());
+            }
+
+            batch_size_ = static_cast<IndexType>(batch.size());
+            const auto output = previous_layer_trainer_->Propagate(batch);
 
-  // forward propagation
-  /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    if (output_.size() < kOutputDimensions * batch.size()) {
-      output_.resize(kOutputDimensions * batch.size());
-    }
-    batch_size_ = static_cast<IndexType>(batch.size());
-    const auto output = previous_layer_trainer_->Propagate(batch);
 #if defined(USE_BLAS)
-    cblas_scopy(kOutputDimensions * batch_size_, output, 1, &output_[0], 1);
+            cblas_scopy(kOutputDimensions * batch_size_, output, 1, &output_[0], 1);
 #else
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        output_[batch_offset + i] = output[batch_offset + i];
-      }
-    }
-#endif
-    return output_.data();
-  }
-
-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    previous_layer_trainer_->Backpropagate(gradients, learning_rate);
-  }
-
- private:
-  // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* ft) :
-      batch_size_(0),
-      previous_layer_trainer_(Trainer<PreviousLayer>::Create(
-          &target_layer->previous_layer_, ft)),
-      target_layer_(target_layer) {
-  }
-
-  // number of input/output dimensions
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
-
-  // make subclass friend
-  template <typename SumLayer>
-  friend class Trainer;
-
-  // number of samples in mini-batch
-  IndexType batch_size_;
-
-  // Trainer of the previous layer
-  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
-
-  // layer to learn
-  LayerType* const target_layer_;
-
-  // Forward propagation buffer
-  std::vector<LearnFloatType> output_;
-};
-
-}  // namespace NNUE
-
-}  // namespace Eval
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    output_[batch_offset + i] = output[batch_offset + i];
+                }
+            }
+
+#endif
+            return output_.data();
+        }
+
+        // backpropagation
+        void Backpropagate(const LearnFloatType* gradients,
+                           LearnFloatType learning_rate) {
+
+            previous_layer_trainer_->Backpropagate(gradients, learning_rate);
+        }
+
+    private:
+        // constructor
+        Trainer(LayerType* target_layer, FeatureTransformer* ft) :
+            batch_size_(0),
+            previous_layer_trainer_(Trainer<PreviousLayer>::Create(
+                &target_layer->previous_layer_, ft)),
+            target_layer_(target_layer) {
+        }
+
+        // number of input/output dimensions
+        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+
+        // make subclass friend
+        template <typename SumLayer>
+        friend class Trainer;
+
+        // number of samples in mini-batch
+        IndexType batch_size_;
+
+        // Trainer of the previous layer
+        const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+
+        // layer to learn
+        LayerType* const target_layer_;
+
+        // Forward propagation buffer
+        std::vector<LearnFloatType> output_;
+    };
+
+}  // namespace Eval::NNUE
 
 #endif

From 497f689aa360dafc7e4b5d4b702b09c524cb84b2 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 21:45:38 +0200
Subject: [PATCH 214/398] Cleanup nnue

---
 src/nnue/evaluate_nnue.cpp          |  14 +-
 src/nnue/evaluate_nnue.h            | 139 ++---
 src/nnue/evaluate_nnue_learner.cpp  | 354 +++++++------
 src/nnue/evaluate_nnue_learner.h    |  41 +-
 src/nnue/nnue_accumulator.h         |  38 +-
 src/nnue/nnue_architecture.h        |  37 +-
 src/nnue/nnue_common.h              | 152 +++---
 src/nnue/nnue_feature_transformer.h | 791 ++++++++++++++--------------
 src/nnue/nnue_test_command.cpp      | 390 +++++++-------
 src/nnue/nnue_test_command.h        |  17 +-
 10 files changed, 1003 insertions(+), 970 deletions(-)

diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index 4d8a4b66..0d504468 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -214,13 +214,13 @@ namespace Eval::NNUE {
 
     std::string eval_file = std::string(Options["EvalFile"]);
 
-    #if defined(DEFAULT_NNUE_DIRECTORY)
-    #define stringify2(x) #x
-    #define stringify(x) stringify2(x)
+#if defined(DEFAULT_NNUE_DIRECTORY)
+#define stringify2(x) #x
+#define stringify(x) stringify2(x)
     std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory , stringify(DEFAULT_NNUE_DIRECTORY) };
-    #else
+#else
     std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory };
-    #endif
+#endif
 
     for (std::string directory : dirs)
         if (eval_file_loaded != eval_file)
@@ -238,8 +238,8 @@ namespace Eval::NNUE {
             }
         }
 
-    #undef stringify2
-    #undef stringify
+#undef stringify2
+#undef stringify
   }
 
   /// NNUE::verify() verifies that the last net used was loaded successfully
diff --git a/src/nnue/evaluate_nnue.h b/src/nnue/evaluate_nnue.h
index 5335713b..e6ddc7fd 100644
--- a/src/nnue/evaluate_nnue.h
+++ b/src/nnue/evaluate_nnue.h
@@ -1,23 +1,21 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
-// header used in NNUE evaluation function
-
 #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
 #define NNUE_EVALUATE_NNUE_H_INCLUDED
 
@@ -25,79 +23,82 @@
 
 #include <memory>
 
+// header used in NNUE evaluation function
 namespace Eval::NNUE {
 
-  enum struct UseNNUEMode
-  {
-    False,
-    True,
-    Pure
-  };
+    enum struct UseNNUEMode
+    {
+        False,
+        True,
+        Pure
+    };
 
-  // Hash value of evaluation function structure
-  constexpr std::uint32_t kHashValue =
-      FeatureTransformer::GetHashValue() ^ Network::GetHashValue();
+    // Hash value of evaluation function structure
+    constexpr std::uint32_t kHashValue =
+        FeatureTransformer::GetHashValue() ^ Network::GetHashValue();
 
-  // Deleter for automating release of memory area
-  template <typename T>
-  struct AlignedDeleter {
-    void operator()(T* ptr) const {
-      ptr->~T();
-      std_aligned_free(ptr);
-    }
-  };
+    // Deleter for automating release of memory area
+    template <typename T>
+    struct AlignedDeleter {
+        void operator()(T* ptr) const {
+            ptr->~T();
+            std_aligned_free(ptr);
+        }
+    };
 
-  template <typename T>
-  struct LargePageDeleter {
-    void operator()(T* ptr) const {
-      ptr->~T();
-      aligned_large_pages_free(ptr);
-    }
-  };
+    template <typename T>
+    struct LargePageDeleter {
+        void operator()(T* ptr) const {
+            ptr->~T();
+            aligned_large_pages_free(ptr);
+        }
+    };
 
-  template <typename T>
-  using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;
+    template <typename T>
+    using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;
 
-  template <typename T>
-  using LargePagePtr = std::unique_ptr<T, LargePageDeleter<T>>;
+    template <typename T>
+    using LargePagePtr = std::unique_ptr<T, LargePageDeleter<T>>;
 
-  // Input feature converter
-  extern LargePagePtr<FeatureTransformer> feature_transformer;
+    // Input feature converter
+    extern LargePagePtr<FeatureTransformer> feature_transformer;
 
-  // Evaluation function
-  extern AlignedPtr<Network> network;
+    // Evaluation function
+    extern AlignedPtr<Network> network;
 
-  // Evaluation function file name
-  extern std::string fileName;
+    // Evaluation function file name
+    extern std::string fileName;
 
-  // Saved evaluation function file name
-  extern std::string savedfileName;
+    // Saved evaluation function file name
+    extern std::string savedfileName;
 
-  extern UseNNUEMode useNNUE;
-  extern std::string eval_file_loaded;
+    extern UseNNUEMode useNNUE;
 
-  // Get a string that represents the structure of the evaluation function
-  std::string GetArchitectureString();
+    extern std::string eval_file_loaded;
 
-  // read the header
-  bool ReadHeader(std::istream& stream,
-    std::uint32_t* hash_value, std::string* architecture);
+    // Get a string that represents the structure of the evaluation function
+    std::string GetArchitectureString();
 
-  // write the header
-  bool WriteHeader(std::ostream& stream,
-    std::uint32_t hash_value, const std::string& architecture);
+    // read the header
+    bool ReadHeader(std::istream& stream,
+        std::uint32_t* hash_value, std::string* architecture);
 
-  // read evaluation function parameters
-  bool ReadParameters(std::istream& stream);
+    // write the header
+    bool WriteHeader(std::ostream& stream,
+        std::uint32_t hash_value, const std::string& architecture);
 
-  // write evaluation function parameters
-  bool WriteParameters(std::ostream& stream);
+    // read evaluation function parameters
+    bool ReadParameters(std::istream& stream);
 
-  Value evaluate(const Position& pos);
-  bool load_eval(std::string name, std::istream& stream);
-  void init();
-  void verify_eval_file_loaded();
-  void verify_any_net_loaded();
+    // write evaluation function parameters
+    bool WriteParameters(std::ostream& stream);
+
+    Value evaluate(const Position& pos);
+    bool load_eval(std::string name, std::istream& stream);
+    void init();
+
+    void verify_eval_file_loaded();
+    void verify_any_net_loaded();
 
 }  // namespace Eval::NNUE
 
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 2d6c6db3..92ecd8d2 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -1,18 +1,10 @@
-﻿// Code for learning NNUE evaluation function
-
-#include <random>
+﻿#include <random>
 #include <fstream>
 #include <filesystem>
 
-#include "../learn/learn.h"
-
-#include "../position.h"
-#include "../uci.h"
-#include "../misc.h"
-#include "../thread_win32_osx.h"
-
 #include "evaluate_nnue.h"
 #include "evaluate_nnue_learner.h"
+
 #include "trainer/features/factorizer_feature_set.h"
 #include "trainer/features/factorizer_half_kp.h"
 #include "trainer/trainer_feature_transformer.h"
@@ -21,191 +13,207 @@
 #include "trainer/trainer_clipped_relu.h"
 #include "trainer/trainer_sum.h"
 
+#include "position.h"
+#include "uci.h"
+#include "misc.h"
+#include "thread_win32_osx.h"
+
+#include "learn/learn.h"
+
 // Learning rate scale
 double global_learning_rate;
 
+// Code for learning NNUE evaluation function
 namespace Eval::NNUE {
 
-  namespace {
+    namespace {
 
-    // learning data
-    std::vector<Example> examples;
+        // learning data
+        std::vector<Example> examples;
 
-    // Mutex for exclusive control of examples
-    std::mutex examples_mutex;
+        // Mutex for exclusive control of examples
+        std::mutex examples_mutex;
 
-    // number of samples in mini-batch
-    uint64_t batch_size;
+        // number of samples in mini-batch
+        uint64_t batch_size;
 
-    // random number generator
-    std::mt19937 rng;
+        // random number generator
+        std::mt19937 rng;
 
-    // learner
-    std::shared_ptr<Trainer<Network>> trainer;
+        // learner
+        std::shared_ptr<Trainer<Network>> trainer;
 
-    // Tell the learner options such as hyperparameters
-    void SendMessages(std::vector<Message> messages) {
-      for (auto& message : messages) {
-        trainer->SendMessage(&message);
-        assert(message.num_receivers > 0);
-      }
-    }
-
-  }  // namespace
-
-  // Initialize learning
-  void InitializeTraining(const std::string& seed) {
-    std::cout << "Initializing NN training for "
-              << GetArchitectureString() << std::endl;
-
-    assert(feature_transformer);
-    assert(network);
-    trainer = Trainer<Network>::Create(network.get(), feature_transformer.get());
-    rng.seed(PRNG(seed).rand<uint64_t>());
-
-    if (Options["SkipLoadingEval"]) {
-      trainer->Initialize(rng);
-    }
-  }
-
-  // set the number of samples in the mini-batch
-  void SetBatchSize(uint64_t size) {
-    assert(size > 0);
-    batch_size = size;
-  }
-  
-  // Set options such as hyperparameters
-  void SetOptions(const std::string& options) {
-    std::vector<Message> messages;
-    for (const auto& option : Split(options, ',')) {
-      const auto fields = Split(option, '=');
-      assert(fields.size() == 1 || fields.size() == 2);
-      if (fields.size() == 1) {
-        messages.emplace_back(fields[0]);
-      } else {
-        messages.emplace_back(fields[0], fields[1]);
-      }
-    }
-    SendMessages(std::move(messages));
-  }
-
-  // Reread the evaluation function parameters for learning from the file
-  void RestoreParameters(const std::string& dir_name) {
-    const std::string file_name = Path::Combine(dir_name, NNUE::savedfileName);
-    std::ifstream stream(file_name, std::ios::binary);
-#ifndef NDEBUG
-    bool result =
-#endif
-    ReadParameters(stream);
-#ifndef NDEBUG
-    assert(result);
-#endif
-
-    SendMessages({{"reset"}});
-  }
-
-  void FinalizeNet() {
-    SendMessages({{"clear_unobserved_feature_weights"}});
-  }
-
-  // Add 1 sample of learning data
-  void AddExample(Position& pos, Color rootColor,
-                  const Learner::PackedSfenValue& psv, double weight) {
-    Example example;
-    if (rootColor == pos.side_to_move()) {
-      example.sign = 1;
-    } else {
-      example.sign = -1;
-    }
-    example.psv = psv;
-    example.weight = weight;
-
-    Features::IndexList active_indices[2];
-    for (const auto trigger : kRefreshTriggers) {
-      RawFeatures::AppendActiveIndices(pos, trigger, active_indices);
-    }
-    if (pos.side_to_move() != WHITE) {
-      active_indices[0].swap(active_indices[1]);
-    }
-    for (const auto color : Colors) {
-      std::vector<TrainingFeature> training_features;
-      for (const auto base_index : active_indices[color]) {
-        static_assert(Features::Factorizer<RawFeatures>::GetDimensions() <
-                      (1 << TrainingFeature::kIndexBits), "");
-        Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
-            base_index, &training_features);
-      }
-      std::sort(training_features.begin(), training_features.end());
-
-      auto& unique_features = example.training_features[color];
-      for (const auto& feature : training_features) {
-        if (!unique_features.empty() &&
-            feature.GetIndex() == unique_features.back().GetIndex()) {
-          unique_features.back() += feature;
-        } else {
-          unique_features.push_back(feature);
+        // Tell the learner options such as hyperparameters
+        void SendMessages(std::vector<Message> messages) {
+            for (auto& message : messages) {
+                trainer->SendMessage(&message);
+                assert(message.num_receivers > 0);
+            }
+        }
+
+    }  // namespace
+
+    // Initialize learning
+    void InitializeTraining(const std::string& seed) {
+        std::cout << "Initializing NN training for "
+                  << GetArchitectureString() << std::endl;
+
+        assert(feature_transformer);
+        assert(network);
+        trainer = Trainer<Network>::Create(network.get(), feature_transformer.get());
+        rng.seed(PRNG(seed).rand<uint64_t>());
+
+        if (Options["SkipLoadingEval"]) {
+            trainer->Initialize(rng);
         }
-      }
     }
 
-    std::lock_guard<std::mutex> lock(examples_mutex);
-    examples.push_back(std::move(example));
-  }
-
-  // update the evaluation function parameters
-  void UpdateParameters() {
-    assert(batch_size > 0);
-
-    const auto learning_rate = static_cast<LearnFloatType>(
-        global_learning_rate / batch_size);
-
-    std::lock_guard<std::mutex> lock(examples_mutex);
-    std::shuffle(examples.begin(), examples.end(), rng);
-    while (examples.size() >= batch_size) {
-      std::vector<Example> batch(examples.end() - batch_size, examples.end());
-      examples.resize(examples.size() - batch_size);
-
-      const auto network_output = trainer->Propagate(batch);
-
-      std::vector<LearnFloatType> gradients(batch.size());
-      for (std::size_t b = 0; b < batch.size(); ++b) {
-        const auto shallow = static_cast<Value>(Round<std::int32_t>(
-            batch[b].sign * network_output[b] * kPonanzaConstant));
-        const auto& psv = batch[b].psv;
-        const double gradient = batch[b].sign * Learner::calc_grad(shallow, psv);
-        gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
-      }
-
-      trainer->Backpropagate(gradients.data(), learning_rate);
+    // set the number of samples in the mini-batch
+    void SetBatchSize(uint64_t size) {
+        assert(size > 0);
+        batch_size = size;
     }
-    SendMessages({{"quantize_parameters"}});
-  }
 
-  // Check if there are any problems with learning
-  void CheckHealth() {
-    SendMessages({{"check_health"}});
-  }
+    // Set options such as hyperparameters
+    void SetOptions(const std::string& options) {
+        std::vector<Message> messages;
+        for (const auto& option : Split(options, ',')) {
+          const auto fields = Split(option, '=');
+          assert(fields.size() == 1 || fields.size() == 2);
 
-  // save merit function parameters to a file
-  void save_eval(std::string dir_name) {
-    auto eval_dir = Path::Combine(Options["EvalSaveDir"], dir_name);
-    std::cout << "save_eval() start. folder = " << eval_dir << std::endl;
+          if (fields.size() == 1) {
+              messages.emplace_back(fields[0]);
+          } else {
+              messages.emplace_back(fields[0], fields[1]);
+          }
+        }
 
-    // mkdir() will fail if this folder already exists, but
-    // Apart from that. If not, I just want you to make it.
-    // Also, assume that the folders up to EvalSaveDir have been dug.
-    std::filesystem::create_directories(eval_dir);
+        SendMessages(std::move(messages));
+    }
 
-    const std::string file_name = Path::Combine(eval_dir, NNUE::savedfileName);
-    std::ofstream stream(file_name, std::ios::binary);
+    // Reread the evaluation function parameters for learning from the file
+    void RestoreParameters(const std::string& dir_name) {
+        const std::string file_name = Path::Combine(dir_name, NNUE::savedfileName);
+        std::ifstream stream(file_name, std::ios::binary);
 #ifndef NDEBUG
-    bool result =
+        bool result =
 #endif
-    WriteParameters(stream);
+        ReadParameters(stream);
 #ifndef NDEBUG
-    assert(result);
+        assert(result);
 #endif
 
-    std::cout << "save_eval() finished. folder = " << eval_dir << std::endl;
-  }
+        SendMessages({{"reset"}});
+    }
+
+    void FinalizeNet() {
+        SendMessages({{"clear_unobserved_feature_weights"}});
+    }
+
+    // Add 1 sample of learning data
+    void AddExample(Position& pos, Color rootColor,
+                    const Learner::PackedSfenValue& psv, double weight) {
+
+        Example example;
+        if (rootColor == pos.side_to_move()) {
+            example.sign = 1;
+        } else {
+            example.sign = -1;
+        }
+
+        example.psv = psv;
+        example.weight = weight;
+
+        Features::IndexList active_indices[2];
+        for (const auto trigger : kRefreshTriggers) {
+            RawFeatures::AppendActiveIndices(pos, trigger, active_indices);
+        }
+
+        if (pos.side_to_move() != WHITE) {
+            active_indices[0].swap(active_indices[1]);
+        }
+
+        for (const auto color : Colors) {
+            std::vector<TrainingFeature> training_features;
+            for (const auto base_index : active_indices[color]) {
+                static_assert(Features::Factorizer<RawFeatures>::GetDimensions() <
+                              (1 << TrainingFeature::kIndexBits), "");
+                Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
+                    base_index, &training_features);
+            }
+
+            std::sort(training_features.begin(), training_features.end());
+
+            auto& unique_features = example.training_features[color];
+            for (const auto& feature : training_features) {
+                if (!unique_features.empty() &&
+                    feature.GetIndex() == unique_features.back().GetIndex()) {
+
+                    unique_features.back() += feature;
+                } else {
+                    unique_features.push_back(feature);
+                }
+            }
+        }
+
+        std::lock_guard<std::mutex> lock(examples_mutex);
+        examples.push_back(std::move(example));
+    }
+
+    // update the evaluation function parameters
+    void UpdateParameters() {
+        assert(batch_size > 0);
+
+        const auto learning_rate = static_cast<LearnFloatType>(
+            global_learning_rate / batch_size);
+
+        std::lock_guard<std::mutex> lock(examples_mutex);
+        std::shuffle(examples.begin(), examples.end(), rng);
+        while (examples.size() >= batch_size) {
+            std::vector<Example> batch(examples.end() - batch_size, examples.end());
+            examples.resize(examples.size() - batch_size);
+
+            const auto network_output = trainer->Propagate(batch);
+
+            std::vector<LearnFloatType> gradients(batch.size());
+            for (std::size_t b = 0; b < batch.size(); ++b) {
+                const auto shallow = static_cast<Value>(Round<std::int32_t>(
+                    batch[b].sign * network_output[b] * kPonanzaConstant));
+                const auto& psv = batch[b].psv;
+                const double gradient = batch[b].sign * Learner::calc_grad(shallow, psv);
+                gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
+            }
+
+            trainer->Backpropagate(gradients.data(), learning_rate);
+        }
+        SendMessages({{"quantize_parameters"}});
+    }
+
+    // Check if there are any problems with learning
+    void CheckHealth() {
+        SendMessages({{"check_health"}});
+    }
+
+    // save merit function parameters to a file
+    void save_eval(std::string dir_name) {
+        auto eval_dir = Path::Combine(Options["EvalSaveDir"], dir_name);
+        std::cout << "save_eval() start. folder = " << eval_dir << std::endl;
+
+        // mkdir() will fail if this folder already exists, but
+        // Apart from that. If not, I just want you to make it.
+        // Also, assume that the folders up to EvalSaveDir have been dug.
+        std::filesystem::create_directories(eval_dir);
+
+        const std::string file_name = Path::Combine(eval_dir, NNUE::savedfileName);
+        std::ofstream stream(file_name, std::ios::binary);
+#ifndef NDEBUG
+        bool result =
+#endif
+        WriteParameters(stream);
+#ifndef NDEBUG
+        assert(result);
+#endif
+
+        std::cout << "save_eval() finished. folder = " << eval_dir << std::endl;
+    }
 }  // namespace Eval::NNUE
\ No newline at end of file
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index c41d8d6b..525b286a 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -1,37 +1,36 @@
-﻿// Interface used for learning NNUE evaluation function
-
-#ifndef _EVALUATE_NNUE_LEARNER_H_
+﻿#ifndef _EVALUATE_NNUE_LEARNER_H_
 #define _EVALUATE_NNUE_LEARNER_H_
 
-#include "../learn/learn.h"
+#include "learn/learn.h"
 
+// Interface used for learning NNUE evaluation function
 namespace Eval::NNUE {
 
-  // Initialize learning
-  void InitializeTraining(const std::string& seed);
+    // Initialize learning
+    void InitializeTraining(const std::string& seed);
 
-  // set the number of samples in the mini-batch
-  void SetBatchSize(uint64_t size);
+    // set the number of samples in the mini-batch
+    void SetBatchSize(uint64_t size);
 
-  // Set options such as hyperparameters
-  void SetOptions(const std::string& options);
+    // Set options such as hyperparameters
+    void SetOptions(const std::string& options);
 
-  // Reread the evaluation function parameters for learning from the file
-  void RestoreParameters(const std::string& dir_name);
+    // Reread the evaluation function parameters for learning from the file
+    void RestoreParameters(const std::string& dir_name);
 
-// Add 1 sample of learning data
-  void AddExample(Position& pos, Color rootColor,
-  	const Learner::PackedSfenValue& psv, double weight);
+    // Add 1 sample of learning data
+    void AddExample(Position& pos, Color rootColor,
+    	 const Learner::PackedSfenValue& psv, double weight);
 
-  // update the evaluation function parameters
-  void UpdateParameters();
+    // update the evaluation function parameters
+    void UpdateParameters();
 
-  // Check if there are any problems with learning
-  void CheckHealth();
+    // Check if there are any problems with learning
+    void CheckHealth();
 
-  void FinalizeNet();
+    void FinalizeNet();
 
-  void save_eval(std::string suffix);
+    void save_eval(std::string suffix);
 }  // namespace Eval::NNUE
 
 #endif
diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h
index 26370710..8b60dafc 100644
--- a/src/nnue/nnue_accumulator.h
+++ b/src/nnue/nnue_accumulator.h
@@ -1,36 +1,34 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
-// Class for difference calculation of NNUE evaluation function
-
 #ifndef NNUE_ACCUMULATOR_H_INCLUDED
 #define NNUE_ACCUMULATOR_H_INCLUDED
 
 #include "nnue_architecture.h"
 
+// Class for difference calculation of NNUE evaluation function
 namespace Eval::NNUE {
 
-  // Class that holds the result of affine transformation of input features
-  struct alignas(kCacheLineSize) Accumulator {
-    std::int16_t
-        accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
-    bool computed_accumulation;
-  };
+    // Class that holds the result of affine transformation of input features
+    struct alignas(kCacheLineSize) Accumulator {
+        std::int16_t accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
+        bool computed_accumulation;
+    };
 
 }  // namespace Eval::NNUE
 
diff --git a/src/nnue/nnue_architecture.h b/src/nnue/nnue_architecture.h
index 91cdc4bd..2ecb6999 100644
--- a/src/nnue/nnue_architecture.h
+++ b/src/nnue/nnue_architecture.h
@@ -1,37 +1,36 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
-// Input features and network structure used in NNUE evaluation function
-
 #ifndef NNUE_ARCHITECTURE_H_INCLUDED
 #define NNUE_ARCHITECTURE_H_INCLUDED
 
 // Defines the network structure
 #include "architectures/halfkp_256x2-32-32.h"
 
+// Input features and network structure used in NNUE evaluation function
 namespace Eval::NNUE {
 
-  static_assert(kTransformedFeatureDimensions % kMaxSimdWidth == 0, "");
-  static_assert(Network::kOutputDimensions == 1, "");
-  static_assert(std::is_same<Network::OutputType, std::int32_t>::value, "");
+    static_assert(kTransformedFeatureDimensions % kMaxSimdWidth == 0, "");
+    static_assert(Network::kOutputDimensions == 1, "");
+    static_assert(std::is_same<Network::OutputType, std::int32_t>::value, "");
 
-  // Trigger for full calculation instead of difference calculation
-  constexpr auto kRefreshTriggers = RawFeatures::kRefreshTriggers;
+    // Trigger for full calculation instead of difference calculation
+    constexpr auto kRefreshTriggers = RawFeatures::kRefreshTriggers;
 
 }  // namespace Eval::NNUE
 
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index 9975134c..70c7596d 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // Constants used in NNUE evaluation function
@@ -21,11 +21,11 @@
 #ifndef NNUE_COMMON_H_INCLUDED
 #define NNUE_COMMON_H_INCLUDED
 
+#include "types.h"
+
 #include <cstring>
 #include <iostream>
 
-#include "../types.h"
-
 #if defined(USE_AVX2)
 #include <immintrin.h>
 
@@ -70,84 +70,84 @@
 
 namespace Eval::NNUE {
 
-  // Version of the evaluation file
-  constexpr std::uint32_t kVersion = 0x7AF32F17u;
+    // Version of the evaluation file
+    constexpr std::uint32_t kVersion = 0x7AF32F17u;
 
-  // Constant used in evaluation value calculation
-  constexpr int FV_SCALE = 16;
-  constexpr int kWeightScaleBits = 6;
+    // Constant used in evaluation value calculation
+    constexpr int FV_SCALE = 16;
+    constexpr int kWeightScaleBits = 6;
 
-  // Size of cache line (in bytes)
-  constexpr std::size_t kCacheLineSize = 64;
+    // Size of cache line (in bytes)
+    constexpr std::size_t kCacheLineSize = 64;
 
-  // SIMD width (in bytes)
-  #if defined(USE_AVX2)
-  constexpr std::size_t kSimdWidth = 32;
+    // SIMD width (in bytes)
+#if defined(USE_AVX2)
+    constexpr std::size_t kSimdWidth = 32;
 
-  #elif defined(USE_SSE2)
-  constexpr std::size_t kSimdWidth = 16;
+#elif defined(USE_SSE2)
+    constexpr std::size_t kSimdWidth = 16;
 
-  #elif defined(USE_MMX)
-  constexpr std::size_t kSimdWidth = 8;
+#elif defined(USE_MMX)
+    constexpr std::size_t kSimdWidth = 8;
 
-  #elif defined(USE_NEON)
-  constexpr std::size_t kSimdWidth = 16;
-  #endif
+#elif defined(USE_NEON)
+    constexpr std::size_t kSimdWidth = 16;
+#endif
 
-  constexpr std::size_t kMaxSimdWidth = 32;
+    constexpr std::size_t kMaxSimdWidth = 32;
 
-  // unique number for each piece type on each square
-  enum {
-    PS_NONE     =  0,
-    PS_W_PAWN   =  1,
-    PS_B_PAWN   =  1 * SQUARE_NB + 1,
-    PS_W_KNIGHT =  2 * SQUARE_NB + 1,
-    PS_B_KNIGHT =  3 * SQUARE_NB + 1,
-    PS_W_BISHOP =  4 * SQUARE_NB + 1,
-    PS_B_BISHOP =  5 * SQUARE_NB + 1,
-    PS_W_ROOK   =  6 * SQUARE_NB + 1,
-    PS_B_ROOK   =  7 * SQUARE_NB + 1,
-    PS_W_QUEEN  =  8 * SQUARE_NB + 1,
-    PS_B_QUEEN  =  9 * SQUARE_NB + 1,
-    PS_W_KING   = 10 * SQUARE_NB + 1,
-    PS_END      = PS_W_KING, // pieces without kings (pawns included)
-    PS_B_KING   = 11 * SQUARE_NB + 1,
-    PS_END2     = 12 * SQUARE_NB + 1
-  };
+    // unique number for each piece type on each square
+    enum {
+        PS_NONE     =  0,
+        PS_W_PAWN   =  1,
+        PS_B_PAWN   =  1 * SQUARE_NB + 1,
+        PS_W_KNIGHT =  2 * SQUARE_NB + 1,
+        PS_B_KNIGHT =  3 * SQUARE_NB + 1,
+        PS_W_BISHOP =  4 * SQUARE_NB + 1,
+        PS_B_BISHOP =  5 * SQUARE_NB + 1,
+        PS_W_ROOK   =  6 * SQUARE_NB + 1,
+        PS_B_ROOK   =  7 * SQUARE_NB + 1,
+        PS_W_QUEEN  =  8 * SQUARE_NB + 1,
+        PS_B_QUEEN  =  9 * SQUARE_NB + 1,
+        PS_W_KING   = 10 * SQUARE_NB + 1,
+        PS_END      = PS_W_KING, // pieces without kings (pawns included)
+        PS_B_KING   = 11 * SQUARE_NB + 1,
+        PS_END2     = 12 * SQUARE_NB + 1
+    };
 
-  extern const uint32_t kpp_board_index[PIECE_NB][COLOR_NB];
+    extern const uint32_t kpp_board_index[PIECE_NB][COLOR_NB];
 
-  // Type of input feature after conversion
-  using TransformedFeatureType = std::uint8_t;
-  using IndexType = std::uint32_t;
+    // Type of input feature after conversion
+    using TransformedFeatureType = std::uint8_t;
+    using IndexType = std::uint32_t;
 
-  // Forward declaration of learning class template
-  template <typename Layer>
-  class Trainer;
+    // Forward declaration of learning class template
+    template <typename Layer>
+    class Trainer;
 
-  // Round n up to be a multiple of base
-  template <typename IntType>
-  constexpr IntType CeilToMultiple(IntType n, IntType base) {
-      return (n + base - 1) / base * base;
-  }
+    // Round n up to be a multiple of base
+    template <typename IntType>
+    constexpr IntType CeilToMultiple(IntType n, IntType base) {
+        return (n + base - 1) / base * base;
+    }
 
-  // read_little_endian() is our utility to read an integer (signed or unsigned, any size)
-  // from a stream in little-endian order. We swap the byte order after the read if
-  // necessary to return a result with the byte ordering of the compiling machine.
-  template <typename IntType>
-  inline IntType read_little_endian(std::istream& stream) {
+    // read_little_endian() is our utility to read an integer (signed or unsigned, any size)
+    // from a stream in little-endian order. We swap the byte order after the read if
+    // necessary to return a result with the byte ordering of the compiling machine.
+    template <typename IntType>
+    inline IntType read_little_endian(std::istream& stream) {
 
-      IntType result;
-      std::uint8_t u[sizeof(IntType)];
-      typename std::make_unsigned<IntType>::type v = 0;
+        IntType result;
+        std::uint8_t u[sizeof(IntType)];
+        typename std::make_unsigned<IntType>::type v = 0;
 
-      stream.read(reinterpret_cast<char*>(u), sizeof(IntType));
-      for (std::size_t i = 0; i < sizeof(IntType); ++i)
-          v = (v << 8) | u[sizeof(IntType) - i - 1];
+        stream.read(reinterpret_cast<char*>(u), sizeof(IntType));
+        for (std::size_t i = 0; i < sizeof(IntType); ++i)
+            v = (v << 8) | u[sizeof(IntType) - i - 1];
 
-      std::memcpy(&result, &v, sizeof(IntType));
-      return result;
-  }
+        std::memcpy(&result, &v, sizeof(IntType));
+        return result;
+    }
 
 }  // namespace Eval::NNUE
 
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index c9d8e0d2..2fc24dab 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // A class that converts the input features of the NNUE evaluation function
@@ -23,435 +23,450 @@
 
 #include "nnue_common.h"
 #include "nnue_architecture.h"
+
 #include "features/index_list.h"
 
-#include <cstring> // std::memset()
+#include <cstring>
+#include <string>
 
 namespace Eval::NNUE {
 
-  // If vector instructions are enabled, we update and refresh the
-  // accumulator tile by tile such that each tile fits in the CPU's
-  // vector registers.
-  #define TILING
+    // If vector instructions are enabled, we update and refresh the
+    // accumulator tile by tile such that each tile fits in the CPU's
+    // vector registers.
+#define TILING
 
-  #ifdef USE_AVX512
-  typedef __m512i vec_t;
-  #define vec_load(a) _mm512_loadA_si512(a)
-  #define vec_store(a,b) _mm512_storeA_si512(a,b)
-  #define vec_add_16(a,b) _mm512_add_epi16(a,b)
-  #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
-  #define vec_zero _mm512_setzero_si512()
-  static constexpr IndexType kNumRegs = 8; // only 8 are needed
+#ifdef USE_AVX512
+    typedef __m512i vec_t;
+#define vec_load(a) _mm512_loadA_si512(a)
+#define vec_store(a,b) _mm512_storeA_si512(a,b)
+#define vec_add_16(a,b) _mm512_add_epi16(a,b)
+#define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
+#define vec_zero _mm512_setzero_si512()
+    static constexpr IndexType kNumRegs = 8; // only 8 are needed
 
-  #elif USE_AVX2
-  typedef __m256i vec_t;
-  #define vec_load(a) _mm256_loadA_si256(a)
-  #define vec_store(a,b) _mm256_storeA_si256(a,b)
-  #define vec_add_16(a,b) _mm256_add_epi16(a,b)
-  #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
-  #define vec_zero _mm256_setzero_si256()
-  static constexpr IndexType kNumRegs = 16;
+#elif USE_AVX2
+    typedef __m256i vec_t;
+#define vec_load(a) _mm256_loadA_si256(a)
+#define vec_store(a,b) _mm256_storeA_si256(a,b)
+#define vec_add_16(a,b) _mm256_add_epi16(a,b)
+#define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
+#define vec_zero _mm256_setzero_si256()
+    static constexpr IndexType kNumRegs = 16;
 
-  #elif USE_SSE2
-  typedef __m128i vec_t;
-  #define vec_load(a) (*(a))
-  #define vec_store(a,b) *(a)=(b)
-  #define vec_add_16(a,b) _mm_add_epi16(a,b)
-  #define vec_sub_16(a,b) _mm_sub_epi16(a,b)
-  #define vec_zero _mm_setzero_si128()
-  static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;
+#elif USE_SSE2
+    typedef __m128i vec_t;
+#define vec_load(a) (*(a))
+#define vec_store(a,b) *(a)=(b)
+#define vec_add_16(a,b) _mm_add_epi16(a,b)
+#define vec_sub_16(a,b) _mm_sub_epi16(a,b)
+#define vec_zero _mm_setzero_si128()
+    static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;
 
-  #elif USE_MMX
-  typedef __m64 vec_t;
-  #define vec_load(a) (*(a))
-  #define vec_store(a,b) *(a)=(b)
-  #define vec_add_16(a,b) _mm_add_pi16(a,b)
-  #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
-  #define vec_zero _mm_setzero_si64()
-  static constexpr IndexType kNumRegs = 8;
+#elif USE_MMX
+    typedef __m64 vec_t;
+#define vec_load(a) (*(a))
+#define vec_store(a,b) *(a)=(b)
+#define vec_add_16(a,b) _mm_add_pi16(a,b)
+#define vec_sub_16(a,b) _mm_sub_pi16(a,b)
+#define vec_zero _mm_setzero_si64()
+    static constexpr IndexType kNumRegs = 8;
 
-  #elif USE_NEON
-  typedef int16x8_t vec_t;
-  #define vec_load(a) (*(a))
-  #define vec_store(a,b) *(a)=(b)
-  #define vec_add_16(a,b) vaddq_s16(a,b)
-  #define vec_sub_16(a,b) vsubq_s16(a,b)
-  #define vec_zero {0}
-  static constexpr IndexType kNumRegs = 16;
+#elif USE_NEON
+    typedef int16x8_t vec_t;
+#define vec_load(a) (*(a))
+#define vec_store(a,b) *(a)=(b)
+#define vec_add_16(a,b) vaddq_s16(a,b)
+#define vec_sub_16(a,b) vsubq_s16(a,b)
+#define vec_zero {0}
+    static constexpr IndexType kNumRegs = 16;
 
-  #else
-  #undef TILING
+#else
+#undef TILING
 
-  #endif
+#endif
 
-  // Input feature converter
-  class FeatureTransformer {
+    // Input feature converter
+    class FeatureTransformer {
 
-   private:
-    // Number of output dimensions for one side
-    static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
+    private:
+        // Number of output dimensions for one side
+        static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
 
-    #ifdef TILING
-    static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;
-    static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
-    #endif
+#ifdef TILING
+        static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;
+        static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
+#endif
 
-   public:
-    // Output type
-    using OutputType = TransformedFeatureType;
+    public:
+        // Output type
+        using OutputType = TransformedFeatureType;
 
-    // Number of input/output dimensions
-    static constexpr IndexType kInputDimensions = RawFeatures::kDimensions;
-    static constexpr IndexType kOutputDimensions = kHalfDimensions * 2;
+        // Number of input/output dimensions
+        static constexpr IndexType kInputDimensions = RawFeatures::kDimensions;
+        static constexpr IndexType kOutputDimensions = kHalfDimensions * 2;
 
-    // Size of forward propagation buffer
-    static constexpr std::size_t kBufferSize =
-        kOutputDimensions * sizeof(OutputType);
+        // Size of forward propagation buffer
+        static constexpr std::size_t kBufferSize =
+            kOutputDimensions * sizeof(OutputType);
 
-    // Hash value embedded in the evaluation file
-    static constexpr std::uint32_t GetHashValue() {
+        // Hash value embedded in the evaluation file
+        static constexpr std::uint32_t GetHashValue() {
 
-      return RawFeatures::kHashValue ^ kOutputDimensions;
-    }
-
-    // a string representing the structure
-    static std::string GetStructureString() {
-      return RawFeatures::GetName() + "[" +
-        std::to_string(kInputDimensions) + "->" +
-        std::to_string(kHalfDimensions) + "x2]";
-    }
-
-    // Read network parameters
-    bool ReadParameters(std::istream& stream) {
-
-      for (std::size_t i = 0; i < kHalfDimensions; ++i)
-        biases_[i] = read_little_endian<BiasType>(stream);
-      for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
-        weights_[i] = read_little_endian<WeightType>(stream);
-      return !stream.fail();
-    }
-
-    // write parameters
-    bool WriteParameters(std::ostream& stream) const {
-      stream.write(reinterpret_cast<const char*>(biases_),
-        kHalfDimensions * sizeof(BiasType));
-      stream.write(reinterpret_cast<const char*>(weights_),
-        kHalfDimensions * kInputDimensions * sizeof(WeightType));
-      return !stream.fail();
-    }
-
-    // Proceed with the difference calculation if possible
-    bool UpdateAccumulatorIfPossible(const Position& pos) const {
-
-      const auto now = pos.state();
-      if (now->accumulator.computed_accumulation)
-        return true;
-
-      const auto prev = now->previous;
-      if (prev && prev->accumulator.computed_accumulation) {
-        UpdateAccumulator(pos);
-        return true;
-      }
-
-      return false;
-    }
-
-    // Convert input features
-    void Transform(const Position& pos, OutputType* output) const {
-
-      if (!UpdateAccumulatorIfPossible(pos))
-        RefreshAccumulator(pos);
-
-      const auto& accumulation = pos.state()->accumulator.accumulation;
-
-  #if defined(USE_AVX2)
-      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
-      constexpr int kControl = 0b11011000;
-      const __m256i kZero = _mm256_setzero_si256();
-
-  #elif defined(USE_SSE2)
-      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
-
-  #ifdef USE_SSE41
-      const __m128i kZero = _mm_setzero_si128();
-  #else
-      const __m128i k0x80s = _mm_set1_epi8(-128);
-  #endif
-
-  #elif defined(USE_MMX)
-      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
-      const __m64 k0x80s = _mm_set1_pi8(-128);
-
-  #elif defined(USE_NEON)
-      constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-      const int8x8_t kZero = {0};
-  #endif
-
-      const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
-      for (IndexType p = 0; p < 2; ++p) {
-        const IndexType offset = kHalfDimensions * p;
-
-  #if defined(USE_AVX2)
-        auto out = reinterpret_cast<__m256i*>(&output[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m256i sum0 = _mm256_loadA_si256(
-              &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
-          __m256i sum1 = _mm256_loadA_si256(
-            &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
-          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-            sum0 = _mm256_add_epi16(sum0, reinterpret_cast<const __m256i*>(
-                accumulation[perspectives[p]][i])[j * 2 + 0]);
-            sum1 = _mm256_add_epi16(sum1, reinterpret_cast<const __m256i*>(
-                accumulation[perspectives[p]][i])[j * 2 + 1]);
-          }
-          _mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
-              _mm256_packs_epi16(sum0, sum1), kZero), kControl));
+            return RawFeatures::kHashValue ^ kOutputDimensions;
         }
 
-  #elif defined(USE_SSE2)
-        auto out = reinterpret_cast<__m128i*>(&output[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
-              accumulation[perspectives[p]][0])[j * 2 + 0]);
-          __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
-              accumulation[perspectives[p]][0])[j * 2 + 1]);
-          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-            sum0 = _mm_add_epi16(sum0, reinterpret_cast<const __m128i*>(
-                accumulation[perspectives[p]][i])[j * 2 + 0]);
-            sum1 = _mm_add_epi16(sum1, reinterpret_cast<const __m128i*>(
-                accumulation[perspectives[p]][i])[j * 2 + 1]);
-          }
-      const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
-
-          _mm_store_si128(&out[j],
-
-  #ifdef USE_SSE41
-            _mm_max_epi8(packedbytes, kZero)
-  #else
-            _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
-  #endif
-
-          );
+        // a string representing the structure
+        static std::string GetStructureString() {
+            return RawFeatures::GetName() + "[" +
+                std::to_string(kInputDimensions) + "->" +
+                std::to_string(kHalfDimensions) + "x2]";
         }
 
-  #elif defined(USE_MMX)
-        auto out = reinterpret_cast<__m64*>(&output[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m64 sum0 = *(&reinterpret_cast<const __m64*>(
-              accumulation[perspectives[p]][0])[j * 2 + 0]);
-          __m64 sum1 = *(&reinterpret_cast<const __m64*>(
-              accumulation[perspectives[p]][0])[j * 2 + 1]);
-          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-            sum0 = _mm_add_pi16(sum0, reinterpret_cast<const __m64*>(
-                accumulation[perspectives[p]][i])[j * 2 + 0]);
-            sum1 = _mm_add_pi16(sum1, reinterpret_cast<const __m64*>(
-                accumulation[perspectives[p]][i])[j * 2 + 1]);
-          }
-          const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
-          out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+        // Read network parameters
+        bool ReadParameters(std::istream& stream) {
+
+            for (std::size_t i = 0; i < kHalfDimensions; ++i)
+                biases_[i] = read_little_endian<BiasType>(stream);
+
+            for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
+                weights_[i] = read_little_endian<WeightType>(stream);
+
+            return !stream.fail();
         }
 
-  #elif defined(USE_NEON)
-        const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-          int16x8_t sum = reinterpret_cast<const int16x8_t*>(
-              accumulation[perspectives[p]][0])[j];
-          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-            sum = vaddq_s16(sum, reinterpret_cast<const int16x8_t*>(
-                accumulation[perspectives[p]][i])[j]);
-          }
-          out[j] = vmax_s8(vqmovn_s16(sum), kZero);
+        // write parameters
+        bool WriteParameters(std::ostream& stream) const {
+            stream.write(reinterpret_cast<const char*>(biases_),
+                kHalfDimensions * sizeof(BiasType));
+
+            stream.write(reinterpret_cast<const char*>(weights_),
+                kHalfDimensions * kInputDimensions * sizeof(WeightType));
+
+            return !stream.fail();
         }
 
-  #else
-        for (IndexType j = 0; j < kHalfDimensions; ++j) {
-          BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
-          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-            sum += accumulation[static_cast<int>(perspectives[p])][i][j];
-          }
-          output[offset + j] = static_cast<OutputType>(
-              std::max<int>(0, std::min<int>(127, sum)));
-        }
-  #endif
+        // Proceed with the difference calculation if possible
+        bool UpdateAccumulatorIfPossible(const Position& pos) const {
 
-      }
-  #if defined(USE_MMX)
-      _mm_empty();
-  #endif
-    }
+            const auto now = pos.state();
+            if (now->accumulator.computed_accumulation)
+                return true;
 
-   private:
-    // Calculate cumulative value without using difference calculation
-    void RefreshAccumulator(const Position& pos) const {
-
-      auto& accumulator = pos.state()->accumulator;
-      for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-        Features::IndexList active_indices[2];
-        RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
-                                         active_indices);
-        for (Color perspective : { WHITE, BLACK }) {
-    #ifdef TILING
-          for (unsigned j = 0; j < kHalfDimensions / kTileHeight; ++j) {
-            auto accTile = reinterpret_cast<vec_t*>(
-                &accumulator.accumulation[perspective][i][j * kTileHeight]);
-            vec_t acc[kNumRegs];
-
-            if (i == 0) {
-              auto biasesTile = reinterpret_cast<const vec_t*>(
-                  &biases_[j * kTileHeight]);
-              for (unsigned k = 0; k < kNumRegs; ++k)
-                acc[k] = biasesTile[k];
-            } else {
-              for (unsigned k = 0; k < kNumRegs; ++k)
-                acc[k] = vec_zero;
-            }
-            for (const auto index : active_indices[perspective]) {
-              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
-              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
-
-              for (unsigned k = 0; k < kNumRegs; ++k)
-                acc[k] = vec_add_16(acc[k], column[k]);
+            const auto prev = now->previous;
+            if (prev && prev->accumulator.computed_accumulation) {
+                UpdateAccumulator(pos);
+                return true;
             }
 
-            for (unsigned k = 0; k < kNumRegs; k++)
-              vec_store(&accTile[k], acc[k]);
-          }
-    #else
-          if (i == 0) {
-            std::memcpy(accumulator.accumulation[perspective][i], biases_,
-                        kHalfDimensions * sizeof(BiasType));
-          } else {
-            std::memset(accumulator.accumulation[perspective][i], 0,
-                        kHalfDimensions * sizeof(BiasType));
-          }
-
-          for (const auto index : active_indices[perspective]) {
-            const IndexType offset = kHalfDimensions * index;
-
-            for (IndexType j = 0; j < kHalfDimensions; ++j)
-              accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-          }
-    #endif
+            return false;
         }
 
-      }
+        // Convert input features
+        void Transform(const Position& pos, OutputType* output) const {
 
-  #if defined(USE_MMX)
-      _mm_empty();
-  #endif
+            if (!UpdateAccumulatorIfPossible(pos))
+              RefreshAccumulator(pos);
 
-      accumulator.computed_accumulation = true;
-    }
+            const auto& accumulation = pos.state()->accumulator.accumulation;
 
-    // Calculate cumulative value using difference calculation
-    void UpdateAccumulator(const Position& pos) const {
+#if defined(USE_AVX2)
+            constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+            constexpr int kControl = 0b11011000;
+            const __m256i kZero = _mm256_setzero_si256();
 
-      const auto& prev_accumulator = pos.state()->previous->accumulator;
-      auto& accumulator = pos.state()->accumulator;
-      for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-        Features::IndexList removed_indices[2], added_indices[2];
-        bool reset[2] = { false, false };
-        RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
-                                          removed_indices, added_indices, reset);
+#elif defined(USE_SSE2)
+            constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
 
-    #ifdef TILING
-        for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
-          for (Color perspective : { WHITE, BLACK }) {
-            auto accTile = reinterpret_cast<vec_t*>(
-                &accumulator.accumulation[perspective][i][j * kTileHeight]);
-            vec_t acc[kNumRegs];
+#ifdef USE_SSE41
+            const __m128i kZero = _mm_setzero_si128();
+#else
+            const __m128i k0x80s = _mm_set1_epi8(-128);
+#endif
 
-            if (reset[perspective]) {
-              if (i == 0) {
-                auto biasesTile = reinterpret_cast<const vec_t*>(
-                    &biases_[j * kTileHeight]);
-                for (unsigned k = 0; k < kNumRegs; ++k)
-                  acc[k] = biasesTile[k];
-              } else {
-                for (unsigned k = 0; k < kNumRegs; ++k)
-                  acc[k] = vec_zero;
-              }
-            } else {
-              auto prevAccTile = reinterpret_cast<const vec_t*>(
-                  &prev_accumulator.accumulation[perspective][i][j * kTileHeight]);
-              for (IndexType k = 0; k < kNumRegs; ++k)
-                acc[k] = vec_load(&prevAccTile[k]);
+#elif defined(USE_MMX)
+            constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+            const __m64 k0x80s = _mm_set1_pi8(-128);
 
-              // Difference calculation for the deactivated features
-              for (const auto index : removed_indices[perspective]) {
-                const IndexType offset = kHalfDimensions * index + j * kTileHeight;
-                auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+#elif defined(USE_NEON)
+            constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+            const int8x8_t kZero = {0};
+#endif
+
+            const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
+            for (IndexType p = 0; p < 2; ++p) {
+                const IndexType offset = kHalfDimensions * p;
+
+#if defined(USE_AVX2)
+                auto out = reinterpret_cast<__m256i*>(&output[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+                    __m256i sum0 = _mm256_loadA_si256(
+                        &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
+                    __m256i sum1 = _mm256_loadA_si256(
+                      &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
+                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+                        sum0 = _mm256_add_epi16(sum0, reinterpret_cast<const __m256i*>(
+                            accumulation[perspectives[p]][i])[j * 2 + 0]);
+                        sum1 = _mm256_add_epi16(sum1, reinterpret_cast<const __m256i*>(
+                            accumulation[perspectives[p]][i])[j * 2 + 1]);
+                    }
+
+                    _mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
+                        _mm256_packs_epi16(sum0, sum1), kZero), kControl));
+                }
+
+#elif defined(USE_SSE2)
+                auto out = reinterpret_cast<__m128i*>(&output[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+                    __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
+                        accumulation[perspectives[p]][0])[j * 2 + 0]);
+                    __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
+                        accumulation[perspectives[p]][0])[j * 2 + 1]);
+                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+                        sum0 = _mm_add_epi16(sum0, reinterpret_cast<const __m128i*>(
+                            accumulation[perspectives[p]][i])[j * 2 + 0]);
+                        sum1 = _mm_add_epi16(sum1, reinterpret_cast<const __m128i*>(
+                            accumulation[perspectives[p]][i])[j * 2 + 1]);
+                    }
+
+                    const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
+
+                    _mm_store_si128(&out[j],
+
+#ifdef USE_SSE41
+                        _mm_max_epi8(packedbytes, kZero)
+#else
+                        _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
+#endif
+
+                    );
+                }
+
+#elif defined(USE_MMX)
+                auto out = reinterpret_cast<__m64*>(&output[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+                    __m64 sum0 = *(&reinterpret_cast<const __m64*>(
+                        accumulation[perspectives[p]][0])[j * 2 + 0]);
+                    __m64 sum1 = *(&reinterpret_cast<const __m64*>(
+                        accumulation[perspectives[p]][0])[j * 2 + 1]);
+                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+                        sum0 = _mm_add_pi16(sum0, reinterpret_cast<const __m64*>(
+                            accumulation[perspectives[p]][i])[j * 2 + 0]);
+                        sum1 = _mm_add_pi16(sum1, reinterpret_cast<const __m64*>(
+                            accumulation[perspectives[p]][i])[j * 2 + 1]);
+                    }
+
+                    const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
+                    out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+                }
+
+#elif defined(USE_NEON)
+                const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+                    int16x8_t sum = reinterpret_cast<const int16x8_t*>(
+                        accumulation[perspectives[p]][0])[j];
+
+                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+                        sum = vaddq_s16(sum, reinterpret_cast<const int16x8_t*>(
+                            accumulation[perspectives[p]][i])[j]);
+                    }
+
+                    out[j] = vmax_s8(vqmovn_s16(sum), kZero);
+                }
+
+#else
+                for (IndexType j = 0; j < kHalfDimensions; ++j) {
+                    BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
+                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+                        sum += accumulation[static_cast<int>(perspectives[p])][i][j];
+                    }
+
+                    output[offset + j] = static_cast<OutputType>(
+                        std::max<int>(0, std::min<int>(127, sum)));
+                }
+#endif
 
-                for (IndexType k = 0; k < kNumRegs; ++k)
-                  acc[k] = vec_sub_16(acc[k], column[k]);
-              }
             }
-            { // Difference calculation for the activated features
-              for (const auto index : added_indices[perspective]) {
-                const IndexType offset = kHalfDimensions * index + j * kTileHeight;
-                auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
-
-                for (IndexType k = 0; k < kNumRegs; ++k)
-                  acc[k] = vec_add_16(acc[k], column[k]);
-              }
-            }
-
-            for (IndexType k = 0; k < kNumRegs; ++k)
-              vec_store(&accTile[k], acc[k]);
-          }
+#if defined(USE_MMX)
+            _mm_empty();
+#endif
         }
-    #if defined(USE_MMX)
-        _mm_empty();
-    #endif
 
-    #else
-        for (Color perspective : { WHITE, BLACK }) {
+    private:
+        // Calculate cumulative value without using difference calculation
+        void RefreshAccumulator(const Position& pos) const {
+
+            auto& accumulator = pos.state()->accumulator;
+            for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+                Features::IndexList active_indices[2];
+                RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
+                                                 active_indices);
+                for (Color perspective : { WHITE, BLACK }) {
+#ifdef TILING
+                    for (unsigned j = 0; j < kHalfDimensions / kTileHeight; ++j) {
+                        auto accTile = reinterpret_cast<vec_t*>(
+                            &accumulator.accumulation[perspective][i][j * kTileHeight]);
+                        vec_t acc[kNumRegs];
+
+                        if (i == 0) {
+                            auto biasesTile = reinterpret_cast<const vec_t*>(
+                                &biases_[j * kTileHeight]);
+                            for (unsigned k = 0; k < kNumRegs; ++k)
+                                acc[k] = biasesTile[k];
+                        } else {
+                            for (unsigned k = 0; k < kNumRegs; ++k)
+                                acc[k] = vec_zero;
+                        }
+
+                        for (const auto index : active_indices[perspective]) {
+                            const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+                            auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+                            for (unsigned k = 0; k < kNumRegs; ++k)
+                                acc[k] = vec_add_16(acc[k], column[k]);
+                        }
+
+                        for (unsigned k = 0; k < kNumRegs; k++)
+                            vec_store(&accTile[k], acc[k]);
+                    }
+#else
+                    if (i == 0) {
+                        std::memcpy(accumulator.accumulation[perspective][i], biases_,
+                                    kHalfDimensions * sizeof(BiasType));
+                    } else {
+                        std::memset(accumulator.accumulation[perspective][i], 0,
+                                    kHalfDimensions * sizeof(BiasType));
+                    }
+
+                    for (const auto index : active_indices[perspective]) {
+                        const IndexType offset = kHalfDimensions * index;
+
+                        for (IndexType j = 0; j < kHalfDimensions; ++j)
+                            accumulator.accumulation[perspective][i][j] += weights_[offset + j];
+                    }
+#endif
+                }
 
-          if (reset[perspective]) {
-            if (i == 0) {
-              std::memcpy(accumulator.accumulation[perspective][i], biases_,
-                          kHalfDimensions * sizeof(BiasType));
-            } else {
-              std::memset(accumulator.accumulation[perspective][i], 0,
-                          kHalfDimensions * sizeof(BiasType));
             }
-          } else {
-            std::memcpy(accumulator.accumulation[perspective][i],
-                        prev_accumulator.accumulation[perspective][i],
-                        kHalfDimensions * sizeof(BiasType));
-            // Difference calculation for the deactivated features
-            for (const auto index : removed_indices[perspective]) {
-              const IndexType offset = kHalfDimensions * index;
 
-              for (IndexType j = 0; j < kHalfDimensions; ++j)
-                accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
-            }
-          }
-          { // Difference calculation for the activated features
-            for (const auto index : added_indices[perspective]) {
-              const IndexType offset = kHalfDimensions * index;
+#if defined(USE_MMX)
+            _mm_empty();
+#endif
 
-              for (IndexType j = 0; j < kHalfDimensions; ++j)
-                accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-            }
-          }
+            accumulator.computed_accumulation = true;
         }
-    #endif
-      }
-      accumulator.computed_accumulation = true;
-    }
 
-    using BiasType = std::int16_t;
-    using WeightType = std::int16_t;
+        // Calculate cumulative value using difference calculation
+        void UpdateAccumulator(const Position& pos) const {
 
-    // Make the learning class a friend
-    friend class Trainer<FeatureTransformer>;
+            const auto& prev_accumulator = pos.state()->previous->accumulator;
+            auto& accumulator = pos.state()->accumulator;
+            for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+                Features::IndexList removed_indices[2], added_indices[2];
+                bool reset[2] = { false, false };
+                RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
+                                                  removed_indices, added_indices, reset);
 
-    alignas(kCacheLineSize) BiasType biases_[kHalfDimensions];
-    alignas(kCacheLineSize)
-        WeightType weights_[kHalfDimensions * kInputDimensions];
-  };
+#ifdef TILING
+                for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
+                    for (Color perspective : { WHITE, BLACK }) {
+                        auto accTile = reinterpret_cast<vec_t*>(
+                            &accumulator.accumulation[perspective][i][j * kTileHeight]);
+                        vec_t acc[kNumRegs];
+
+                        if (reset[perspective]) {
+                            if (i == 0) {
+                                auto biasesTile = reinterpret_cast<const vec_t*>(
+                                    &biases_[j * kTileHeight]);
+                                for (unsigned k = 0; k < kNumRegs; ++k)
+                                    acc[k] = biasesTile[k];
+                            } else {
+                                for (unsigned k = 0; k < kNumRegs; ++k)
+                                    acc[k] = vec_zero;
+                            }
+                        } else {
+                            auto prevAccTile = reinterpret_cast<const vec_t*>(
+                                &prev_accumulator.accumulation[perspective][i][j * kTileHeight]);
+
+                            for (IndexType k = 0; k < kNumRegs; ++k)
+                                acc[k] = vec_load(&prevAccTile[k]);
+
+                            // Difference calculation for the deactivated features
+                            for (const auto index : removed_indices[perspective]) {
+                                const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+                                auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+                                for (IndexType k = 0; k < kNumRegs; ++k)
+                                    acc[k] = vec_sub_16(acc[k], column[k]);
+                            }
+                        }
+
+                        { // Difference calculation for the activated features
+                          for (const auto index : added_indices[perspective]) {
+                              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+                              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+                              for (IndexType k = 0; k < kNumRegs; ++k)
+                                  acc[k] = vec_add_16(acc[k], column[k]);
+                          }
+                        }
+
+                        for (IndexType k = 0; k < kNumRegs; ++k)
+                          vec_store(&accTile[k], acc[k]);
+                    }
+                }
+#if defined(USE_MMX)
+                _mm_empty();
+#endif
+
+#else
+                for (Color perspective : { WHITE, BLACK }) {
+
+                    if (reset[perspective]) {
+                        if (i == 0) {
+                            std::memcpy(accumulator.accumulation[perspective][i], biases_,
+                                        kHalfDimensions * sizeof(BiasType));
+                        } else {
+                            std::memset(accumulator.accumulation[perspective][i], 0,
+                                        kHalfDimensions * sizeof(BiasType));
+                        }
+                    } else {
+                        std::memcpy(accumulator.accumulation[perspective][i],
+                                    prev_accumulator.accumulation[perspective][i],
+                                    kHalfDimensions * sizeof(BiasType));
+                        // Difference calculation for the deactivated features
+                        for (const auto index : removed_indices[perspective]) {
+                            const IndexType offset = kHalfDimensions * index;
+
+                            for (IndexType j = 0; j < kHalfDimensions; ++j)
+                                accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
+                        }
+                    }
+                    { // Difference calculation for the activated features
+                        for (const auto index : added_indices[perspective]) {
+                          const IndexType offset = kHalfDimensions * index;
+
+                          for (IndexType j = 0; j < kHalfDimensions; ++j)
+                              accumulator.accumulation[perspective][i][j] += weights_[offset + j];
+                        }
+                    }
+                }
+#endif
+            }
+            accumulator.computed_accumulation = true;
+        }
+
+        using BiasType = std::int16_t;
+        using WeightType = std::int16_t;
+
+        // Make the learning class a friend
+        friend class Trainer<FeatureTransformer>;
+
+        alignas(kCacheLineSize) BiasType biases_[kHalfDimensions];
+        alignas(kCacheLineSize)
+            WeightType weights_[kHalfDimensions * kInputDimensions];
+    };
 
 }  // namespace Eval::NNUE
 
-#endif // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
+#endif //#ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
diff --git a/src/nnue/nnue_test_command.cpp b/src/nnue/nnue_test_command.cpp
index f6f05c2e..55fa603a 100644
--- a/src/nnue/nnue_test_command.cpp
+++ b/src/nnue/nnue_test_command.cpp
@@ -1,197 +1,215 @@
-﻿// USI extended command for NNUE evaluation function
-
-#include "../thread.h"
-#include "../uci.h"
-#include "evaluate_nnue.h"
+﻿#include "evaluate_nnue.h"
 #include "nnue_test_command.h"
 
+#include "thread.h"
+#include "uci.h"
+
 #include <set>
 #include <fstream>
 
-#define ASSERT(X) { if (!(X)) { std::cout << "\nError : ASSERT(" << #X << "), " << __FILE__ << "(" << __LINE__ << "): " << __func__ << std::endl; \
- std::this_thread::sleep_for(std::chrono::microseconds(3000)); *(int*)1 =0;} }
-
-namespace Eval {
-
-namespace NNUE {
-
-namespace {
-
-// Testing RawFeatures mainly for difference calculation
-void TestFeatures(Position& pos) {
-  const std::uint64_t num_games = 1000;
-  StateInfo si;
-  pos.set(StartFEN, false, &si, Threads.main());
-  const int MAX_PLY = 256; // test up to 256 hands
-
-  StateInfo state[MAX_PLY]; // StateInfo only for the maximum number of steps
-  int ply; // Trouble from the initial phase
-
-  PRNG prng(20171128);
-
-  std::uint64_t num_moves = 0;
-  std::vector<std::uint64_t> num_updates(kRefreshTriggers.size() + 1);
-  std::vector<std::uint64_t> num_resets(kRefreshTriggers.size());
-  constexpr IndexType kUnknown = -1;
-  std::vector<IndexType> trigger_map(RawFeatures::kDimensions, kUnknown);
-  auto make_index_sets = [&](const Position& position) {
-    std::vector<std::vector<std::set<IndexType>>> index_sets(
-        kRefreshTriggers.size(), std::vector<std::set<IndexType>>(2));
-    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-      Features::IndexList active_indices[2];
-      RawFeatures::AppendActiveIndices(position, kRefreshTriggers[i],
-                                       active_indices);
-      for (const auto perspective : Colors) {
-        for (const auto index : active_indices[perspective]) {
-          ASSERT(index < RawFeatures::kDimensions);
-          ASSERT(index_sets[i][perspective].count(index) == 0);
-          ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
-          index_sets[i][perspective].insert(index);
-          trigger_map[index] = i;
-        }
-      }
-    }
-    return index_sets;
-  };
-  auto update_index_sets = [&](const Position& position, auto* index_sets) {
-    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-      Features::IndexList removed_indices[2], added_indices[2];
-      bool reset[2] = { false, false };
-      RawFeatures::AppendChangedIndices(position, kRefreshTriggers[i],
-                                        removed_indices, added_indices, reset);
-      for (const auto perspective : Colors) {
-        if (reset[perspective]) {
-          (*index_sets)[i][perspective].clear();
-          ++num_resets[i];
-        } else {
-          for (const auto index : removed_indices[perspective]) {
-            ASSERT(index < RawFeatures::kDimensions);
-            ASSERT((*index_sets)[i][perspective].count(index) == 1);
-            ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
-            (*index_sets)[i][perspective].erase(index);
-            ++num_updates.back();
-            ++num_updates[i];
-            trigger_map[index] = i;
-          }
-        }
-        for (const auto index : added_indices[perspective]) {
-          ASSERT(index < RawFeatures::kDimensions);
-          ASSERT((*index_sets)[i][perspective].count(index) == 0);
-          ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
-          (*index_sets)[i][perspective].insert(index);
-          ++num_updates.back();
-          ++num_updates[i];
-          trigger_map[index] = i;
-        }
-      }
-    }
-  };
-
-  std::cout << "feature set: " << RawFeatures::GetName()
-            << "[" << RawFeatures::kDimensions << "]" << std::endl;
-  std::cout << "start testing with random games";
-
-  for (std::uint64_t i = 0; i < num_games; ++i) {
-    auto index_sets = make_index_sets(pos);
-    for (ply = 0; ply < MAX_PLY; ++ply) {
-      MoveList<LEGAL> mg(pos); // Generate all legal hands
-
-      // There was no legal move == Clog
-      if (mg.size() == 0)
-        break;
-
-      // Randomly choose from the generated moves and advance the phase with the moves.
-      Move m = mg.begin()[prng.rand(mg.size())];
-      pos.do_move(m, state[ply]);
-
-      ++num_moves;
-      update_index_sets(pos, &index_sets);
-      ASSERT(index_sets == make_index_sets(pos));
-    }
-
-    pos.set(StartFEN, false, &si, Threads.main());
-
-    // Output'.' every 100 times (so you can see that it's progressing)
-    if ((i % 100) == 0)
-      std::cout << "." << std::flush;
-  }
-  std::cout << "passed." << std::endl;
-  std::cout << num_games << " games, " << num_moves << " moves, "
-            << num_updates.back() << " updates, "
-            << (1.0 * num_updates.back() / num_moves)
-            << " updates per move" << std::endl;
-  std::size_t num_observed_indices = 0;
-  for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-    const auto count = std::count(trigger_map.begin(), trigger_map.end(), i);
-    num_observed_indices += count;
-    std::cout << "TriggerEvent(" << static_cast<int>(kRefreshTriggers[i])
-              << "): " << count << " features ("
-              << (100.0 * count / RawFeatures::kDimensions) << "%), "
-              << num_updates[i] << " updates ("
-              << (1.0 * num_updates[i] / num_moves) << " per move), "
-              << num_resets[i] << " resets ("
-              << (100.0 * num_resets[i] / num_moves) << "%)"
-              << std::endl;
-  }
-  std::cout << "observed " << num_observed_indices << " ("
-            << (100.0 * num_observed_indices / RawFeatures::kDimensions)
-            << "% of " << RawFeatures::kDimensions
-            << ") features" << std::endl;
+#define ASSERT(X) { \
+    if (!(X)) { \
+        std::cout \
+            << "\nError : ASSERT(" << #X << "), " \
+            << __FILE__ << "(" << __LINE__ << "): " \
+            << __func__ << std::endl; \
+            std::this_thread::sleep_for(std::chrono::microseconds(3000)); \
+            *(int*)1 =0; \
+    } \
 }
 
-// Output a string that represents the structure of the evaluation function
-void PrintInfo(std::istream& stream) {
-  std::cout << "network architecture: " << GetArchitectureString() << std::endl;
-
-  while (true) {
-    std::string file_name;
-    stream >> file_name;
-    if (file_name.empty()) break;
-
-    std::uint32_t hash_value;
-    std::string architecture;
-    const bool success = [&]() {
-      std::ifstream file_stream(file_name, std::ios::binary);
-      if (!file_stream) return false;
-      if (!ReadHeader(file_stream, &hash_value, &architecture)) return false;
-      return true;
-    }();
-
-    std::cout << file_name << ": ";
-    if (success) {
-      if (hash_value == kHashValue) {
-        std::cout << "matches with this binary";
-        if (architecture != GetArchitectureString()) {
-          std::cout << ", but architecture string differs: " << architecture;
-        }
-        std::cout << std::endl;
-      } else {
-        std::cout << architecture << std::endl;
-      }
-    } else {
-      std::cout << "failed to read header" << std::endl;
-    }
-  }
-}
-
-}  // namespace
-
 // USI extended command for NNUE evaluation function
-void TestCommand(Position& pos, std::istream& stream) {
-  std::string sub_command;
-  stream >> sub_command;
+namespace Eval::NNUE {
 
-  if (sub_command == "test_features") {
-    TestFeatures(pos);
-  } else if (sub_command == "info") {
-    PrintInfo(stream);
-  } else {
-    std::cout << "usage:" << std::endl;
-    std::cout << " test nnue test_features" << std::endl;
-    std::cout << " test nnue info [path/to/" << fileName << "...]" << std::endl;
-  }
-}
+    namespace {
 
-}  // namespace NNUE
+        // Testing RawFeatures mainly for difference calculation
+        void TestFeatures(Position& pos) {
+            const std::uint64_t num_games = 1000;
+            StateInfo si;
+            pos.set(StartFEN, false, &si, Threads.main());
+            const int MAX_PLY = 256; // test up to 256 hands
 
-}  // namespace Eval
+            StateInfo state[MAX_PLY]; // StateInfo only for the maximum number of steps
+            int ply; // Trouble from the initial phase
+
+            PRNG prng(20171128);
+
+            std::uint64_t num_moves = 0;
+            std::vector<std::uint64_t> num_updates(kRefreshTriggers.size() + 1);
+            std::vector<std::uint64_t> num_resets(kRefreshTriggers.size());
+            constexpr IndexType kUnknown = -1;
+            std::vector<IndexType> trigger_map(RawFeatures::kDimensions, kUnknown);
+
+            auto make_index_sets = [&](const Position& position) {
+                std::vector<std::vector<std::set<IndexType>>> index_sets(
+                    kRefreshTriggers.size(), std::vector<std::set<IndexType>>(2));
+
+                for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+                    Features::IndexList active_indices[2];
+                    RawFeatures::AppendActiveIndices(position, kRefreshTriggers[i],
+                                                     active_indices);
+
+                    for (const auto perspective : Colors) {
+                        for (const auto index : active_indices[perspective]) {
+                            ASSERT(index < RawFeatures::kDimensions);
+                            ASSERT(index_sets[i][perspective].count(index) == 0);
+                            ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
+                            index_sets[i][perspective].insert(index);
+                            trigger_map[index] = i;
+                        }
+                    }
+                }
+
+                return index_sets;
+            };
+
+            auto update_index_sets = [&](const Position& position, auto* index_sets) {
+                for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+                    Features::IndexList removed_indices[2], added_indices[2];
+                    bool reset[2] = { false, false };
+                    RawFeatures::AppendChangedIndices(position, kRefreshTriggers[i],
+                                                      removed_indices, added_indices, reset);
+                    for (const auto perspective : Colors) {
+                        if (reset[perspective]) {
+                            (*index_sets)[i][perspective].clear();
+                            ++num_resets[i];
+                        } else {
+                            for (const auto index : removed_indices[perspective]) {
+                                ASSERT(index < RawFeatures::kDimensions);
+                                ASSERT((*index_sets)[i][perspective].count(index) == 1);
+                                ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
+                                (*index_sets)[i][perspective].erase(index);
+                                ++num_updates.back();
+                                ++num_updates[i];
+                                trigger_map[index] = i;
+                            }
+                        }
+
+                        for (const auto index : added_indices[perspective]) {
+                            ASSERT(index < RawFeatures::kDimensions);
+                            ASSERT((*index_sets)[i][perspective].count(index) == 0);
+                            ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
+                            (*index_sets)[i][perspective].insert(index);
+                            ++num_updates.back();
+                            ++num_updates[i];
+                            trigger_map[index] = i;
+                        }
+                    }
+                }
+            };
+
+            std::cout << "feature set: " << RawFeatures::GetName()
+                      << "[" << RawFeatures::kDimensions << "]" << std::endl;
+            std::cout << "start testing with random games";
+
+            for (std::uint64_t i = 0; i < num_games; ++i) {
+                auto index_sets = make_index_sets(pos);
+                for (ply = 0; ply < MAX_PLY; ++ply) {
+                    MoveList<LEGAL> mg(pos); // Generate all legal hands
+
+                    // There was no legal move == Clog
+                    if (mg.size() == 0)
+                        break;
+
+                    // Randomly choose from the generated moves and advance the phase with the moves.
+                    Move m = mg.begin()[prng.rand(mg.size())];
+                    pos.do_move(m, state[ply]);
+
+                    ++num_moves;
+                    update_index_sets(pos, &index_sets);
+                    ASSERT(index_sets == make_index_sets(pos));
+                }
+
+                pos.set(StartFEN, false, &si, Threads.main());
+
+                // Output'.' every 100 times (so you can see that it's progressing)
+                if ((i % 100) == 0)
+                    std::cout << "." << std::flush;
+            }
+
+            std::cout << "passed." << std::endl;
+            std::cout << num_games << " games, " << num_moves << " moves, "
+                      << num_updates.back() << " updates, "
+                      << (1.0 * num_updates.back() / num_moves)
+                      << " updates per move" << std::endl;
+            std::size_t num_observed_indices = 0;
+
+            for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+                const auto count = std::count(trigger_map.begin(), trigger_map.end(), i);
+                num_observed_indices += count;
+                std::cout << "TriggerEvent(" << static_cast<int>(kRefreshTriggers[i])
+                          << "): " << count << " features ("
+                          << (100.0 * count / RawFeatures::kDimensions) << "%), "
+                          << num_updates[i] << " updates ("
+                          << (1.0 * num_updates[i] / num_moves) << " per move), "
+                          << num_resets[i] << " resets ("
+                          << (100.0 * num_resets[i] / num_moves) << "%)"
+                          << std::endl;
+            }
+            std::cout << "observed " << num_observed_indices << " ("
+                      << (100.0 * num_observed_indices / RawFeatures::kDimensions)
+                      << "% of " << RawFeatures::kDimensions
+                      << ") features" << std::endl;
+        }
+
+        // Output a string that represents the structure of the evaluation function
+        void PrintInfo(std::istream& stream) {
+            std::cout << "network architecture: " << GetArchitectureString() << std::endl;
+
+            while (true) {
+                std::string file_name;
+                stream >> file_name;
+                if (file_name.empty())
+                    break;
+
+                std::uint32_t hash_value;
+                std::string architecture;
+                const bool success = [&]() {
+                    std::ifstream file_stream(file_name, std::ios::binary);
+
+                    if (!file_stream)
+                        return false;
+                    if (!ReadHeader(file_stream, &hash_value, &architecture))
+                        return false;
+
+                    return true;
+                }();
+
+                std::cout << file_name << ": ";
+                if (success) {
+                    if (hash_value == kHashValue) {
+                        std::cout << "matches with this binary";
+                        if (architecture != GetArchitectureString()) {
+                            std::cout << ", but architecture string differs: " << architecture;
+                        }
+
+                        std::cout << std::endl;
+                    } else {
+                        std::cout << architecture << std::endl;
+                    }
+                } else {
+                    std::cout << "failed to read header" << std::endl;
+                }
+            }
+        }
+
+    }  // namespace
+
+    // USI extended command for NNUE evaluation function
+    void TestCommand(Position& pos, std::istream& stream) {
+        std::string sub_command;
+        stream >> sub_command;
+
+        if (sub_command == "test_features") {
+            TestFeatures(pos);
+        } else if (sub_command == "info") {
+            PrintInfo(stream);
+        } else {
+            std::cout << "usage:" << std::endl;
+            std::cout << " test nnue test_features" << std::endl;
+            std::cout << " test nnue info [path/to/" << fileName << "...]" << std::endl;
+        }
+    }
+
+}  // namespace Eval::NNUE
diff --git a/src/nnue/nnue_test_command.h b/src/nnue/nnue_test_command.h
index 75d33e82..989731d6 100644
--- a/src/nnue/nnue_test_command.h
+++ b/src/nnue/nnue_test_command.h
@@ -1,17 +1,12 @@
-﻿// USI extended command interface for NNUE evaluation function
-
-#ifndef _NNUE_TEST_COMMAND_H_
+﻿#ifndef _NNUE_TEST_COMMAND_H_
 #define _NNUE_TEST_COMMAND_H_
 
-namespace Eval {
+// USI extended command interface for NNUE evaluation function
+namespace Eval::NNUE {
 
-namespace NNUE {
+    // USI extended command for NNUE evaluation function
+    void TestCommand(Position& pos, std::istream& stream);
 
-// USI extended command for NNUE evaluation function
-void TestCommand(Position& pos, std::istream& stream);
-
-}  // namespace NNUE
-
-}  // namespace Eval
+}  // namespace Eval::NNUE
 
 #endif

From 77624addf2763de1418162a1ed34527dadc83da5 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 21:46:40 +0200
Subject: [PATCH 215/398] Cleanup last ".." in include paths.

---
 src/nnue/trainer/trainer_sum.h |  6 ++++--
 src/syzygy/tbprobe.cpp         | 12 ++++++------
 src/syzygy/tbprobe.h           |  2 +-
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h
index 9904704b..24fc6152 100644
--- a/src/nnue/trainer/trainer_sum.h
+++ b/src/nnue/trainer/trainer_sum.h
@@ -1,10 +1,12 @@
 ﻿#ifndef _NNUE_TRAINER_SUM_H_
 #define _NNUE_TRAINER_SUM_H_
 
-#include "../../learn/learn.h"
-#include "../layers/sum.h"
 #include "trainer.h"
 
+#include "learn/learn.h"
+
+#include "nnue/layers/sum.h"
+
 // Specialization of NNUE evaluation function learning class template for Sum
 namespace Eval::NNUE {
 
diff --git a/src/syzygy/tbprobe.cpp b/src/syzygy/tbprobe.cpp
index f4b9447f..191986da 100644
--- a/src/syzygy/tbprobe.cpp
+++ b/src/syzygy/tbprobe.cpp
@@ -28,12 +28,12 @@
 #include <type_traits>
 #include <mutex>
 
-#include "../bitboard.h"
-#include "../movegen.h"
-#include "../position.h"
-#include "../search.h"
-#include "../types.h"
-#include "../uci.h"
+#include "bitboard.h"
+#include "movegen.h"
+#include "position.h"
+#include "search.h"
+#include "types.h"
+#include "uci.h"
 
 #include "tbprobe.h"
 
diff --git a/src/syzygy/tbprobe.h b/src/syzygy/tbprobe.h
index b998989b..efc4b6b7 100644
--- a/src/syzygy/tbprobe.h
+++ b/src/syzygy/tbprobe.h
@@ -21,7 +21,7 @@
 
 #include <ostream>
 
-#include "../search.h"
+#include "search.h"
 
 namespace Tablebases {
 

From 9023edc3c864e1932cab7cec7a1608c5d7dce27a Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 21:48:31 +0200
Subject: [PATCH 216/398] Add missing includes.

---
 src/nnue/evaluate_nnue.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/nnue/evaluate_nnue.h b/src/nnue/evaluate_nnue.h
index e6ddc7fd..264d24fe 100644
--- a/src/nnue/evaluate_nnue.h
+++ b/src/nnue/evaluate_nnue.h
@@ -21,6 +21,8 @@
 
 #include "nnue_feature_transformer.h"
 
+#include "misc.h"
+
 #include <memory>
 
 // header used in NNUE evaluation function

From 69ea3d30b241b268cc5b521ce6b6a6c6274c94e9 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 21:58:25 +0200
Subject: [PATCH 217/398] Move the extra new line to after check health.

---
 src/learn/learn.cpp                            | 4 ----
 src/nnue/trainer/trainer_feature_transformer.h | 2 +-
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 95cbe4bb..205b9220 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -903,10 +903,6 @@ namespace Learner
                     << " , learn_entropy = " << learn_sum_entropy / done
                     << endl;
             }
-
-            // Bigger space between progress reports so that they can be more
-            // easly disinguished. Looking for timestamps is hard.
-            cout << endl;
         }
         else
         {
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index f403e413..4173f46d 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -349,7 +349,7 @@ namespace Eval::NNUE {
 
             std::cout << "INFO: largest min activation = " << largest_min_activation
                       << ", smallest max activation = " << smallest_max_activation
-                      << std::endl;
+                      << std::endl << std::endl;
 
             std::fill(std::begin(min_activations_), std::end(min_activations_),
                       std::numeric_limits<LearnFloatType>::max());

From 2398d34e87226df0244fda050440f6a63115b79f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 22:35:35 +0200
Subject: [PATCH 218/398] Move string split to misc

---
 src/misc.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/misc.h b/src/misc.h
index 6696b0a8..ae1d69d4 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -30,6 +30,7 @@
 #include <utility>
 #include <cmath>
 #include <cctype>
+#include <sstream>
 
 #include "types.h"
 
@@ -273,6 +274,19 @@ namespace Algo {
         for (uint64_t i = 0; i < size; ++i)
             std::swap(buf[i], buf[prng.rand(size - i) + i]);
     }
+
+    // split the string
+    inline std::vector<std::string> split(const std::string& input, char delimiter) {
+        std::istringstream stream(input);
+        std::string field;
+        std::vector<std::string> fields;
+
+        while (std::getline(stream, field, delimiter)) {
+            fields.push_back(field);
+        }
+
+        return fields;
+    }
 }
 
 // --------------------

From 146a6b056ed2daef9a06da0ae28ce5bcdb351dbf Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 22:42:58 +0200
Subject: [PATCH 219/398] PascalCase -> snake_case for consistency with the
 rest of the codebase.

---
 src/learn/learn.cpp                           |  28 +-
 src/misc.h                                    |   4 +-
 src/nnue/evaluate_nnue.cpp                    | 547 ++++++++++--------
 src/nnue/evaluate_nnue.h                      |  12 +-
 src/nnue/evaluate_nnue_learner.cpp            |  64 +-
 src/nnue/evaluate_nnue_learner.h              |  21 +-
 src/nnue/features/castling_right.cpp          |  15 +-
 src/nnue/features/castling_right.h            |  11 +-
 src/nnue/features/enpassant.cpp               |  14 +-
 src/nnue/features/enpassant.h                 |  11 +-
 src/nnue/features/feature_set.h               |  75 ++-
 src/nnue/features/half_kp.cpp                 |  33 +-
 src/nnue/features/half_kp.h                   |  15 +-
 src/nnue/features/half_relative_kp.cpp        |  35 +-
 src/nnue/features/half_relative_kp.h          |  17 +-
 src/nnue/features/k.cpp                       |  22 +-
 src/nnue/features/k.h                         |  51 +-
 src/nnue/features/p.cpp                       |  22 +-
 src/nnue/features/p.h                         |  51 +-
 src/nnue/layers/affine_transform.h            |  26 +-
 src/nnue/layers/clipped_relu.h                |  22 +-
 src/nnue/layers/input_slice.h                 |  10 +-
 src/nnue/layers/sum.h                         |  64 +-
 src/nnue/nnue_common.h                        |   2 +-
 src/nnue/nnue_feature_transformer.h           |  32 +-
 src/nnue/nnue_test_command.cpp                |  22 +-
 src/nnue/nnue_test_command.h                  |   2 +-
 src/nnue/trainer/features/factorizer.h        |  22 +-
 .../trainer/features/factorizer_feature_set.h |  34 +-
 .../trainer/features/factorizer_half_kp.h     |  24 +-
 src/nnue/trainer/trainer.h                    |  33 +-
 src/nnue/trainer/trainer_affine_transform.h   |  44 +-
 src/nnue/trainer/trainer_clipped_relu.h       |  26 +-
 .../trainer/trainer_feature_transformer.h     |  78 +--
 src/nnue/trainer/trainer_input_slice.h        |  42 +-
 src/nnue/trainer/trainer_sum.h                |  48 +-
 src/uci.cpp                                   |   2 +-
 37 files changed, 844 insertions(+), 737 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 205b9220..dfbba391 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -964,7 +964,7 @@ namespace Learner
 
                         // Lock the evaluation function so that it is not used during updating.
                         lock_guard<shared_timed_mutex> write_lock(nn_mutex);
-                        Eval::NNUE::UpdateParameters();
+                        Eval::NNUE::update_parameters();
                     }
 
                     ++epoch;
@@ -998,7 +998,7 @@ namespace Learner
                         // loss calculation
                         calc_loss(thread_id, done);
 
-                        Eval::NNUE::CheckHealth();
+                        Eval::NNUE::check_health();
 
                         // Make a note of how far you have totaled.
                         sr.last_done = sr.total_done;
@@ -1127,7 +1127,7 @@ namespace Learner
                 learn_sum_entropy_win += learn_entropy_win;
                 learn_sum_entropy += learn_entropy;
 
-                Eval::NNUE::AddExample(pos, rootColor, ps, 1.0);
+                Eval::NNUE::add_example(pos, rootColor, ps, 1.0);
 
                 // Since the processing is completed, the counter of the processed number is incremented
                 sr.total_done++;
@@ -1194,7 +1194,7 @@ namespace Learner
                 {
                     cout << " < best (" << best_loss << "), accepted" << endl;
                     best_loss = latest_loss;
-                    best_nn_directory = Path::Combine((std::string)Options["EvalSaveDir"], dir_name);
+                    best_nn_directory = Path::combine((std::string)Options["EvalSaveDir"], dir_name);
                     trials = newbob_num_trials;
 
                     if (tot >= last_lr_drop + auto_lr_drop)
@@ -1207,13 +1207,13 @@ namespace Learner
                 {
                     cout << " < best (" << best_loss << "), accepted" << endl;
                     best_loss = latest_loss;
-                    best_nn_directory = Path::Combine((std::string)Options["EvalSaveDir"], dir_name);
+                    best_nn_directory = Path::combine((std::string)Options["EvalSaveDir"], dir_name);
                     trials = newbob_num_trials;
                 }
                 else
                 {
                     cout << " >= best (" << best_loss << "), rejected" << endl;
-                    best_nn_directory = Path::Combine((std::string)Options["EvalSaveDir"], dir_name);
+                    best_nn_directory = Path::combine((std::string)Options["EvalSaveDir"], dir_name);
 
                     if (--trials > 0 && !is_final)
                     {
@@ -1713,14 +1713,14 @@ namespace Learner
         // Display learning game file
         if (target_dir != "")
         {
-            string kif_base_dir = Path::Combine(base_dir, target_dir);
+            string kif_base_dir = Path::combine(base_dir, target_dir);
 
             namespace sys = std::filesystem;
             sys::path p(kif_base_dir); // Origin of enumeration
             std::for_each(sys::directory_iterator(p), sys::directory_iterator(),
                 [&](const sys::path& path) {
                     if (sys::is_regular_file(path))
-                        filenames.push_back(Path::Combine(target_dir, path.filename().generic_string()));
+                        filenames.push_back(Path::combine(target_dir, path.filename().generic_string()));
                 });
         }
 
@@ -1814,7 +1814,7 @@ namespace Learner
             // order so I'll reverse it here. I'm sorry.
             for (auto it = filenames.rbegin(); it != filenames.rend(); ++it)
             {
-                sr.filenames.push_back(Path::Combine(base_dir, *it));
+                sr.filenames.push_back(Path::combine(base_dir, *it));
             }
         }
 
@@ -1858,9 +1858,9 @@ namespace Learner
         set_learning_search_limits();
 
         cout << "init_training.." << endl;
-        Eval::NNUE::InitializeTraining(seed);
-        Eval::NNUE::SetBatchSize(nn_batch_size);
-        Eval::NNUE::SetOptions(nn_options);
+        Eval::NNUE::initialize_training(seed);
+        Eval::NNUE::set_batch_size(nn_batch_size);
+        Eval::NNUE::set_options(nn_options);
         if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
             // Save the current net to [EvalSaveDir]\original.
             Eval::NNUE::save_eval("original");
@@ -1868,7 +1868,7 @@ namespace Learner
             // Set the folder above to best_nn_directory so that the trainer can
             // resotre the network parameters from the original net file.
             learn_think.best_nn_directory =
-                Path::Combine(Options["EvalSaveDir"], "original");
+                Path::combine(Options["EvalSaveDir"], "original");
         }
 
         cout << "init done." << endl;
@@ -1925,7 +1925,7 @@ namespace Learner
         // Start learning.
         learn_think.go_think();
 
-        Eval::NNUE::FinalizeNet();
+        Eval::NNUE::finalize_net();
 
         // Save once at the end.
         learn_think.save(true);
diff --git a/src/misc.h b/src/misc.h
index ae1d69d4..320eea76 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -299,7 +299,7 @@ struct Path
 {
 	// Combine the path name and file name and return it.
 	// If the folder name is not an empty string, append it if there is no'/' or'\\' at the end.
-	static std::string Combine(const std::string& folder, const std::string& filename)
+	static std::string combine(const std::string& folder, const std::string& filename)
 	{
 		if (folder.length() >= 1 && *folder.rbegin() != '/' && *folder.rbegin() != '\\')
 			return folder + "/" + filename;
@@ -308,7 +308,7 @@ struct Path
 	}
 
 	// Get the file name part (excluding the folder name) from the full path expression.
-	static std::string GetFileName(const std::string& path)
+	static std::string get_file_name(const std::string& path)
 	{
 		// I don't know which "\" or "/" is used.
 		auto path_index1 = path.find_last_of("\\") + 1;
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index 0d504468..67398f81 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -1,303 +1,338 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // Code for calculating NNUE evaluation function
 
+#include "evaluate_nnue.h"
+
+#include "position.h"
+#include "misc.h"
+#include "uci.h"
+#include "types.h"
+
 #include <iostream>
 #include <string>
 #include <fstream>
 #include <set>
 
-#include "../position.h"
-#include "../misc.h"
-#include "../uci.h"
-#include "../types.h"
-
-#include "evaluate_nnue.h"
-
 namespace Eval::NNUE {
 
-  const uint32_t kpp_board_index[PIECE_NB][COLOR_NB] = {
-   // convention: W - us, B - them
-   // viewed from other side, W and B are reversed
-      { PS_NONE,     PS_NONE     },
-      { PS_W_PAWN,   PS_B_PAWN   },
-      { PS_W_KNIGHT, PS_B_KNIGHT },
-      { PS_W_BISHOP, PS_B_BISHOP },
-      { PS_W_ROOK,   PS_B_ROOK   },
-      { PS_W_QUEEN,  PS_B_QUEEN  },
-      { PS_W_KING,   PS_B_KING   },
-      { PS_NONE,     PS_NONE     },
-      { PS_NONE,     PS_NONE     },
-      { PS_B_PAWN,   PS_W_PAWN   },
-      { PS_B_KNIGHT, PS_W_KNIGHT },
-      { PS_B_BISHOP, PS_W_BISHOP },
-      { PS_B_ROOK,   PS_W_ROOK   },
-      { PS_B_QUEEN,  PS_W_QUEEN  },
-      { PS_B_KING,   PS_W_KING   },
-      { PS_NONE,     PS_NONE     }
-  };
+    const uint32_t kpp_board_index[PIECE_NB][COLOR_NB] = {
+        // convention: W - us, B - them
+        // viewed from other side, W and B are reversed
+        { PS_NONE,     PS_NONE     },
+        { PS_W_PAWN,   PS_B_PAWN   },
+        { PS_W_KNIGHT, PS_B_KNIGHT },
+        { PS_W_BISHOP, PS_B_BISHOP },
+        { PS_W_ROOK,   PS_B_ROOK   },
+        { PS_W_QUEEN,  PS_B_QUEEN  },
+        { PS_W_KING,   PS_B_KING   },
+        { PS_NONE,     PS_NONE     },
+        { PS_NONE,     PS_NONE     },
+        { PS_B_PAWN,   PS_W_PAWN   },
+        { PS_B_KNIGHT, PS_W_KNIGHT },
+        { PS_B_BISHOP, PS_W_BISHOP },
+        { PS_B_ROOK,   PS_W_ROOK   },
+        { PS_B_QUEEN,  PS_W_QUEEN  },
+        { PS_B_KING,   PS_W_KING   },
+        { PS_NONE,     PS_NONE     }
+    };
 
-  // Input feature converter
-  LargePagePtr<FeatureTransformer> feature_transformer;
+    // Input feature converter
+    LargePagePtr<FeatureTransformer> feature_transformer;
 
-  // Evaluation function
-  AlignedPtr<Network> network;
+    // Evaluation function
+    AlignedPtr<Network> network;
 
-  // Evaluation function file name
-  std::string fileName;
+    // Evaluation function file name
+    std::string fileName;
 
-  // Saved evaluation function file name
-  std::string savedfileName = "nn.bin";
+    // Saved evaluation function file name
+    std::string savedfileName = "nn.bin";
 
-  // Get a string that represents the structure of the evaluation function
-  std::string GetArchitectureString() {
-    return "Features=" + FeatureTransformer::GetStructureString() +
-      ",Network=" + Network::GetStructureString();
-  }
-
-  UseNNUEMode useNNUE;
-  std::string eval_file_loaded = "None";
-
-  namespace Detail {
-
-  // Initialize the evaluation function parameters
-  template <typename T>
-  void Initialize(AlignedPtr<T>& pointer) {
-
-    pointer.reset(reinterpret_cast<T*>(std_aligned_alloc(alignof(T), sizeof(T))));
-    std::memset(pointer.get(), 0, sizeof(T));
-  }
-
-  template <typename T>
-  void Initialize(LargePagePtr<T>& pointer) {
-
-    static_assert(alignof(T) <= 4096, "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
-    pointer.reset(reinterpret_cast<T*>(aligned_large_pages_alloc(sizeof(T))));
-    std::memset(pointer.get(), 0, sizeof(T));
-  }
-
-  // Read evaluation function parameters
-  template <typename T>
-  bool ReadParameters(std::istream& stream, T& reference) {
-
-    std::uint32_t header;
-    header = read_little_endian<std::uint32_t>(stream);
-    if (!stream || header != T::GetHashValue()) return false;
-    return reference.ReadParameters(stream);
-  }
-
-  // write evaluation function parameters
-  template <typename T>
-  bool WriteParameters(std::ostream& stream, const AlignedPtr<T>& pointer) {
-    constexpr std::uint32_t header = T::GetHashValue();
-    stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
-    return pointer->WriteParameters(stream);
-  }
-
-  template <typename T>
-  bool WriteParameters(std::ostream& stream, const LargePagePtr<T>& pointer) {
-    constexpr std::uint32_t header = T::GetHashValue();
-    stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
-    return pointer->WriteParameters(stream);
-  }
-
-  }  // namespace Detail
-
-  // Initialize the evaluation function parameters
-  void Initialize() {
-
-    Detail::Initialize(feature_transformer);
-    Detail::Initialize(network);
-  }
-
-  // Read network header
-  bool ReadHeader(std::istream& stream, std::uint32_t* hash_value, std::string* architecture)
-  {
-    std::uint32_t version, size;
-
-    version     = read_little_endian<std::uint32_t>(stream);
-    *hash_value = read_little_endian<std::uint32_t>(stream);
-    size        = read_little_endian<std::uint32_t>(stream);
-    if (!stream || version != kVersion) return false;
-    architecture->resize(size);
-    stream.read(&(*architecture)[0], size);
-    return !stream.fail();
-  }
-
-  // write the header
-  bool WriteHeader(std::ostream& stream,
-    std::uint32_t hash_value, const std::string& architecture) {
-    stream.write(reinterpret_cast<const char*>(&kVersion), sizeof(kVersion));
-    stream.write(reinterpret_cast<const char*>(&hash_value), sizeof(hash_value));
-    const std::uint32_t size = static_cast<std::uint32_t>(architecture.size());
-    stream.write(reinterpret_cast<const char*>(&size), sizeof(size));
-    stream.write(architecture.data(), size);
-    return !stream.fail();
-  }
-
-  // Read network parameters
-  bool ReadParameters(std::istream& stream) {
-
-    std::uint32_t hash_value;
-    std::string architecture;
-    if (!ReadHeader(stream, &hash_value, &architecture)) return false;
-    if (hash_value != kHashValue) return false;
-    if (!Detail::ReadParameters(stream, *feature_transformer)) return false;
-    if (!Detail::ReadParameters(stream, *network)) return false;
-    return stream && stream.peek() == std::ios::traits_type::eof();
-  }
-  // write evaluation function parameters
-  bool WriteParameters(std::ostream& stream) {
-    if (!WriteHeader(stream, kHashValue, GetArchitectureString())) return false;
-    if (!Detail::WriteParameters(stream, feature_transformer)) return false;
-    if (!Detail::WriteParameters(stream, network)) return false;
-    return !stream.fail();
-  }
-  // Evaluation function. Perform differential calculation.
-  Value evaluate(const Position& pos) {
-
-    alignas(kCacheLineSize) TransformedFeatureType
-        transformed_features[FeatureTransformer::kBufferSize];
-    feature_transformer->Transform(pos, transformed_features);
-    alignas(kCacheLineSize) char buffer[Network::kBufferSize];
-    const auto output = network->Propagate(transformed_features, buffer);
-
-    return static_cast<Value>(output[0] / FV_SCALE);
-  }
-
-  // Load eval, from a file stream or a memory stream
-  bool load_eval(std::string name, std::istream& stream) {
-
-    Initialize();
-
-    fileName = name;
-    return ReadParameters(stream);
-  }
-
-  static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
-  {
-    if (mode == "false")
-      return UseNNUEMode::False;
-    else if (mode == "true")
-      return UseNNUEMode::True;
-    else if (mode == "pure")
-      return UseNNUEMode::Pure;
-
-    return UseNNUEMode::False;
-  }
-
-  void init() {
-
-    useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
-
-    if (Options["SkipLoadingEval"] || useNNUE == UseNNUEMode::False)
-    {
-      eval_file_loaded.clear();
-      return;
+    // Get a string that represents the structure of the evaluation function
+    std::string get_architecture_string() {
+        return "Features=" + FeatureTransformer::get_structure_string() +
+            ",Network=" + Network::get_structure_string();
     }
 
-    std::string eval_file = std::string(Options["EvalFile"]);
+    UseNNUEMode useNNUE;
+    std::string eval_file_loaded = "None";
+
+    namespace Detail {
+
+        // Initialize the evaluation function parameters
+        template <typename T>
+        void initialize(AlignedPtr<T>& pointer) {
+
+            pointer.reset(reinterpret_cast<T*>(std_aligned_alloc(alignof(T), sizeof(T))));
+            std::memset(pointer.get(), 0, sizeof(T));
+        }
+
+        template <typename T>
+        void initialize(LargePagePtr<T>& pointer) {
+
+            static_assert(alignof(T) <= 4096, "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
+
+            pointer.reset(reinterpret_cast<T*>(aligned_large_pages_alloc(sizeof(T))));
+            std::memset(pointer.get(), 0, sizeof(T));
+        }
+
+        // Read evaluation function parameters
+        template <typename T>
+        bool read_parameters(std::istream& stream, T& reference) {
+
+            std::uint32_t header;
+            header = read_little_endian<std::uint32_t>(stream);
+
+            if (!stream || header != T::get_hash_value())
+                return false;
+
+            return reference.read_parameters(stream);
+        }
+
+        // write evaluation function parameters
+        template <typename T>
+        bool write_parameters(std::ostream& stream, const AlignedPtr<T>& pointer) {
+            constexpr std::uint32_t header = T::get_hash_value();
+
+            stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
+
+            return pointer->write_parameters(stream);
+        }
+
+        template <typename T>
+        bool write_parameters(std::ostream& stream, const LargePagePtr<T>& pointer) {
+            constexpr std::uint32_t header = T::get_hash_value();
+
+            stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
+
+            return pointer->write_parameters(stream);
+        }
+    }  // namespace Detail
+
+    // Initialize the evaluation function parameters
+    void initialize() {
+
+        Detail::initialize(feature_transformer);
+        Detail::initialize(network);
+    }
+
+    // Read network header
+    bool read_header(std::istream& stream, std::uint32_t* hash_value, std::string* architecture)
+    {
+        std::uint32_t version, size;
+
+        version     = read_little_endian<std::uint32_t>(stream);
+        *hash_value = read_little_endian<std::uint32_t>(stream);
+        size        = read_little_endian<std::uint32_t>(stream);
+
+        if (!stream || version != kVersion)
+            return false;
+
+        architecture->resize(size);
+        stream.read(&(*architecture)[0], size);
+
+        return !stream.fail();
+    }
+
+    // write the header
+    bool write_header(std::ostream& stream,
+        std::uint32_t hash_value, const std::string& architecture) {
+
+        stream.write(reinterpret_cast<const char*>(&kVersion), sizeof(kVersion));
+        stream.write(reinterpret_cast<const char*>(&hash_value), sizeof(hash_value));
+
+        const std::uint32_t size = static_cast<std::uint32_t>(architecture.size());
+
+        stream.write(reinterpret_cast<const char*>(&size), sizeof(size));
+        stream.write(architecture.data(), size);
+
+        return !stream.fail();
+    }
+
+    // Read network parameters
+    bool read_parameters(std::istream& stream) {
+
+        std::uint32_t hash_value;
+        std::string architecture;
+        if (!read_header(stream, &hash_value, &architecture))
+            return false;
+
+        if (hash_value != kHashValue)
+            return false;
+
+        if (!Detail::read_parameters(stream, *feature_transformer))
+            return false;
+
+        if (!Detail::read_parameters(stream, *network))
+            return false;
+
+        return stream && stream.peek() == std::ios::traits_type::eof();
+    }
+    // write evaluation function parameters
+    bool write_parameters(std::ostream& stream) {
+
+        if (!write_header(stream, kHashValue, get_architecture_string()))
+            return false;
+
+        if (!Detail::write_parameters(stream, feature_transformer))
+            return false;
+
+        if (!Detail::write_parameters(stream, network))
+            return false;
+
+        return !stream.fail();
+    }
+    // Evaluation function. Perform differential calculation.
+    Value evaluate(const Position& pos) {
+
+        alignas(kCacheLineSize) TransformedFeatureType
+            transformed_features[FeatureTransformer::kBufferSize];
+
+        feature_transformer->transform(pos, transformed_features);
+
+        alignas(kCacheLineSize) char buffer[Network::kBufferSize];
+
+        const auto output = network->propagate(transformed_features, buffer);
+
+        return static_cast<Value>(output[0] / FV_SCALE);
+    }
+
+    // Load eval, from a file stream or a memory stream
+    bool load_eval(std::string name, std::istream& stream) {
+
+        initialize();
+
+        fileName = name;
+        return read_parameters(stream);
+    }
+
+    static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
+    {
+        if (mode == "false")
+          return UseNNUEMode::False;
+        else if (mode == "true")
+          return UseNNUEMode::True;
+        else if (mode == "pure")
+          return UseNNUEMode::Pure;
+
+        return UseNNUEMode::False;
+    }
+
+    void init() {
+
+        useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
+
+        if (Options["SkipLoadingEval"] || useNNUE == UseNNUEMode::False)
+        {
+            eval_file_loaded.clear();
+            return;
+        }
+
+        std::string eval_file = std::string(Options["EvalFile"]);
 
 #if defined(DEFAULT_NNUE_DIRECTORY)
 #define stringify2(x) #x
 #define stringify(x) stringify2(x)
-    std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory , stringify(DEFAULT_NNUE_DIRECTORY) };
+        std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory , stringify(DEFAULT_NNUE_DIRECTORY) };
 #else
-    std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory };
+        std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory };
 #endif
 
-    for (std::string directory : dirs)
-        if (eval_file_loaded != eval_file)
+        for (std::string directory : dirs)
         {
-            std::ifstream stream(directory + eval_file, std::ios::binary);
-            if (load_eval(eval_file, stream))
+            if (eval_file_loaded != eval_file)
             {
-                sync_cout << "info string Loaded eval file " << directory + eval_file << sync_endl;
-                eval_file_loaded = eval_file;
-            }
-            else
-            {
-                sync_cout << "info string ERROR: failed to load eval file " << directory + eval_file << sync_endl;
-                eval_file_loaded.clear();
+                std::ifstream stream(directory + eval_file, std::ios::binary);
+                if (load_eval(eval_file, stream))
+                {
+                    sync_cout << "info string Loaded eval file " << directory + eval_file << sync_endl;
+                    eval_file_loaded = eval_file;
+                }
+                else
+                {
+                    sync_cout << "info string ERROR: failed to load eval file " << directory + eval_file << sync_endl;
+                    eval_file_loaded.clear();
+                }
             }
         }
 
 #undef stringify2
 #undef stringify
-  }
-
-  /// NNUE::verify() verifies that the last net used was loaded successfully
-  void verify_eval_file_loaded() {
-
-    std::string eval_file = std::string(Options["EvalFile"]);
-
-    if (useNNUE != UseNNUEMode::False && eval_file_loaded != eval_file)
-    {
-        UCI::OptionsMap defaults;
-        UCI::init(defaults);
-
-        std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
-        std::string msg2 = "The option is set to true, but the network file " + eval_file + " was not loaded successfully.";
-        std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
-        std::string msg4 = "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/" + std::string(defaults["EvalFile"]);
-        std::string msg5 = "The engine will be terminated now.";
-
-        sync_cout << "info string ERROR: " << msg1 << sync_endl;
-        sync_cout << "info string ERROR: " << msg2 << sync_endl;
-        sync_cout << "info string ERROR: " << msg3 << sync_endl;
-        sync_cout << "info string ERROR: " << msg4 << sync_endl;
-        sync_cout << "info string ERROR: " << msg5 << sync_endl;
-
-        std::exit(EXIT_FAILURE);
     }
 
-    if (useNNUE != UseNNUEMode::False)
-        sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
-    else
-        sync_cout << "info string classical evaluation enabled" << sync_endl;
-  }
+    /// NNUE::verify() verifies that the last net used was loaded successfully
+    void verify_eval_file_loaded() {
 
-  /// In training we override eval file so this is useful.
-  void verify_any_net_loaded() {
+        std::string eval_file = std::string(Options["EvalFile"]);
 
-    if (!Options["SkipLoadingEval"] && useNNUE != UseNNUEMode::False && eval_file_loaded.empty())
-    {
-        UCI::OptionsMap defaults;
-        UCI::init(defaults);
+        if (useNNUE != UseNNUEMode::False && eval_file_loaded != eval_file)
+        {
+            UCI::OptionsMap defaults;
+            UCI::init(defaults);
 
-        std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
-        std::string msg2 = "The option is set to true, but the network file was not loaded successfully.";
-        std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
-        std::string msg5 = "The engine will be terminated now.";
+            std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
+            std::string msg2 = "The option is set to true, but the network file " + eval_file + " was not loaded successfully.";
+            std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
+            std::string msg4 = "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/" + std::string(defaults["EvalFile"]);
+            std::string msg5 = "The engine will be terminated now.";
 
-        sync_cout << "info string ERROR: " << msg1 << sync_endl;
-        sync_cout << "info string ERROR: " << msg2 << sync_endl;
-        sync_cout << "info string ERROR: " << msg3 << sync_endl;
-        sync_cout << "info string ERROR: " << msg5 << sync_endl;
+            sync_cout << "info string ERROR: " << msg1 << sync_endl;
+            sync_cout << "info string ERROR: " << msg2 << sync_endl;
+            sync_cout << "info string ERROR: " << msg3 << sync_endl;
+            sync_cout << "info string ERROR: " << msg4 << sync_endl;
+            sync_cout << "info string ERROR: " << msg5 << sync_endl;
 
-        std::exit(EXIT_FAILURE);
+            std::exit(EXIT_FAILURE);
+        }
+
+        if (useNNUE != UseNNUEMode::False)
+            sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
+        else
+            sync_cout << "info string classical evaluation enabled" << sync_endl;
     }
 
-    if (useNNUE != UseNNUEMode::False)
-        sync_cout << "info string NNUE evaluation using " << eval_file_loaded << " enabled" << sync_endl;
-    else
-        sync_cout << "info string classical evaluation enabled" << sync_endl;
-  }
+    /// In training we override eval file so this is useful.
+    void verify_any_net_loaded() {
+
+        if (!Options["SkipLoadingEval"] && useNNUE != UseNNUEMode::False && eval_file_loaded.empty())
+        {
+            UCI::OptionsMap defaults;
+            UCI::init(defaults);
+
+            std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
+            std::string msg2 = "The option is set to true, but the network file was not loaded successfully.";
+            std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
+            std::string msg5 = "The engine will be terminated now.";
+
+            sync_cout << "info string ERROR: " << msg1 << sync_endl;
+            sync_cout << "info string ERROR: " << msg2 << sync_endl;
+            sync_cout << "info string ERROR: " << msg3 << sync_endl;
+            sync_cout << "info string ERROR: " << msg5 << sync_endl;
+
+            std::exit(EXIT_FAILURE);
+        }
+
+        if (useNNUE != UseNNUEMode::False)
+            sync_cout << "info string NNUE evaluation using " << eval_file_loaded << " enabled" << sync_endl;
+        else
+            sync_cout << "info string classical evaluation enabled" << sync_endl;
+    }
 
 } // namespace Eval::NNUE
diff --git a/src/nnue/evaluate_nnue.h b/src/nnue/evaluate_nnue.h
index 264d24fe..d0f61644 100644
--- a/src/nnue/evaluate_nnue.h
+++ b/src/nnue/evaluate_nnue.h
@@ -37,7 +37,7 @@ namespace Eval::NNUE {
 
     // Hash value of evaluation function structure
     constexpr std::uint32_t kHashValue =
-        FeatureTransformer::GetHashValue() ^ Network::GetHashValue();
+        FeatureTransformer::get_hash_value() ^ Network::get_hash_value();
 
     // Deleter for automating release of memory area
     template <typename T>
@@ -79,21 +79,21 @@ namespace Eval::NNUE {
     extern std::string eval_file_loaded;
 
     // Get a string that represents the structure of the evaluation function
-    std::string GetArchitectureString();
+    std::string get_architecture_string();
 
     // read the header
-    bool ReadHeader(std::istream& stream,
+    bool read_header(std::istream& stream,
         std::uint32_t* hash_value, std::string* architecture);
 
     // write the header
-    bool WriteHeader(std::ostream& stream,
+    bool write_header(std::ostream& stream,
         std::uint32_t hash_value, const std::string& architecture);
 
     // read evaluation function parameters
-    bool ReadParameters(std::istream& stream);
+    bool read_parameters(std::istream& stream);
 
     // write evaluation function parameters
-    bool WriteParameters(std::ostream& stream);
+    bool write_parameters(std::ostream& stream);
 
     Value evaluate(const Position& pos);
     bool load_eval(std::string name, std::istream& stream);
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 92ecd8d2..e0236781 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -44,9 +44,9 @@ namespace Eval::NNUE {
         std::shared_ptr<Trainer<Network>> trainer;
 
         // Tell the learner options such as hyperparameters
-        void SendMessages(std::vector<Message> messages) {
+        void send_messages(std::vector<Message> messages) {
             for (auto& message : messages) {
-                trainer->SendMessage(&message);
+                trainer->send_message(&message);
                 assert(message.num_receivers > 0);
             }
         }
@@ -54,31 +54,31 @@ namespace Eval::NNUE {
     }  // namespace
 
     // Initialize learning
-    void InitializeTraining(const std::string& seed) {
+    void initialize_training(const std::string& seed) {
         std::cout << "Initializing NN training for "
-                  << GetArchitectureString() << std::endl;
+                  << get_architecture_string() << std::endl;
 
         assert(feature_transformer);
         assert(network);
-        trainer = Trainer<Network>::Create(network.get(), feature_transformer.get());
+        trainer = Trainer<Network>::create(network.get(), feature_transformer.get());
         rng.seed(PRNG(seed).rand<uint64_t>());
 
         if (Options["SkipLoadingEval"]) {
-            trainer->Initialize(rng);
+            trainer->initialize(rng);
         }
     }
 
     // set the number of samples in the mini-batch
-    void SetBatchSize(uint64_t size) {
+    void set_batch_size(uint64_t size) {
         assert(size > 0);
         batch_size = size;
     }
 
     // Set options such as hyperparameters
-    void SetOptions(const std::string& options) {
+    void set_options(const std::string& options) {
         std::vector<Message> messages;
-        for (const auto& option : Split(options, ',')) {
-          const auto fields = Split(option, '=');
+        for (const auto& option : Algo::split(options, ',')) {
+          const auto fields = Algo::split(option, '=');
           assert(fields.size() == 1 || fields.size() == 2);
 
           if (fields.size() == 1) {
@@ -88,30 +88,30 @@ namespace Eval::NNUE {
           }
         }
 
-        SendMessages(std::move(messages));
+        send_messages(std::move(messages));
     }
 
     // Reread the evaluation function parameters for learning from the file
-    void RestoreParameters(const std::string& dir_name) {
-        const std::string file_name = Path::Combine(dir_name, NNUE::savedfileName);
+    void restore_parameters(const std::string& dir_name) {
+        const std::string file_name = Path::combine(dir_name, NNUE::savedfileName);
         std::ifstream stream(file_name, std::ios::binary);
 #ifndef NDEBUG
         bool result =
 #endif
-        ReadParameters(stream);
+        read_parameters(stream);
 #ifndef NDEBUG
         assert(result);
 #endif
 
-        SendMessages({{"reset"}});
+        send_messages({{"reset"}});
     }
 
-    void FinalizeNet() {
-        SendMessages({{"clear_unobserved_feature_weights"}});
+    void finalize_net() {
+        send_messages({{"clear_unobserved_feature_weights"}});
     }
 
     // Add 1 sample of learning data
-    void AddExample(Position& pos, Color rootColor,
+    void add_example(Position& pos, Color rootColor,
                     const Learner::PackedSfenValue& psv, double weight) {
 
         Example example;
@@ -126,7 +126,7 @@ namespace Eval::NNUE {
 
         Features::IndexList active_indices[2];
         for (const auto trigger : kRefreshTriggers) {
-            RawFeatures::AppendActiveIndices(pos, trigger, active_indices);
+            RawFeatures::append_active_indices(pos, trigger, active_indices);
         }
 
         if (pos.side_to_move() != WHITE) {
@@ -136,9 +136,9 @@ namespace Eval::NNUE {
         for (const auto color : Colors) {
             std::vector<TrainingFeature> training_features;
             for (const auto base_index : active_indices[color]) {
-                static_assert(Features::Factorizer<RawFeatures>::GetDimensions() <
+                static_assert(Features::Factorizer<RawFeatures>::get_dimensions() <
                               (1 << TrainingFeature::kIndexBits), "");
-                Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
+                Features::Factorizer<RawFeatures>::append_training_features(
                     base_index, &training_features);
             }
 
@@ -147,7 +147,7 @@ namespace Eval::NNUE {
             auto& unique_features = example.training_features[color];
             for (const auto& feature : training_features) {
                 if (!unique_features.empty() &&
-                    feature.GetIndex() == unique_features.back().GetIndex()) {
+                    feature.get_index() == unique_features.back().get_index()) {
 
                     unique_features.back() += feature;
                 } else {
@@ -161,7 +161,7 @@ namespace Eval::NNUE {
     }
 
     // update the evaluation function parameters
-    void UpdateParameters() {
+    void update_parameters() {
         assert(batch_size > 0);
 
         const auto learning_rate = static_cast<LearnFloatType>(
@@ -173,30 +173,30 @@ namespace Eval::NNUE {
             std::vector<Example> batch(examples.end() - batch_size, examples.end());
             examples.resize(examples.size() - batch_size);
 
-            const auto network_output = trainer->Propagate(batch);
+            const auto network_output = trainer->propagate(batch);
 
             std::vector<LearnFloatType> gradients(batch.size());
             for (std::size_t b = 0; b < batch.size(); ++b) {
-                const auto shallow = static_cast<Value>(Round<std::int32_t>(
+                const auto shallow = static_cast<Value>(round<std::int32_t>(
                     batch[b].sign * network_output[b] * kPonanzaConstant));
                 const auto& psv = batch[b].psv;
                 const double gradient = batch[b].sign * Learner::calc_grad(shallow, psv);
                 gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
             }
 
-            trainer->Backpropagate(gradients.data(), learning_rate);
+            trainer->backpropagate(gradients.data(), learning_rate);
         }
-        SendMessages({{"quantize_parameters"}});
+        send_messages({{"quantize_parameters"}});
     }
 
     // Check if there are any problems with learning
-    void CheckHealth() {
-        SendMessages({{"check_health"}});
+    void check_health() {
+        send_messages({{"check_health"}});
     }
 
     // save merit function parameters to a file
     void save_eval(std::string dir_name) {
-        auto eval_dir = Path::Combine(Options["EvalSaveDir"], dir_name);
+        auto eval_dir = Path::combine(Options["EvalSaveDir"], dir_name);
         std::cout << "save_eval() start. folder = " << eval_dir << std::endl;
 
         // mkdir() will fail if this folder already exists, but
@@ -204,12 +204,12 @@ namespace Eval::NNUE {
         // Also, assume that the folders up to EvalSaveDir have been dug.
         std::filesystem::create_directories(eval_dir);
 
-        const std::string file_name = Path::Combine(eval_dir, NNUE::savedfileName);
+        const std::string file_name = Path::combine(eval_dir, NNUE::savedfileName);
         std::ofstream stream(file_name, std::ios::binary);
 #ifndef NDEBUG
         bool result =
 #endif
-        WriteParameters(stream);
+        write_parameters(stream);
 #ifndef NDEBUG
         assert(result);
 #endif
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 525b286a..431fb02e 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -7,28 +7,31 @@
 namespace Eval::NNUE {
 
     // Initialize learning
-    void InitializeTraining(const std::string& seed);
+    void initialize_training(const std::string& seed);
 
     // set the number of samples in the mini-batch
-    void SetBatchSize(uint64_t size);
+    void set_batch_size(uint64_t size);
 
     // Set options such as hyperparameters
-    void SetOptions(const std::string& options);
+    void set_options(const std::string& options);
 
     // Reread the evaluation function parameters for learning from the file
-    void RestoreParameters(const std::string& dir_name);
+    void restore_parameters(const std::string& dir_name);
 
     // Add 1 sample of learning data
-    void AddExample(Position& pos, Color rootColor,
-    	 const Learner::PackedSfenValue& psv, double weight);
+    void add_example(
+        Position& pos,
+        Color rootColor,
+    	const Learner::PackedSfenValue& psv,
+        double weight);
 
     // update the evaluation function parameters
-    void UpdateParameters();
+    void update_parameters();
 
     // Check if there are any problems with learning
-    void CheckHealth();
+    void check_health();
 
-    void FinalizeNet();
+    void finalize_net();
 
     void save_eval(std::string suffix);
 }  // namespace Eval::NNUE
diff --git a/src/nnue/features/castling_right.cpp b/src/nnue/features/castling_right.cpp
index eb8a36a1..cbac0851 100644
--- a/src/nnue/features/castling_right.cpp
+++ b/src/nnue/features/castling_right.cpp
@@ -5,8 +5,11 @@
 namespace Eval::NNUE::Features {
 
     // Get a list of indices with a value of 1 among the features
-    void CastlingRight::AppendActiveIndices(
-        const Position& pos, Color perspective, IndexList* active) {
+    void CastlingRight::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
+
         // do nothing if array size is small to avoid compiler warning
         if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
 
@@ -29,9 +32,11 @@ namespace Eval::NNUE::Features {
     }
 
     // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-    void CastlingRight::AppendChangedIndices(
-        const Position& pos, Color perspective,
-        IndexList* removed, IndexList* /* added */) {
+    void CastlingRight::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* /* added */) {
 
         int previous_castling_rights = pos.state()->previous->castlingRights;
         int current_castling_rights = pos.state()->castlingRights;
diff --git a/src/nnue/features/castling_right.h b/src/nnue/features/castling_right.h
index 3e35e432..cada24b6 100644
--- a/src/nnue/features/castling_right.h
+++ b/src/nnue/features/castling_right.h
@@ -26,12 +26,17 @@ namespace Eval::NNUE::Features {
         static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
         // Get a list of indices with a value of 1 among the features
-        static void AppendActiveIndices(const Position& pos, Color perspective,
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
             IndexList* active);
 
         // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-        static void AppendChangedIndices(const Position& pos, Color perspective,
-            IndexList* removed, IndexList* added);
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
     };
 
 }  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/enpassant.cpp b/src/nnue/features/enpassant.cpp
index 7aa8988b..06ba2d49 100644
--- a/src/nnue/features/enpassant.cpp
+++ b/src/nnue/features/enpassant.cpp
@@ -5,8 +5,10 @@
 namespace Eval::NNUE::Features {
 
     // Get a list of indices with a value of 1 among the features
-    void EnPassant::AppendActiveIndices(
-        const Position& pos, Color /* perspective */, IndexList* active) {
+    void EnPassant::append_active_indices(
+        const Position& pos,
+        Color /* perspective */,
+        IndexList* active) {
 
         // do nothing if array size is small to avoid compiler warning
         if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions)
@@ -21,9 +23,11 @@ namespace Eval::NNUE::Features {
     }
 
     // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-    void EnPassant::AppendChangedIndices(
-        const Position& pos, Color /* perspective */,
-        IndexList* removed, IndexList* added) {
+    void EnPassant::append_changed_indices(
+        const Position& pos,
+        Color /* perspective */,
+        IndexList* removed,
+        IndexList* added) {
 
         auto previous_epSquare = pos.state()->previous->epSquare;
         auto epSquare = pos.state()->epSquare;
diff --git a/src/nnue/features/enpassant.h b/src/nnue/features/enpassant.h
index 65819a96..6ccb6046 100644
--- a/src/nnue/features/enpassant.h
+++ b/src/nnue/features/enpassant.h
@@ -22,12 +22,17 @@ namespace Eval::NNUE::Features {
         static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
         // Get a list of indices with a value of 1 among the features
-        static void AppendActiveIndices(const Position& pos, Color perspective,
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
             IndexList* active);
 
         // Get a list of indices whose values have changed from the previous one in the feature quantity
-        static void AppendChangedIndices(const Position& pos, Color perspective,
-            IndexList* removed, IndexList* added);
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
     };
 
 }  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/feature_set.h b/src/nnue/features/feature_set.h
index 5b243424..32ef24ef 100644
--- a/src/nnue/features/feature_set.h
+++ b/src/nnue/features/feature_set.h
@@ -33,8 +33,8 @@ namespace Eval::NNUE::Features {
 
     template <typename T, T First, T... Remaining>
     struct CompileTimeList<T, First, Remaining...> {
-        static constexpr bool Contains(T value) {
-            return value == First || CompileTimeList<T, Remaining...>::Contains(value);
+        static constexpr bool contains(T value) {
+            return value == First || CompileTimeList<T, Remaining...>::contains(value);
         }
 
         static constexpr std::array<T, sizeof...(Remaining) + 1>
@@ -47,7 +47,7 @@ namespace Eval::NNUE::Features {
 
     template <typename T>
     struct CompileTimeList<T> {
-        static constexpr bool Contains(T /*value*/) {
+        static constexpr bool contains(T /*value*/) {
             return false;
         }
         static constexpr std::array<T, 0> kValues = { {} };
@@ -70,7 +70,7 @@ namespace Eval::NNUE::Features {
     struct InsertToSet<T, CompileTimeList<T, First, Remaining...>, AnotherValue> {
         using Result =
             std::conditional_t<
-                CompileTimeList<T, First, Remaining...>::Contains(AnotherValue),
+                CompileTimeList<T, First, Remaining...>::contains(AnotherValue),
                 CompileTimeList<T, First, Remaining...>,
                 std::conditional_t<
                     (AnotherValue < First),
@@ -95,20 +95,23 @@ namespace Eval::NNUE::Features {
        public:
         // Get a list of indices for active features
         template <typename IndexListType>
-        static void AppendActiveIndices(
+        static void append_active_indices(
             const Position& pos, TriggerEvent trigger, IndexListType active[2]) {
 
             for (Color perspective : { WHITE, BLACK }) {
-                Derived::CollectActiveIndices(
+                Derived::collect_active_indices(
                     pos, trigger, perspective, &active[perspective]);
             }
         }
 
         // Get a list of indices for recently changed features
         template <typename PositionType, typename IndexListType>
-        static void AppendChangedIndices(
-            const PositionType& pos, TriggerEvent trigger,
-            IndexListType removed[2], IndexListType added[2], bool reset[2]) {
+        static void append_changed_indices(
+            const PositionType& pos,
+            TriggerEvent trigger,
+            IndexListType removed[2],
+            IndexListType added[2],
+            bool reset[2]) {
 
             const auto& dp = pos.state()->dirtyPiece;
 
@@ -137,10 +140,10 @@ namespace Eval::NNUE::Features {
                 }
 
                 if (reset[perspective]) {
-                    Derived::CollectActiveIndices(
+                    Derived::collect_active_indices(
                         pos, trigger, perspective, &added[perspective]);
                 } else {
-                    Derived::CollectChangedIndices(
+                    Derived::collect_changed_indices(
                         pos, trigger, perspective,
                         &removed[perspective], &added[perspective]);
                 }
@@ -180,20 +183,23 @@ namespace Eval::NNUE::Features {
         static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
 
         // Get the feature quantity name
-        static std::string GetName() {
-            return std::string(Head::kName) + "+" + Tail::GetName();
+        static std::string get_name() {
+            return std::string(Head::kName) + "+" + Tail::get_name();
         }
 
     private:
         // Get a list of indices with a value of 1 among the features
         template <typename IndexListType>
-        static void CollectActiveIndices(
-            const Position& pos, const TriggerEvent trigger, const Color perspective,
+        static void collect_active_indices(
+            const Position& pos,
+            const TriggerEvent trigger,
+            const Color perspective,
             IndexListType* const active) {
-            Tail::CollectActiveIndices(pos, trigger, perspective, active);
+
+            Tail::collect_active_indices(pos, trigger, perspective, active);
             if (Head::kRefreshTrigger == trigger) {
                 const auto start = active->size();
-                Head::AppendActiveIndices(pos, perspective, active);
+                Head::append_active_indices(pos, perspective, active);
 
                 for (auto i = start; i < active->size(); ++i) {
                     (*active)[i] += Tail::kDimensions;
@@ -203,14 +209,18 @@ namespace Eval::NNUE::Features {
 
         // Get a list of indices whose values have changed from the previous one in the feature quantity
         template <typename IndexListType>
-        static void CollectChangedIndices(
-            const Position& pos, const TriggerEvent trigger, const Color perspective,
-            IndexListType* const removed, IndexListType* const added) {
-            Tail::CollectChangedIndices(pos, trigger, perspective, removed, added);
+        static void collect_changed_indices(
+            const Position& pos,
+            const TriggerEvent trigger,
+            const Color perspective,
+            IndexListType* const removed,
+            IndexListType* const added) {
+
+            Tail::collect_changed_indices(pos, trigger, perspective, removed, added);
             if (Head::kRefreshTrigger == trigger) {
                 const auto start_removed = removed->size();
                 const auto start_added = added->size();
-                Head::AppendChangedIndices(pos, perspective, removed, added);
+                Head::append_changed_indices(pos, perspective, removed, added);
 
                 for (auto i = start_removed; i < removed->size(); ++i) {
                     (*removed)[i] += Tail::kDimensions;
@@ -251,28 +261,33 @@ namespace Eval::NNUE::Features {
         static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
 
         // Get the feature quantity name
-        static std::string GetName() {
+        static std::string get_name() {
             return FeatureType::kName;
         }
 
     private:
         // Get a list of indices for active features
-        static void CollectActiveIndices(
-            const Position& pos, const TriggerEvent trigger, const Color perspective,
+        static void collect_active_indices(
+            const Position& pos,
+            const TriggerEvent trigger,
+            const Color perspective,
             IndexList* const active) {
 
             if (FeatureType::kRefreshTrigger == trigger) {
-              FeatureType::AppendActiveIndices(pos, perspective, active);
+              FeatureType::append_active_indices(pos, perspective, active);
             }
         }
 
         // Get a list of indices for recently changed features
-        static void CollectChangedIndices(
-            const Position& pos, const TriggerEvent trigger, const Color perspective,
-            IndexList* const removed, IndexList* const added) {
+        static void collect_changed_indices(
+            const Position& pos,
+            const TriggerEvent trigger,
+            const Color perspective,
+            IndexList* const removed,
+            IndexList* const added) {
 
             if (FeatureType::kRefreshTrigger == trigger) {
-              FeatureType::AppendChangedIndices(pos, perspective, removed, added);
+              FeatureType::append_changed_indices(pos, perspective, removed, added);
             }
         }
 
diff --git a/src/nnue/features/half_kp.cpp b/src/nnue/features/half_kp.cpp
index 17b50472..18e82004 100644
--- a/src/nnue/features/half_kp.cpp
+++ b/src/nnue/features/half_kp.cpp
@@ -30,30 +30,41 @@ namespace Eval::NNUE::Features {
 
     // Find the index of the feature quantity from the king position and PieceSquare
     template <Side AssociatedKing>
-    inline IndexType HalfKP<AssociatedKing>::MakeIndex(
-        Color perspective, Square s, Piece pc, Square ksq) {
+    inline IndexType HalfKP<AssociatedKing>::make_index(
+        Color perspective,
+        Square s,
+        Piece pc,
+        Square ksq) {
 
         return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective] + PS_END * ksq);
     }
 
     // Get a list of indices for active features
     template <Side AssociatedKing>
-    void HalfKP<AssociatedKing>::AppendActiveIndices(
-        const Position& pos, Color perspective, IndexList* active) {
+    void HalfKP<AssociatedKing>::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
+
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));
 
-        Square ksq = orient(perspective, pos.square<KING>(AssociatedKing == Side::kFriend ? perspective : ~perspective));
         Bitboard bb = pos.pieces() & ~pos.pieces(KING);
         while (bb) {
             Square s = pop_lsb(&bb);
-            active->push_back(MakeIndex(perspective, s, pos.piece_on(s), ksq));
+            active->push_back(make_index(perspective, s, pos.piece_on(s), ksq));
         }
     }
 
     // Get a list of indices for recently changed features
     template <Side AssociatedKing>
-    void HalfKP<AssociatedKing>::AppendChangedIndices(
-        const Position& pos, Color perspective,
-        IndexList* removed, IndexList* added) {
+    void HalfKP<AssociatedKing>::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
 
         Square ksq = orient(
             perspective,
@@ -68,10 +79,10 @@ namespace Eval::NNUE::Features {
                 continue;
 
             if (dp.from[i] != SQ_NONE)
-                removed->push_back(MakeIndex(perspective, dp.from[i], pc, ksq));
+                removed->push_back(make_index(perspective, dp.from[i], pc, ksq));
 
             if (dp.to[i] != SQ_NONE)
-                added->push_back(MakeIndex(perspective, dp.to[i], pc, ksq));
+                added->push_back(make_index(perspective, dp.to[i], pc, ksq));
         }
     }
 
diff --git a/src/nnue/features/half_kp.h b/src/nnue/features/half_kp.h
index 834f800e..4a4329e8 100644
--- a/src/nnue/features/half_kp.h
+++ b/src/nnue/features/half_kp.h
@@ -53,16 +53,21 @@ namespace Eval::NNUE::Features {
             TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
 
         // Get a list of indices for active features
-        static void AppendActiveIndices(const Position& pos, Color perspective,
-                                        IndexList* active);
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
 
         // Get a list of indices for recently changed features
-        static void AppendChangedIndices(const Position& pos, Color perspective,
-                                         IndexList* removed, IndexList* added);
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
 
     private:
         // Index of a feature for a given king position and another piece on some square
-        static IndexType MakeIndex(Color perspective, Square s, Piece pc, Square sq_k);
+        static IndexType make_index(Color perspective, Square s, Piece pc, Square sq_k);
     };
 
 }  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/half_relative_kp.cpp b/src/nnue/features/half_relative_kp.cpp
index 5ab22890..240e20c0 100644
--- a/src/nnue/features/half_relative_kp.cpp
+++ b/src/nnue/features/half_relative_kp.cpp
@@ -11,16 +11,21 @@ namespace Eval::NNUE::Features {
 
     // Find the index of the feature quantity from the ball position and PieceSquare
     template <Side AssociatedKing>
-    inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
-        Color perspective, Square s, Piece pc, Square sq_k) {
+    inline IndexType HalfRelativeKP<AssociatedKing>::make_index(
+        Color perspective,
+        Square s,
+        Piece pc,
+        Square sq_k) {
+
         const IndexType p = IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
-        return MakeIndex(sq_k, p);
+        return make_index(sq_k, p);
     }
 
     // Find the index of the feature quantity from the ball position and PieceSquare
     template <Side AssociatedKing>
-    inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
-        Square sq_k, IndexType p) {
+    inline IndexType HalfRelativeKP<AssociatedKing>::make_index(
+        Square sq_k,
+        IndexType p) {
 
         constexpr IndexType W = kBoardWidth;
         constexpr IndexType H = kBoardHeight;
@@ -33,8 +38,10 @@ namespace Eval::NNUE::Features {
 
     // Get a list of indices with a value of 1 among the features
     template <Side AssociatedKing>
-    void HalfRelativeKP<AssociatedKing>::AppendActiveIndices(
-        const Position& pos, Color perspective, IndexList* active) {
+    void HalfRelativeKP<AssociatedKing>::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
 
         Square ksq = orient(
             perspective,
@@ -44,15 +51,17 @@ namespace Eval::NNUE::Features {
         Bitboard bb = pos.pieces() & ~pos.pieces(KING);
         while (bb) {
             Square s = pop_lsb(&bb);
-            active->push_back(MakeIndex(perspective, s, pos.piece_on(s), ksq));
+            active->push_back(make_index(perspective, s, pos.piece_on(s), ksq));
         }
     }
 
     // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
     template <Side AssociatedKing>
-    void HalfRelativeKP<AssociatedKing>::AppendChangedIndices(
-        const Position& pos, Color perspective,
-        IndexList* removed, IndexList* added) {
+    void HalfRelativeKP<AssociatedKing>::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
 
         Square ksq = orient(
             perspective,
@@ -67,10 +76,10 @@ namespace Eval::NNUE::Features {
                 continue;
 
             if (dp.from[i] != SQ_NONE)
-                removed->push_back(MakeIndex(perspective, dp.from[i], pc, ksq));
+                removed->push_back(make_index(perspective, dp.from[i], pc, ksq));
 
             if (dp.to[i] != SQ_NONE)
-                added->push_back(MakeIndex(perspective, dp.to[i], pc, ksq));
+                added->push_back(make_index(perspective, dp.to[i], pc, ksq));
         }
     }
 
diff --git a/src/nnue/features/half_relative_kp.h b/src/nnue/features/half_relative_kp.h
index cc1e136f..590a01a3 100644
--- a/src/nnue/features/half_relative_kp.h
+++ b/src/nnue/features/half_relative_kp.h
@@ -42,18 +42,23 @@ namespace Eval::NNUE::Features {
             TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
 
         // Get a list of indices with a value of 1 among the features
-        static void AppendActiveIndices(const Position& pos, Color perspective,
-                                        IndexList* active);
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
 
         // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-        static void AppendChangedIndices(const Position& pos, Color perspective,
-                                         IndexList* removed, IndexList* added);
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
 
         // Find the index of the feature quantity from the ball position and PieceSquare
-        static IndexType MakeIndex(Square s, IndexType p);
+        static IndexType make_index(Square s, IndexType p);
 
         // Find the index of the feature quantity from the ball position and PieceSquare
-        static IndexType MakeIndex(Color perspective, Square s, Piece pc, Square sq_k);
+        static IndexType make_index(Color perspective, Square s, Piece pc, Square sq_k);
     };
 
 }  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/k.cpp b/src/nnue/features/k.cpp
index 8911abb7..f01a6ce0 100644
--- a/src/nnue/features/k.cpp
+++ b/src/nnue/features/k.cpp
@@ -10,29 +10,33 @@ namespace Eval::NNUE::Features {
     }
 
     // Index of a feature for a given king position.
-    IndexType K::MakeIndex(Color perspective, Square s, Color king_color) {
+    IndexType K::make_index(Color perspective, Square s, Color king_color) {
         return IndexType(orient(perspective, s) + bool(perspective ^ king_color) * 64);
     }
 
     // Get a list of indices with a value of 1 among the features
-    void K::AppendActiveIndices(
-        const Position& pos, Color perspective, IndexList* active) {
+    void K::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
 
         for (auto color : Colors) {
-          active->push_back(MakeIndex(perspective, pos.square<KING>(color), color));
+          active->push_back(make_index(perspective, pos.square<KING>(color), color));
         }
     }
 
     // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-    void K::AppendChangedIndices(
-        const Position& pos, Color perspective,
-        IndexList* removed, IndexList* added) {
+    void K::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
 
         const auto& dp = pos.state()->dirtyPiece;
         if (type_of(dp.piece[0]) == KING)
         {
-            removed->push_back(MakeIndex(perspective, dp.from[0], color_of(dp.piece[0])));
-            added->push_back(MakeIndex(perspective, dp.to[0], color_of(dp.piece[0])));
+            removed->push_back(make_index(perspective, dp.from[0], color_of(dp.piece[0])));
+            added->push_back(make_index(perspective, dp.to[0], color_of(dp.piece[0])));
         }
     }
 
diff --git a/src/nnue/features/k.h b/src/nnue/features/k.h
index c9726ab2..928d77de 100644
--- a/src/nnue/features/k.h
+++ b/src/nnue/features/k.h
@@ -8,36 +8,41 @@
 //Definition of input feature quantity K of NNUE evaluation function
 namespace Eval::NNUE::Features {
 
-  // Feature K: Ball position
-  class K {
-  public:
-      // feature quantity name
-      static constexpr const char* kName = "K";
+    // Feature K: Ball position
+    class K {
+    public:
+        // feature quantity name
+        static constexpr const char* kName = "K";
 
-      // Hash value embedded in the evaluation function file
-      static constexpr std::uint32_t kHashValue = 0xD3CEE169u;
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue = 0xD3CEE169u;
 
-      // number of feature dimensions
-      static constexpr IndexType kDimensions = SQUARE_NB * 2;
+        // number of feature dimensions
+        static constexpr IndexType kDimensions = SQUARE_NB * 2;
 
-      // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-      static constexpr IndexType kMaxActiveDimensions = 2;
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions = 2;
 
-      // Timing of full calculation instead of difference calculation
-      static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+        // Timing of full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
-      // Get a list of indices with a value of 1 among the features
-      static void AppendActiveIndices(const Position& pos, Color perspective,
-                                      IndexList* active);
+        // Get a list of indices with a value of 1 among the features
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
 
-      // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-      static void AppendChangedIndices(const Position& pos, Color perspective,
-                                       IndexList* removed, IndexList* added);
+        // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
 
-  private:
-      // Index of a feature for a given king position.
-      static IndexType MakeIndex(Color perspective, Square s, Color king_color);
-  };
+    private:
+        // Index of a feature for a given king position.
+        static IndexType make_index(Color perspective, Square s, Color king_color);
+    };
 
 }  // namespace Eval::NNUE::Features
 
diff --git a/src/nnue/features/p.cpp b/src/nnue/features/p.cpp
index b4757284..1621e8b2 100644
--- a/src/nnue/features/p.cpp
+++ b/src/nnue/features/p.cpp
@@ -10,26 +10,30 @@ namespace Eval::NNUE::Features {
     }
 
     // Find the index of the feature quantity from the king position and PieceSquare
-    inline IndexType P::MakeIndex(
+    inline IndexType P::make_index(
         Color perspective, Square s, Piece pc) {
         return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
     }
 
     // Get a list of indices with a value of 1 among the features
-    void P::AppendActiveIndices(
-        const Position& pos, Color perspective, IndexList* active) {
+    void P::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
 
         Bitboard bb = pos.pieces() & ~pos.pieces(KING);
         while (bb) {
             Square s = pop_lsb(&bb);
-            active->push_back(MakeIndex(perspective, s, pos.piece_on(s)));
+            active->push_back(make_index(perspective, s, pos.piece_on(s)));
         }
     }
 
     // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-    void P::AppendChangedIndices(
-        const Position& pos, Color perspective,
-        IndexList* removed, IndexList* added) {
+    void P::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
 
         const auto& dp = pos.state()->dirtyPiece;
         for (int i = 0; i < dp.dirty_num; ++i) {
@@ -39,10 +43,10 @@ namespace Eval::NNUE::Features {
               continue;
 
             if (dp.from[i] != SQ_NONE)
-              removed->push_back(MakeIndex(perspective, dp.from[i], pc));
+              removed->push_back(make_index(perspective, dp.from[i], pc));
 
             if (dp.to[i] != SQ_NONE)
-              added->push_back(MakeIndex(perspective, dp.to[i], pc));
+              added->push_back(make_index(perspective, dp.to[i], pc));
         }
     }
 
diff --git a/src/nnue/features/p.h b/src/nnue/features/p.h
index 6a8a5392..d461086b 100644
--- a/src/nnue/features/p.h
+++ b/src/nnue/features/p.h
@@ -8,36 +8,41 @@
 //Definition of input feature P of NNUE evaluation function
 namespace Eval::NNUE::Features {
 
-  // Feature P: PieceSquare of pieces other than balls
-  class P {
-  public:
-      // feature quantity name
-      static constexpr const char* kName = "P";
+    // Feature P: PieceSquare of pieces other than balls
+    class P {
+    public:
+        // feature quantity name
+        static constexpr const char* kName = "P";
 
-      // Hash value embedded in the evaluation function file
-      static constexpr std::uint32_t kHashValue = 0x764CFB4Bu;
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue = 0x764CFB4Bu;
 
-      // number of feature dimensions
-      static constexpr IndexType kDimensions = PS_END;
+        // number of feature dimensions
+        static constexpr IndexType kDimensions = PS_END;
 
-      // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-      static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
 
-      // Timing of full calculation instead of difference calculation
-      static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+        // Timing of full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
-      // Get a list of indices with a value of 1 among the features
-      static void AppendActiveIndices(const Position& pos, Color perspective,
-                                      IndexList* active);
+        // Get a list of indices with a value of 1 among the features
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
 
-      // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-      static void AppendChangedIndices(const Position& pos, Color perspective,
-                                       IndexList* removed, IndexList* added);
+        // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
 
-  private:
-      // Index of a feature for a given piece on some square
-      static IndexType MakeIndex(Color perspective, Square s, Piece pc);
-  };
+    private:
+        // Index of a feature for a given piece on some square
+        static IndexType make_index(Color perspective, Square s, Piece pc);
+    };
 
 }  // namespace Eval::NNUE::Features
 
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index cc5e5eef..6efaecbc 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -47,36 +47,36 @@ namespace Eval::NNUE::Layers {
         static constexpr IndexType kOutputDimensions = OutputDimensions;
 
         static constexpr IndexType kPaddedInputDimensions =
-            CeilToMultiple<IndexType>(kInputDimensions, kMaxSimdWidth);
+            ceil_to_multiple<IndexType>(kInputDimensions, kMaxSimdWidth);
 
         // Size of forward propagation buffer used in this layer
         static constexpr std::size_t kSelfBufferSize =
-            CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+            ceil_to_multiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
 
         // Size of the forward propagation buffer used from the input layer to this layer
         static constexpr std::size_t kBufferSize =
             PreviousLayer::kBufferSize + kSelfBufferSize;
 
         // Hash value embedded in the evaluation file
-        static constexpr std::uint32_t GetHashValue() {
+        static constexpr std::uint32_t get_hash_value() {
             std::uint32_t hash_value = 0xCC03DAE4u;
             hash_value += kOutputDimensions;
-            hash_value ^= PreviousLayer::GetHashValue() >> 1;
-            hash_value ^= PreviousLayer::GetHashValue() << 31;
+            hash_value ^= PreviousLayer::get_hash_value() >> 1;
+            hash_value ^= PreviousLayer::get_hash_value() << 31;
             return hash_value;
         }
 
         // A string that represents the structure from the input layer to this layer
-        static std::string GetStructureString() {
+        static std::string get_structure_string() {
             return "AffineTransform[" +
                 std::to_string(kOutputDimensions) + "<-" +
                 std::to_string(kInputDimensions) + "](" +
-                PreviousLayer::GetStructureString() + ")";
+                PreviousLayer::get_structure_string() + ")";
         }
 
        // Read network parameters
-        bool ReadParameters(std::istream& stream) {
-            if (!previous_layer_.ReadParameters(stream))
+        bool read_parameters(std::istream& stream) {
+            if (!previous_layer_.read_parameters(stream))
                 return false;
 
             for (std::size_t i = 0; i < kOutputDimensions; ++i)
@@ -89,8 +89,8 @@ namespace Eval::NNUE::Layers {
         }
 
         // write parameters
-        bool WriteParameters(std::ostream& stream) const {
-            if (!previous_layer_.WriteParameters(stream))
+        bool write_parameters(std::ostream& stream) const {
+            if (!previous_layer_.write_parameters(stream))
                 return false;
 
             stream.write(reinterpret_cast<const char*>(biases_),
@@ -104,10 +104,10 @@ namespace Eval::NNUE::Layers {
         }
 
         // Forward propagation
-        const OutputType* Propagate(
+        const OutputType* propagate(
             const TransformedFeatureType* transformed_features, char* buffer) const {
 
-            const auto input = previous_layer_.Propagate(
+            const auto input = previous_layer_.propagate(
                 transformed_features, buffer + kSelfBufferSize);
             const auto output = reinterpret_cast<OutputType*>(buffer);
 
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index 0846f3df..889effa7 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -48,41 +48,41 @@ namespace Eval::NNUE::Layers {
 
         // Size of forward propagation buffer used in this layer
         static constexpr std::size_t kSelfBufferSize =
-            CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+            ceil_to_multiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
 
         // Size of the forward propagation buffer used from the input layer to this layer
         static constexpr std::size_t kBufferSize =
             PreviousLayer::kBufferSize + kSelfBufferSize;
 
         // Hash value embedded in the evaluation file
-        static constexpr std::uint32_t GetHashValue() {
+        static constexpr std::uint32_t get_hash_value() {
             std::uint32_t hash_value = 0x538D24C7u;
-            hash_value += PreviousLayer::GetHashValue();
+            hash_value += PreviousLayer::get_hash_value();
             return hash_value;
         }
 
         // A string that represents the structure from the input layer to this layer
-        static std::string GetStructureString() {
+        static std::string get_structure_string() {
             return "ClippedReLU[" +
                 std::to_string(kOutputDimensions) + "](" +
-                PreviousLayer::GetStructureString() + ")";
+                PreviousLayer::get_structure_string() + ")";
         }
 
         // Read network parameters
-        bool ReadParameters(std::istream& stream) {
-            return previous_layer_.ReadParameters(stream);
+        bool read_parameters(std::istream& stream) {
+            return previous_layer_.read_parameters(stream);
         }
 
         // write parameters
-        bool WriteParameters(std::ostream& stream) const {
-            return previous_layer_.WriteParameters(stream);
+        bool write_parameters(std::ostream& stream) const {
+            return previous_layer_.write_parameters(stream);
         }
 
         // Forward propagation
-        const OutputType* Propagate(
+        const OutputType* propagate(
             const TransformedFeatureType* transformed_features, char* buffer) const {
 
-            const auto input = previous_layer_.Propagate(
+            const auto input = previous_layer_.propagate(
                 transformed_features, buffer + kSelfBufferSize);
             const auto output = reinterpret_cast<OutputType*>(buffer);
 
diff --git a/src/nnue/layers/input_slice.h b/src/nnue/layers/input_slice.h
index 9d9476a5..b69028ab 100644
--- a/src/nnue/layers/input_slice.h
+++ b/src/nnue/layers/input_slice.h
@@ -45,31 +45,31 @@ namespace Eval::NNUE::Layers {
       static constexpr std::size_t kBufferSize = 0;
 
       // Hash value embedded in the evaluation file
-      static constexpr std::uint32_t GetHashValue() {
+      static constexpr std::uint32_t get_hash_value() {
           std::uint32_t hash_value = 0xEC42E90Du;
           hash_value ^= kOutputDimensions ^ (Offset << 10);
           return hash_value;
       }
 
       // A string that represents the structure from the input layer to this layer
-      static std::string GetStructureString() {
+      static std::string get_structure_string() {
           return "InputSlice[" + std::to_string(kOutputDimensions) + "(" +
               std::to_string(Offset) + ":" +
               std::to_string(Offset + kOutputDimensions) + ")]";
       }
 
       // Read network parameters
-      bool ReadParameters(std::istream& /*stream*/) {
+      bool read_parameters(std::istream& /*stream*/) {
           return true;
       }
 
       // write parameters
-      bool WriteParameters(std::ostream& /*stream*/) const {
+      bool write_parameters(std::ostream& /*stream*/) const {
           return true;
       }
 
       // Forward propagation
-      const OutputType* Propagate(
+      const OutputType* propagate(
           const TransformedFeatureType* transformed_features,
           char* /*buffer*/) const {
 
diff --git a/src/nnue/layers/sum.h b/src/nnue/layers/sum.h
index c81f5850..64ef30f9 100644
--- a/src/nnue/layers/sum.h
+++ b/src/nnue/layers/sum.h
@@ -30,51 +30,51 @@ namespace Eval::NNUE::Layers {
 
         // Size of forward propagation buffer used in this layer
         static constexpr std::size_t kSelfBufferSize =
-            CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+            ceil_to_multiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
 
         // Size of the forward propagation buffer used from the input layer to this layer
         static constexpr std::size_t kBufferSize =
             std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize);
 
         // Hash value embedded in the evaluation function file
-        static constexpr std::uint32_t GetHashValue() {
+        static constexpr std::uint32_t get_hash_value() {
             std::uint32_t hash_value = 0xBCE400B4u;
-            hash_value ^= Head::GetHashValue() >> 1;
-            hash_value ^= Head::GetHashValue() << 31;
-            hash_value ^= Tail::GetHashValue() >> 2;
-            hash_value ^= Tail::GetHashValue() << 30;
+            hash_value ^= Head::get_hash_value() >> 1;
+            hash_value ^= Head::get_hash_value() << 31;
+            hash_value ^= Tail::get_hash_value() >> 2;
+            hash_value ^= Tail::get_hash_value() << 30;
             return hash_value;
         }
 
         // A string that represents the structure from the input layer to this layer
-        static std::string GetStructureString() {
+        static std::string get_structure_string() {
             return "Sum[" +
-                std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
+                std::to_string(kOutputDimensions) + "](" + get_summands_string() + ")";
         }
 
         // read parameters
-        bool ReadParameters(std::istream& stream) {
-            if (!Tail::ReadParameters(stream))
+        bool read_parameters(std::istream& stream) {
+            if (!Tail::read_parameters(stream))
                 return false;
 
-            return previous_layer_.ReadParameters(stream);
+            return previous_layer_.read_parameters(stream);
         }
 
         // write parameters
-        bool WriteParameters(std::ostream& stream) const {
-            if (!Tail::WriteParameters(stream))
+        bool write_parameters(std::ostream& stream) const {
+            if (!Tail::write_parameters(stream))
                 return false;
 
-            return previous_layer_.WriteParameters(stream);
+            return previous_layer_.write_parameters(stream);
         }
 
         // forward propagation
-        const OutputType* Propagate(
+        const OutputType* propagate(
             const TransformedFeatureType* transformed_features, char* buffer) const {
 
-            Tail::Propagate(transformed_features, buffer);
+            Tail::propagate(transformed_features, buffer);
 
-            const auto head_output = previous_layer_.Propagate(
+            const auto head_output = previous_layer_.propagate(
                 transformed_features, buffer + kSelfBufferSize);
 
             const auto output = reinterpret_cast<OutputType*>(buffer);
@@ -88,8 +88,8 @@ namespace Eval::NNUE::Layers {
 
     protected:
         // A string that represents the list of layers to be summed
-        static std::string GetSummandsString() {
-            return Head::GetStructureString() + "," + Tail::GetSummandsString();
+        static std::string get_summands_string() {
+            return Head::get_structure_string() + "," + Tail::get_summands_string();
         }
 
         // Make the learning class a friend
@@ -118,40 +118,40 @@ namespace Eval::NNUE::Layers {
         static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize;
 
         // Hash value embedded in the evaluation function file
-        static constexpr std::uint32_t GetHashValue() {
+        static constexpr std::uint32_t get_hash_value() {
             std::uint32_t hash_value = 0xBCE400B4u;
-            hash_value ^= PreviousLayer::GetHashValue() >> 1;
-            hash_value ^= PreviousLayer::GetHashValue() << 31;
+            hash_value ^= PreviousLayer::get_hash_value() >> 1;
+            hash_value ^= PreviousLayer::get_hash_value() << 31;
             return hash_value;
         }
 
         // A string that represents the structure from the input layer to this layer
-        static std::string GetStructureString() {
+        static std::string get_structure_string() {
             return "Sum[" +
-                std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
+                std::to_string(kOutputDimensions) + "](" + get_summands_string() + ")";
         }
 
         // read parameters
-        bool ReadParameters(std::istream& stream) {
-            return previous_layer_.ReadParameters(stream);
+        bool read_parameters(std::istream& stream) {
+            return previous_layer_.read_parameters(stream);
         }
 
         // write parameters
-        bool WriteParameters(std::ostream& stream) const {
-            return previous_layer_.WriteParameters(stream);
+        bool write_parameters(std::ostream& stream) const {
+            return previous_layer_.write_parameters(stream);
         }
 
         // forward propagation
-        const OutputType* Propagate(
+        const OutputType* propagate(
             const TransformedFeatureType* transformed_features, char* buffer) const {
 
-            return previous_layer_.Propagate(transformed_features, buffer);
+            return previous_layer_.propagate(transformed_features, buffer);
         }
 
     protected:
         // A string that represents the list of layers to be summed
-        static std::string GetSummandsString() {
-            return PreviousLayer::GetStructureString();
+        static std::string get_summands_string() {
+            return PreviousLayer::get_structure_string();
         }
 
         // Make the learning class a friend
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index 70c7596d..bd4294a3 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -127,7 +127,7 @@ namespace Eval::NNUE {
 
     // Round n up to be a multiple of base
     template <typename IntType>
-    constexpr IntType CeilToMultiple(IntType n, IntType base) {
+    constexpr IntType ceil_to_multiple(IntType n, IntType base) {
         return (n + base - 1) / base * base;
     }
 
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 2fc24dab..87b8ee58 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -111,20 +111,20 @@ namespace Eval::NNUE {
             kOutputDimensions * sizeof(OutputType);
 
         // Hash value embedded in the evaluation file
-        static constexpr std::uint32_t GetHashValue() {
+        static constexpr std::uint32_t get_hash_value() {
 
             return RawFeatures::kHashValue ^ kOutputDimensions;
         }
 
         // a string representing the structure
-        static std::string GetStructureString() {
-            return RawFeatures::GetName() + "[" +
+        static std::string get_structure_string() {
+            return RawFeatures::get_name() + "[" +
                 std::to_string(kInputDimensions) + "->" +
                 std::to_string(kHalfDimensions) + "x2]";
         }
 
         // Read network parameters
-        bool ReadParameters(std::istream& stream) {
+        bool read_parameters(std::istream& stream) {
 
             for (std::size_t i = 0; i < kHalfDimensions; ++i)
                 biases_[i] = read_little_endian<BiasType>(stream);
@@ -136,7 +136,7 @@ namespace Eval::NNUE {
         }
 
         // write parameters
-        bool WriteParameters(std::ostream& stream) const {
+        bool write_parameters(std::ostream& stream) const {
             stream.write(reinterpret_cast<const char*>(biases_),
                 kHalfDimensions * sizeof(BiasType));
 
@@ -147,7 +147,7 @@ namespace Eval::NNUE {
         }
 
         // Proceed with the difference calculation if possible
-        bool UpdateAccumulatorIfPossible(const Position& pos) const {
+        bool update_accumulator_if_possible(const Position& pos) const {
 
             const auto now = pos.state();
             if (now->accumulator.computed_accumulation)
@@ -155,7 +155,7 @@ namespace Eval::NNUE {
 
             const auto prev = now->previous;
             if (prev && prev->accumulator.computed_accumulation) {
-                UpdateAccumulator(pos);
+                update_accumulator(pos);
                 return true;
             }
 
@@ -163,10 +163,10 @@ namespace Eval::NNUE {
         }
 
         // Convert input features
-        void Transform(const Position& pos, OutputType* output) const {
+        void transform(const Position& pos, OutputType* output) const {
 
-            if (!UpdateAccumulatorIfPossible(pos))
-              RefreshAccumulator(pos);
+            if (!update_accumulator_if_possible(pos))
+              refresh_accumulator(pos);
 
             const auto& accumulation = pos.state()->accumulator.accumulation;
 
@@ -294,13 +294,13 @@ namespace Eval::NNUE {
 
     private:
         // Calculate cumulative value without using difference calculation
-        void RefreshAccumulator(const Position& pos) const {
+        void refresh_accumulator(const Position& pos) const {
 
             auto& accumulator = pos.state()->accumulator;
             for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
                 Features::IndexList active_indices[2];
-                RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
-                                                 active_indices);
+                RawFeatures::append_active_indices(pos, kRefreshTriggers[i],
+                                                   active_indices);
                 for (Color perspective : { WHITE, BLACK }) {
 #ifdef TILING
                     for (unsigned j = 0; j < kHalfDimensions / kTileHeight; ++j) {
@@ -357,15 +357,15 @@ namespace Eval::NNUE {
         }
 
         // Calculate cumulative value using difference calculation
-        void UpdateAccumulator(const Position& pos) const {
+        void update_accumulator(const Position& pos) const {
 
             const auto& prev_accumulator = pos.state()->previous->accumulator;
             auto& accumulator = pos.state()->accumulator;
             for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
                 Features::IndexList removed_indices[2], added_indices[2];
                 bool reset[2] = { false, false };
-                RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
-                                                  removed_indices, added_indices, reset);
+                RawFeatures::append_changed_indices(pos, kRefreshTriggers[i],
+                                                    removed_indices, added_indices, reset);
 
 #ifdef TILING
                 for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
diff --git a/src/nnue/nnue_test_command.cpp b/src/nnue/nnue_test_command.cpp
index 55fa603a..d892222b 100644
--- a/src/nnue/nnue_test_command.cpp
+++ b/src/nnue/nnue_test_command.cpp
@@ -24,7 +24,7 @@ namespace Eval::NNUE {
     namespace {
 
         // Testing RawFeatures mainly for difference calculation
-        void TestFeatures(Position& pos) {
+        void test_features(Position& pos) {
             const std::uint64_t num_games = 1000;
             StateInfo si;
             pos.set(StartFEN, false, &si, Threads.main());
@@ -47,7 +47,7 @@ namespace Eval::NNUE {
 
                 for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
                     Features::IndexList active_indices[2];
-                    RawFeatures::AppendActiveIndices(position, kRefreshTriggers[i],
+                    RawFeatures::append_active_indices(position, kRefreshTriggers[i],
                                                      active_indices);
 
                     for (const auto perspective : Colors) {
@@ -68,7 +68,7 @@ namespace Eval::NNUE {
                 for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
                     Features::IndexList removed_indices[2], added_indices[2];
                     bool reset[2] = { false, false };
-                    RawFeatures::AppendChangedIndices(position, kRefreshTriggers[i],
+                    RawFeatures::append_changed_indices(position, kRefreshTriggers[i],
                                                       removed_indices, added_indices, reset);
                     for (const auto perspective : Colors) {
                         if (reset[perspective]) {
@@ -99,7 +99,7 @@ namespace Eval::NNUE {
                 }
             };
 
-            std::cout << "feature set: " << RawFeatures::GetName()
+            std::cout << "feature set: " << RawFeatures::get_name()
                       << "[" << RawFeatures::kDimensions << "]" << std::endl;
             std::cout << "start testing with random games";
 
@@ -154,8 +154,8 @@ namespace Eval::NNUE {
         }
 
         // Output a string that represents the structure of the evaluation function
-        void PrintInfo(std::istream& stream) {
-            std::cout << "network architecture: " << GetArchitectureString() << std::endl;
+        void print_info(std::istream& stream) {
+            std::cout << "network architecture: " << get_architecture_string() << std::endl;
 
             while (true) {
                 std::string file_name;
@@ -170,7 +170,7 @@ namespace Eval::NNUE {
 
                     if (!file_stream)
                         return false;
-                    if (!ReadHeader(file_stream, &hash_value, &architecture))
+                    if (!read_header(file_stream, &hash_value, &architecture))
                         return false;
 
                     return true;
@@ -180,7 +180,7 @@ namespace Eval::NNUE {
                 if (success) {
                     if (hash_value == kHashValue) {
                         std::cout << "matches with this binary";
-                        if (architecture != GetArchitectureString()) {
+                        if (architecture != get_architecture_string()) {
                             std::cout << ", but architecture string differs: " << architecture;
                         }
 
@@ -197,14 +197,14 @@ namespace Eval::NNUE {
     }  // namespace
 
     // USI extended command for NNUE evaluation function
-    void TestCommand(Position& pos, std::istream& stream) {
+    void test_command(Position& pos, std::istream& stream) {
         std::string sub_command;
         stream >> sub_command;
 
         if (sub_command == "test_features") {
-            TestFeatures(pos);
+            test_features(pos);
         } else if (sub_command == "info") {
-            PrintInfo(stream);
+            print_info(stream);
         } else {
             std::cout << "usage:" << std::endl;
             std::cout << " test nnue test_features" << std::endl;
diff --git a/src/nnue/nnue_test_command.h b/src/nnue/nnue_test_command.h
index 989731d6..fcfe16f6 100644
--- a/src/nnue/nnue_test_command.h
+++ b/src/nnue/nnue_test_command.h
@@ -5,7 +5,7 @@
 namespace Eval::NNUE {
 
     // USI extended command for NNUE evaluation function
-    void TestCommand(Position& pos, std::istream& stream);
+    void test_command(Position& pos, std::istream& stream);
 
 }  // namespace Eval::NNUE
 
diff --git a/src/nnue/trainer/features/factorizer.h b/src/nnue/trainer/features/factorizer.h
index 784fe047..49a2fe26 100644
--- a/src/nnue/trainer/features/factorizer.h
+++ b/src/nnue/trainer/features/factorizer.h
@@ -14,12 +14,12 @@ namespace Eval::NNUE::Features {
     class Factorizer {
     public:
         // Get the dimensionality of the learning feature
-        static constexpr IndexType GetDimensions() {
+        static constexpr IndexType get_dimensions() {
             return FeatureType::kDimensions;
         }
 
         // Get index of learning feature and scale of learning rate
-        static void AppendTrainingFeatures(
+        static void append_training_features(
             IndexType base_index, std::vector<TrainingFeature>* training_features) {
 
             assert(base_index <FeatureType::kDimensions);
@@ -35,7 +35,7 @@ namespace Eval::NNUE::Features {
 
     // Add the original input features to the learning features
     template <typename FeatureType>
-    IndexType AppendBaseFeature(
+    IndexType append_base_feature(
         FeatureProperties properties, IndexType base_index,
         std::vector<TrainingFeature>* training_features) {
 
@@ -47,7 +47,7 @@ namespace Eval::NNUE::Features {
 
     // If the learning rate scale is not 0, inherit other types of learning features
     template <typename FeatureType>
-    IndexType InheritFeaturesIfRequired(
+    IndexType inherit_features_if_required(
         IndexType index_offset, FeatureProperties properties, IndexType base_index,
         std::vector<TrainingFeature>* training_features) {
 
@@ -55,17 +55,17 @@ namespace Eval::NNUE::Features {
             return 0;
         }
 
-        assert(properties.dimensions == Factorizer<FeatureType>::GetDimensions());
+        assert(properties.dimensions == Factorizer<FeatureType>::get_dimensions());
         assert(base_index < FeatureType::kDimensions);
 
         const auto start = training_features->size();
-        Factorizer<FeatureType>::AppendTrainingFeatures(
+        Factorizer<FeatureType>::append_training_features(
             base_index, training_features);
 
         for (auto i = start; i < training_features->size(); ++i) {
             auto& feature = (*training_features)[i];
-            assert(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
-            feature.ShiftIndex(index_offset);
+            assert(feature.get_index() < Factorizer<FeatureType>::get_dimensions());
+            feature.shift_index(index_offset);
         }
 
         return properties.dimensions;
@@ -73,7 +73,7 @@ namespace Eval::NNUE::Features {
 
     // Return the index difference as needed, without adding learning features
     // Call instead of InheritFeaturesIfRequired() if there are no corresponding features
-    IndexType SkipFeatures(FeatureProperties properties) {
+    IndexType skip_features(FeatureProperties properties) {
         if (!properties.active)
             return 0;
 
@@ -82,7 +82,7 @@ namespace Eval::NNUE::Features {
 
     // Get the dimensionality of the learning feature
     template <std::size_t N>
-    constexpr IndexType GetActiveDimensions(
+    constexpr IndexType get_active_dimensions(
         const FeatureProperties (&properties)[N]) {
 
         static_assert(N > 0, "");
@@ -100,7 +100,7 @@ namespace Eval::NNUE::Features {
 
     // get the number of elements in the array
     template <typename T, std::size_t N>
-    constexpr std::size_t GetArrayLength(const T (&/*array*/)[N]) {
+    constexpr std::size_t get_array_length(const T (&/*array*/)[N]) {
         return N;
     }
 
diff --git a/src/nnue/trainer/features/factorizer_feature_set.h b/src/nnue/trainer/features/factorizer_feature_set.h
index d272a453..032a449b 100644
--- a/src/nnue/trainer/features/factorizer_feature_set.h
+++ b/src/nnue/trainer/features/factorizer_feature_set.h
@@ -22,12 +22,12 @@ namespace Eval::NNUE::Features {
             FeatureSet<FirstFeatureType, RemainingFeatureTypes...>::kDimensions;
 
         // Get the dimensionality of the learning feature
-        static constexpr IndexType GetDimensions() {
-            return Head::GetDimensions() + Tail::GetDimensions();
+        static constexpr IndexType get_dimensions() {
+            return Head::get_dimensions() + Tail::get_dimensions();
         }
 
         // Get index of learning feature and scale of learning rate
-        static void AppendTrainingFeatures(
+        static void append_training_features(
             IndexType base_index, std::vector<TrainingFeature>* training_features,
             IndexType base_dimensions = kBaseDimensions) {
 
@@ -36,29 +36,29 @@ namespace Eval::NNUE::Features {
             constexpr auto boundary = FeatureSet<RemainingFeatureTypes...>::kDimensions;
 
             if (base_index < boundary) {
-                Tail::AppendTrainingFeatures(
+                Tail::append_training_features(
                     base_index, training_features, base_dimensions);
             }
             else {
                 const auto start = training_features->size();
 
-                Head::AppendTrainingFeatures(
+                Head::append_training_features(
                     base_index - boundary, training_features, base_dimensions);
 
                 for (auto i = start; i < training_features->size(); ++i) {
                     auto& feature = (*training_features)[i];
-                    const auto index = feature.GetIndex();
+                    const auto index = feature.get_index();
 
-                    assert(index < Head::GetDimensions() ||
+                    assert(index < Head::get_dimensions() ||
                                (index >= base_dimensions &&
                                 index < base_dimensions +
-                                        Head::GetDimensions() - Head::kBaseDimensions));
+                                        Head::get_dimensions() - Head::kBaseDimensions));
 
                     if (index < Head::kBaseDimensions) {
-                        feature.ShiftIndex(Tail::kBaseDimensions);
+                        feature.shift_index(Tail::kBaseDimensions);
                     }
                     else {
-                        feature.ShiftIndex(Tail::GetDimensions() - Tail::kBaseDimensions);
+                        feature.shift_index(Tail::get_dimensions() - Tail::kBaseDimensions);
                     }
                 }
             }
@@ -74,12 +74,12 @@ namespace Eval::NNUE::Features {
         static constexpr IndexType kBaseDimensions = FeatureType::kDimensions;
 
         // Get the dimensionality of the learning feature
-        static constexpr IndexType GetDimensions() {
-            return Factorizer<FeatureType>::GetDimensions();
+        static constexpr IndexType get_dimensions() {
+            return Factorizer<FeatureType>::get_dimensions();
         }
 
         // Get index of learning feature and scale of learning rate
-        static void AppendTrainingFeatures(
+        static void append_training_features(
             IndexType base_index, std::vector<TrainingFeature>* training_features,
             IndexType base_dimensions = kBaseDimensions) {
 
@@ -87,14 +87,14 @@ namespace Eval::NNUE::Features {
 
             const auto start = training_features->size();
 
-            Factorizer<FeatureType>::AppendTrainingFeatures(
+            Factorizer<FeatureType>::append_training_features(
                 base_index, training_features);
 
             for (auto i = start; i < training_features->size(); ++i) {
                 auto& feature = (*training_features)[i];
-                assert(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
-                if (feature.GetIndex() >= kBaseDimensions) {
-                    feature.ShiftIndex(base_dimensions - kBaseDimensions);
+                assert(feature.get_index() < Factorizer<FeatureType>::get_dimensions());
+                if (feature.get_index() >= kBaseDimensions) {
+                    feature.shift_index(base_dimensions - kBaseDimensions);
                 }
             }
         }
diff --git a/src/nnue/trainer/features/factorizer_half_kp.h b/src/nnue/trainer/features/factorizer_half_kp.h
index 1ed5bdd3..152722ac 100644
--- a/src/nnue/trainer/features/factorizer_half_kp.h
+++ b/src/nnue/trainer/features/factorizer_half_kp.h
@@ -37,25 +37,25 @@ namespace Eval::NNUE::Features {
             // kFeaturesHalfK
             {true, SQUARE_NB},
             // kFeaturesP
-            {true, Factorizer<P>::GetDimensions()},
+            {true, Factorizer<P>::get_dimensions()},
             // kFeaturesHalfRelativeKP
-            {true, Factorizer<HalfRelativeKP<AssociatedKing>>::GetDimensions()},
+            {true, Factorizer<HalfRelativeKP<AssociatedKing>>::get_dimensions()},
         };
 
-        static_assert(GetArrayLength(kProperties) == kNumTrainingFeatureTypes, "");
+        static_assert(get_array_length(kProperties) == kNumTrainingFeatureTypes, "");
 
     public:
         // Get the dimensionality of the learning feature
-        static constexpr IndexType GetDimensions() {
-            return GetActiveDimensions(kProperties);
+        static constexpr IndexType get_dimensions() {
+            return get_active_dimensions(kProperties);
         }
 
         // Get index of learning feature and scale of learning rate
-        static void AppendTrainingFeatures(
+        static void append_training_features(
             IndexType base_index, std::vector<TrainingFeature>* training_features) {
 
             // kFeaturesHalfKP
-            IndexType index_offset = AppendBaseFeature<FeatureType>(
+            IndexType index_offset = append_base_feature<FeatureType>(
                 kProperties[kFeaturesHalfKP], base_index, training_features);
 
             const auto sq_k = static_cast<Square>(base_index / PS_END);
@@ -71,20 +71,20 @@ namespace Eval::NNUE::Features {
             }
 
             // kFeaturesP
-            index_offset += InheritFeaturesIfRequired<P>(
+            index_offset += inherit_features_if_required<P>(
                 index_offset, kProperties[kFeaturesP], p, training_features);
             // kFeaturesHalfRelativeKP
             if (p >= PS_W_PAWN) {
-                index_offset += InheritFeaturesIfRequired<HalfRelativeKP<AssociatedKing>>(
+                index_offset += inherit_features_if_required<HalfRelativeKP<AssociatedKing>>(
                     index_offset, kProperties[kFeaturesHalfRelativeKP],
-                    HalfRelativeKP<AssociatedKing>::MakeIndex(sq_k, p),
+                    HalfRelativeKP<AssociatedKing>::make_index(sq_k, p),
                     training_features);
             }
             else {
-                index_offset += SkipFeatures(kProperties[kFeaturesHalfRelativeKP]);
+                index_offset += skip_features(kProperties[kFeaturesHalfRelativeKP]);
             }
 
-            assert(index_offset == GetDimensions());
+            assert(index_offset == get_dimensions());
         }
     };
 
diff --git a/src/nnue/trainer/trainer.h b/src/nnue/trainer/trainer.h
index 7d9b66ee..85666576 100644
--- a/src/nnue/trainer/trainer.h
+++ b/src/nnue/trainer/trainer.h
@@ -37,22 +37,22 @@ namespace Eval::NNUE {
         }
 
         TrainingFeature& operator+=(const TrainingFeature& other) {
-            assert(other.GetIndex() == GetIndex());
-            assert(other.GetCount() + GetCount() < (1 << kCountBits));
-            index_and_count_ += other.GetCount();
+            assert(other.get_index() == get_index());
+            assert(other.get_index() + get_count() < (1 << kCountBits));
+            index_and_count_ += other.get_count();
             return *this;
         }
 
-        IndexType GetIndex() const {
+        IndexType get_index() const {
             return static_cast<IndexType>(index_and_count_ >> kCountBits);
         }
 
-        void ShiftIndex(IndexType offset) {
-            assert(GetIndex() + offset < (1 << kIndexBits));
+        void shift_index(IndexType offset) {
+            assert(get_index() + offset < (1 << kIndexBits));
             index_and_count_ += offset << kCountBits;
         }
 
-        IndexType GetCount() const {
+        IndexType get_count() const {
             return static_cast<IndexType>(index_and_count_ & ((1 << kCountBits) - 1));
         }
 
@@ -86,7 +86,7 @@ namespace Eval::NNUE {
     };
 
     // determine whether to accept the message
-    bool ReceiveMessage(const std::string& name, Message* message) {
+    bool receive_message(const std::string& name, Message* message) {
         const auto subscript = "[" + std::to_string(message->num_peekers) + "]";
 
         if (message->name.substr(0, name.size() + 1) == name + "[") {
@@ -101,28 +101,15 @@ namespace Eval::NNUE {
         return false;
     }
 
-    // split the string
-    std::vector<std::string> Split(const std::string& input, char delimiter) {
-        std::istringstream stream(input);
-        std::string field;
-        std::vector<std::string> fields;
-
-        while (std::getline(stream, field, delimiter)) {
-            fields.push_back(field);
-        }
-
-        return fields;
-    }
-
     // round a floating point number to an integer
     template <typename IntType>
-    IntType Round(double value) {
+    IntType round(double value) {
         return static_cast<IntType>(std::floor(value + 0.5));
     }
 
     // make_shared with alignment
     template <typename T, typename... ArgumentTypes>
-    std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments) {
+    std::shared_ptr<T> make_aligned_shared_ptr(ArgumentTypes&&... arguments) {
         const auto ptr = new(std_aligned_alloc(alignof(T), sizeof(T)))
             T(std::forward<ArgumentTypes>(arguments)...);
 
diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index dd70b8fb..f6d374ef 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -21,7 +21,7 @@ namespace Eval::NNUE {
 
     public:
         // factory function
-        static std::shared_ptr<Trainer> Create(
+        static std::shared_ptr<Trainer> create(
             LayerType* target_layer, FeatureTransformer* ft) {
 
             return std::shared_ptr<Trainer>(
@@ -29,31 +29,31 @@ namespace Eval::NNUE {
         }
 
         // Set options such as hyperparameters
-        void SendMessage(Message* message) {
-            previous_layer_trainer_->SendMessage(message);
+        void send_message(Message* message) {
+            previous_layer_trainer_->send_message(message);
 
-            if (ReceiveMessage("momentum", message)) {
+            if (receive_message("momentum", message)) {
                 momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
             }
 
-            if (ReceiveMessage("learning_rate_scale", message)) {
+            if (receive_message("learning_rate_scale", message)) {
                 learning_rate_scale_ =
                     static_cast<LearnFloatType>(std::stod(message->value));
             }
 
-            if (ReceiveMessage("reset", message)) {
-                DequantizeParameters();
+            if (receive_message("reset", message)) {
+                dequantize_parameters();
             }
 
-            if (ReceiveMessage("quantize_parameters", message)) {
-                QuantizeParameters();
+            if (receive_message("quantize_parameters", message)) {
+                quantize_parameters();
             }
         }
 
         // Initialize the parameters with random numbers
         template <typename RNG>
-        void Initialize(RNG& rng) {
-            previous_layer_trainer_->Initialize(rng);
+        void initialize(RNG& rng) {
+            previous_layer_trainer_->initialize(rng);
 
             if (kIsOutputLayer) {
                 // Initialize output layer with 0
@@ -80,18 +80,18 @@ namespace Eval::NNUE {
                 }
             }
 
-            QuantizeParameters();
+            quantize_parameters();
         }
 
         // forward propagation
-        const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+        const LearnFloatType* propagate(const std::vector<Example>& batch) {
             if (output_.size() < kOutputDimensions * batch.size()) {
                 output_.resize(kOutputDimensions * batch.size());
                 gradients_.resize(kInputDimensions * batch.size());
             }
 
             batch_size_ = static_cast<IndexType>(batch.size());
-            batch_input_ = previous_layer_trainer_->Propagate(batch);
+            batch_input_ = previous_layer_trainer_->propagate(batch);
 #if defined(USE_BLAS)
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
@@ -123,7 +123,7 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void Backpropagate(const LearnFloatType* gradients,
+        void backpropagate(const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
             const LearnFloatType local_learning_rate =
@@ -206,7 +206,7 @@ namespace Eval::NNUE {
             }
 
 #endif
-            previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
+            previous_layer_trainer_->backpropagate(gradients_.data(), learning_rate);
         }
 
     private:
@@ -214,7 +214,7 @@ namespace Eval::NNUE {
         Trainer(LayerType* target_layer, FeatureTransformer* ft) :
             batch_size_(0),
             batch_input_(nullptr),
-            previous_layer_trainer_(Trainer<PreviousLayer>::Create(
+            previous_layer_trainer_(Trainer<PreviousLayer>::create(
                 &target_layer->previous_layer_, ft)),
             target_layer_(target_layer),
             biases_(),
@@ -224,11 +224,11 @@ namespace Eval::NNUE {
             momentum_(0.2),
             learning_rate_scale_(1.0) {
 
-            DequantizeParameters();
+            dequantize_parameters();
         }
 
         // Weight saturation and parameterization
-        void QuantizeParameters() {
+        void quantize_parameters() {
             for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
                 weights_[i] = std::max(-kMaxWeightMagnitude,
                                        std::min(+kMaxWeightMagnitude, weights_[i]));
@@ -236,7 +236,7 @@ namespace Eval::NNUE {
 
             for (IndexType i = 0; i < kOutputDimensions; ++i) {
                 target_layer_->biases_[i] =
-                    Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
+                    round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
             }
 
             for (IndexType i = 0; i < kOutputDimensions; ++i) {
@@ -244,14 +244,14 @@ namespace Eval::NNUE {
                 const auto padded_offset = LayerType::kPaddedInputDimensions * i;
                 for (IndexType j = 0; j < kInputDimensions; ++j) {
                     target_layer_->weights_[padded_offset + j] =
-                        Round<typename LayerType::WeightType>(
+                        round<typename LayerType::WeightType>(
                             weights_[offset + j] * kWeightScale);
                 }
             }
         }
 
         // read parameterized integer
-        void DequantizeParameters() {
+        void dequantize_parameters() {
             for (IndexType i = 0; i < kOutputDimensions; ++i) {
                 biases_[i] = static_cast<LearnFloatType>(
                     target_layer_->biases_[i] / kBiasScale);
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index 902c2747..35503493 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -19,7 +19,7 @@ namespace Eval::NNUE {
 
     public:
         // factory function
-        static std::shared_ptr<Trainer> Create(
+        static std::shared_ptr<Trainer> create(
             LayerType* target_layer, FeatureTransformer* ft) {
 
             return std::shared_ptr<Trainer>(
@@ -27,27 +27,27 @@ namespace Eval::NNUE {
         }
 
         // Set options such as hyperparameters
-        void SendMessage(Message* message) {
-            previous_layer_trainer_->SendMessage(message);
-            if (ReceiveMessage("check_health", message)) {
-                CheckHealth();
+        void send_message(Message* message) {
+            previous_layer_trainer_->send_message(message);
+            if (receive_message("check_health", message)) {
+                check_health();
             }
         }
 
         // Initialize the parameters with random numbers
         template <typename RNG>
-        void Initialize(RNG& rng) {
-            previous_layer_trainer_->Initialize(rng);
+        void initialize(RNG& rng) {
+            previous_layer_trainer_->initialize(rng);
         }
 
         // forward propagation
-        const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+        const LearnFloatType* propagate(const std::vector<Example>& batch) {
             if (output_.size() < kOutputDimensions * batch.size()) {
               output_.resize(kOutputDimensions * batch.size());
               gradients_.resize(kInputDimensions * batch.size());
             }
 
-            const auto input = previous_layer_trainer_->Propagate(batch);
+            const auto input = previous_layer_trainer_->propagate(batch);
             batch_size_ = static_cast<IndexType>(batch.size());
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
@@ -63,7 +63,7 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void Backpropagate(const LearnFloatType* gradients,
+        void backpropagate(const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
             for (IndexType b = 0; b < batch_size_; ++b) {
@@ -75,14 +75,14 @@ namespace Eval::NNUE {
                 }
             }
 
-            previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
+            previous_layer_trainer_->backpropagate(gradients_.data(), learning_rate);
         }
 
     private:
         // constructor
         Trainer(LayerType* target_layer, FeatureTransformer* ft) :
             batch_size_(0),
-            previous_layer_trainer_(Trainer<PreviousLayer>::Create(
+            previous_layer_trainer_(Trainer<PreviousLayer>::create(
                 &target_layer->previous_layer_, ft)),
             target_layer_(target_layer) {
 
@@ -93,7 +93,7 @@ namespace Eval::NNUE {
         }
 
         // Check if there are any problems with learning
-        void CheckHealth() {
+        void check_health() {
             const auto largest_min_activation = *std::max_element(
                 std::begin(min_activations_), std::end(min_activations_));
             const auto smallest_max_activation = *std::min_element(
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 4173f46d..a3d6c16a 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -34,44 +34,44 @@ namespace Eval::NNUE {
         friend struct AlignedDeleter;
 
         template <typename T, typename... ArgumentTypes>
-        friend std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments);
+        friend std::shared_ptr<T> make_aligned_shared_ptr(ArgumentTypes&&... arguments);
 
         // factory function
-        static std::shared_ptr<Trainer> Create(LayerType* target_layer) {
-            return MakeAlignedSharedPtr<Trainer>(target_layer);
+        static std::shared_ptr<Trainer> create(LayerType* target_layer) {
+            return make_aligned_shared_ptr<Trainer>(target_layer);
         }
 
         // Set options such as hyperparameters
-        void SendMessage(Message* message) {
-            if (ReceiveMessage("momentum", message)) {
+        void send_message(Message* message) {
+            if (receive_message("momentum", message)) {
                 momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
             }
 
-            if (ReceiveMessage("learning_rate_scale", message)) {
+            if (receive_message("learning_rate_scale", message)) {
                 learning_rate_scale_ =
                     static_cast<LearnFloatType>(std::stod(message->value));
             }
 
-            if (ReceiveMessage("reset", message)) {
-                DequantizeParameters();
+            if (receive_message("reset", message)) {
+                dequantize_parameters();
             }
 
-            if (ReceiveMessage("quantize_parameters", message)) {
-                QuantizeParameters();
+            if (receive_message("quantize_parameters", message)) {
+                quantize_parameters();
             }
 
-            if (ReceiveMessage("clear_unobserved_feature_weights", message)) {
-                ClearUnobservedFeatureWeights();
+            if (receive_message("clear_unobserved_feature_weights", message)) {
+                clear_unobserved_feature_weights();
             }
 
-            if (ReceiveMessage("check_health", message)) {
-                CheckHealth();
+            if (receive_message("check_health", message)) {
+                check_health();
             }
         }
 
         // Initialize the parameters with random numbers
         template <typename RNG>
-        void Initialize(RNG& rng) {
+        void initialize(RNG& rng) {
             std::fill(std::begin(weights_), std::end(weights_), +kZero);
 
             const double kSigma = 0.1 / std::sqrt(RawFeatures::kMaxActiveDimensions);
@@ -86,11 +86,11 @@ namespace Eval::NNUE {
                 biases_[i] = static_cast<LearnFloatType>(0.5);
             }
 
-            QuantizeParameters();
+            quantize_parameters();
         }
 
         // forward propagation
-        const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+        const LearnFloatType* propagate(const std::vector<Example>& batch) {
             if (output_.size() < kOutputDimensions * batch.size()) {
                 output_.resize(kOutputDimensions * batch.size());
                 gradients_.resize(kOutputDimensions * batch.size());
@@ -106,8 +106,8 @@ namespace Eval::NNUE {
 #if defined(USE_BLAS)
                     cblas_scopy(kHalfDimensions, biases_, 1, &output_[output_offset], 1);
                     for (const auto& feature : batch[b].training_features[c]) {
-                        const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
-                        cblas_saxpy(kHalfDimensions, (float)feature.GetCount(),
+                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
+                        cblas_saxpy(kHalfDimensions, (float)feature.get_count(),
                                     &weights_[weights_offset], 1, &output_[output_offset], 1);
                     }
 #else
@@ -115,10 +115,10 @@ namespace Eval::NNUE {
                         output_[output_offset + i] = biases_[i];
                     }
                     for (const auto& feature : batch[b].training_features[c]) {
-                        const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
+                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
                         for (IndexType i = 0; i < kHalfDimensions; ++i) {
                             output_[output_offset + i] +=
-                                feature.GetCount() * weights_[weights_offset + i];
+                                feature.get_count() * weights_[weights_offset + i];
                         }
                     }
 #endif
@@ -143,7 +143,7 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void Backpropagate(const LearnFloatType* gradients,
+        void backpropagate(const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
             const LearnFloatType local_learning_rate =
@@ -188,13 +188,13 @@ namespace Eval::NNUE {
                         const IndexType output_offset = batch_offset + kHalfDimensions * c;
                         for (const auto& feature : (*batch_)[b].training_features[c]) {
 #if defined(_OPENMP)
-                            if (feature.GetIndex() % num_threads != thread_index)
+                            if (feature.get_index() % num_threads != thread_index)
                                 continue;
 #endif
                             const IndexType weights_offset =
-                                kHalfDimensions * feature.GetIndex();
+                                kHalfDimensions * feature.get_index();
                             const auto scale = static_cast<LearnFloatType>(
-                                effective_learning_rate / feature.GetCount());
+                                effective_learning_rate / feature.get_count());
 
                             cblas_saxpy(kHalfDimensions, -scale,
                                         &gradients_[output_offset], 1,
@@ -228,9 +228,9 @@ namespace Eval::NNUE {
                 for (IndexType c = 0; c < 2; ++c) {
                     const IndexType output_offset = batch_offset + kHalfDimensions * c;
                     for (const auto& feature : (*batch_)[b].training_features[c]) {
-                        const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
+                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
                         const auto scale = static_cast<LearnFloatType>(
-                            effective_learning_rate / feature.GetCount());
+                            effective_learning_rate / feature.get_count());
 
                         for (IndexType i = 0; i < kHalfDimensions; ++i) {
                             weights_[weights_offset + i] -=
@@ -244,7 +244,7 @@ namespace Eval::NNUE {
             for (IndexType b = 0; b < batch_->size(); ++b) {
                 for (IndexType c = 0; c < 2; ++c) {
                     for (const auto& feature : (*batch_)[b].training_features[c]) {
-                        observed_features.set(feature.GetIndex());
+                        observed_features.set(feature.get_index());
                     }
                 }
             }
@@ -269,14 +269,14 @@ namespace Eval::NNUE {
             std::fill(std::begin(max_activations_), std::end(max_activations_),
                       std::numeric_limits<LearnFloatType>::lowest());
 
-            DequantizeParameters();
+            dequantize_parameters();
         }
 
         // Weight saturation and parameterization
-        void QuantizeParameters() {
+        void quantize_parameters() {
             for (IndexType i = 0; i < kHalfDimensions; ++i) {
                 target_layer_->biases_[i] =
-                    Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
+                    round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
             }
 
             std::vector<TrainingFeature> training_features;
@@ -284,23 +284,23 @@ namespace Eval::NNUE {
 #pragma omp parallel for private(training_features)
             for (IndexType j = 0; j < RawFeatures::kDimensions; ++j) {
                 training_features.clear();
-                Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
+                Features::Factorizer<RawFeatures>::append_training_features(
                     j, &training_features);
 
                 for (IndexType i = 0; i < kHalfDimensions; ++i) {
                     double sum = 0.0;
                     for (const auto& feature : training_features) {
-                        sum += weights_[kHalfDimensions * feature.GetIndex() + i];
+                        sum += weights_[kHalfDimensions * feature.get_index() + i];
                     }
 
                     target_layer_->weights_[kHalfDimensions * j + i] =
-                        Round<typename LayerType::WeightType>(sum * kWeightScale);
+                        round<typename LayerType::WeightType>(sum * kWeightScale);
                 }
             }
         }
 
         // read parameterized integer
-        void DequantizeParameters() {
+        void dequantize_parameters() {
             for (IndexType i = 0; i < kHalfDimensions; ++i) {
                 biases_[i] = static_cast<LearnFloatType>(
                     target_layer_->biases_[i] / kBiasScale);
@@ -317,7 +317,7 @@ namespace Eval::NNUE {
         }
 
         // Set the weight corresponding to the feature that does not appear in the learning data to 0
-        void ClearUnobservedFeatureWeights() {
+        void clear_unobserved_feature_weights() {
             for (IndexType i = 0; i < kInputDimensions; ++i) {
                 if (!observed_features.test(i)) {
                     std::fill(std::begin(weights_) + kHalfDimensions * i,
@@ -325,11 +325,11 @@ namespace Eval::NNUE {
                 }
             }
 
-            QuantizeParameters();
+            quantize_parameters();
         }
 
         // Check if there are any problems with learning
-        void CheckHealth() {
+        void check_health() {
             std::cout << "INFO: observed " << observed_features.count()
                       << " (out of " << kInputDimensions << ") features" << std::endl;
 
@@ -359,7 +359,7 @@ namespace Eval::NNUE {
 
         // number of input/output dimensions
         static constexpr IndexType kInputDimensions =
-            Features::Factorizer<RawFeatures>::GetDimensions();
+            Features::Factorizer<RawFeatures>::get_dimensions();
         static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
         static constexpr IndexType kHalfDimensions = LayerType::kHalfDimensions;
 
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index 45dcbacc..43968776 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -14,7 +14,7 @@ namespace Eval::NNUE {
     class SharedInputTrainer {
     public:
         // factory function
-        static std::shared_ptr<SharedInputTrainer> Create(
+        static std::shared_ptr<SharedInputTrainer> create(
             FeatureTransformer* ft) {
 
             static std::shared_ptr<SharedInputTrainer> instance;
@@ -29,10 +29,10 @@ namespace Eval::NNUE {
         }
 
         // Set options such as hyperparameters
-        void SendMessage(Message* message) {
+        void send_message(Message* message) {
             if (num_calls_ == 0) {
                 current_operation_ = Operation::kSendMessage;
-                feature_transformer_trainer_->SendMessage(message);
+                feature_transformer_trainer_->send_message(message);
             }
 
             assert(current_operation_ == Operation::kSendMessage);
@@ -45,10 +45,10 @@ namespace Eval::NNUE {
 
         // Initialize the parameters with random numbers
         template <typename RNG>
-        void Initialize(RNG& rng) {
+        void initialize(RNG& rng) {
             if (num_calls_ == 0) {
                 current_operation_ = Operation::kInitialize;
-                feature_transformer_trainer_->Initialize(rng);
+                feature_transformer_trainer_->initialize(rng);
             }
 
             assert(current_operation_ == Operation::kInitialize);
@@ -60,7 +60,7 @@ namespace Eval::NNUE {
         }
 
         // forward propagation
-        const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+        const LearnFloatType* propagate(const std::vector<Example>& batch) {
             if (gradients_.size() < kInputDimensions * batch.size()) {
                 gradients_.resize(kInputDimensions * batch.size());
             }
@@ -69,7 +69,7 @@ namespace Eval::NNUE {
 
             if (num_calls_ == 0) {
                 current_operation_ = Operation::kPropagate;
-                output_ = feature_transformer_trainer_->Propagate(batch);
+                output_ = feature_transformer_trainer_->propagate(batch);
             }
 
             assert(current_operation_ == Operation::kPropagate);
@@ -83,11 +83,11 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void Backpropagate(const LearnFloatType* gradients,
+        void backpropagate(const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
             if (num_referrers_ == 1) {
-                feature_transformer_trainer_->Backpropagate(gradients, learning_rate);
+                feature_transformer_trainer_->backpropagate(gradients, learning_rate);
                 return;
             }
 
@@ -111,7 +111,7 @@ namespace Eval::NNUE {
             }
 
             if (++num_calls_ == num_referrers_) {
-                feature_transformer_trainer_->Backpropagate(
+                feature_transformer_trainer_->backpropagate(
                     gradients_.data(), learning_rate);
                 num_calls_ = 0;
                 current_operation_ = Operation::kNone;
@@ -125,7 +125,7 @@ namespace Eval::NNUE {
             num_referrers_(0),
             num_calls_(0),
             current_operation_(Operation::kNone),
-            feature_transformer_trainer_(Trainer<FeatureTransformer>::Create(
+            feature_transformer_trainer_(Trainer<FeatureTransformer>::create(
                 ft)),
             output_(nullptr) {
         }
@@ -175,25 +175,25 @@ namespace Eval::NNUE {
 
     public:
         // factory function
-        static std::shared_ptr<Trainer> Create(
+        static std::shared_ptr<Trainer> create(
             LayerType* /*target_layer*/, FeatureTransformer* ft) {
 
             return std::shared_ptr<Trainer>(new Trainer(ft));
         }
 
         // Set options such as hyperparameters
-        void SendMessage(Message* message) {
-            shared_input_trainer_->SendMessage(message);
+        void send_message(Message* message) {
+            shared_input_trainer_->send_message(message);
         }
 
         // Initialize the parameters with random numbers
         template <typename RNG>
-        void Initialize(RNG& rng) {
-            shared_input_trainer_->Initialize(rng);
+        void initialize(RNG& rng) {
+            shared_input_trainer_->initialize(rng);
         }
 
         // forward propagation
-        const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+        const LearnFloatType* propagate(const std::vector<Example>& batch) {
             if (output_.size() < kOutputDimensions * batch.size()) {
               output_.resize(kOutputDimensions * batch.size());
               gradients_.resize(kInputDimensions * batch.size());
@@ -201,7 +201,7 @@ namespace Eval::NNUE {
 
             batch_size_ = static_cast<IndexType>(batch.size());
 
-            const auto input = shared_input_trainer_->Propagate(batch);
+            const auto input = shared_input_trainer_->propagate(batch);
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType input_offset = kInputDimensions * b;
                 const IndexType output_offset = kOutputDimensions * b;
@@ -219,7 +219,7 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void Backpropagate(const LearnFloatType* gradients,
+        void backpropagate(const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
             for (IndexType b = 0; b < batch_size_; ++b) {
@@ -233,14 +233,14 @@ namespace Eval::NNUE {
                     }
                 }
             }
-            shared_input_trainer_->Backpropagate(gradients_.data(), learning_rate);
+            shared_input_trainer_->backpropagate(gradients_.data(), learning_rate);
         }
 
     private:
         // constructor
         Trainer(FeatureTransformer* ft):
             batch_size_(0),
-            shared_input_trainer_(SharedInputTrainer::Create(ft)) {
+            shared_input_trainer_(SharedInputTrainer::create(ft)) {
         }
 
         // number of input/output dimensions
diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h
index 24fc6152..c2e40b1c 100644
--- a/src/nnue/trainer/trainer_sum.h
+++ b/src/nnue/trainer/trainer_sum.h
@@ -21,7 +21,7 @@ namespace Eval::NNUE {
 
     public:
         // factory function
-        static std::shared_ptr<Trainer> Create(
+        static std::shared_ptr<Trainer> create(
             LayerType* target_layer, FeatureTransformer* ft) {
 
             return std::shared_ptr<Trainer>(
@@ -29,26 +29,26 @@ namespace Eval::NNUE {
         }
 
         // Set options such as hyperparameters
-        void SendMessage(Message* message) {
+        void send_message(Message* message) {
             // The results of other member functions do not depend on the processing order, so
             // Tail is processed first for the purpose of simplifying the implementation, but
             // SendMessage processes Head first to make it easier to understand subscript correspondence
-            previous_layer_trainer_->SendMessage(message);
-            Tail::SendMessage(message);
+            previous_layer_trainer_->send_message(message);
+            Tail::send_message(message);
         }
 
         // Initialize the parameters with random numbers
         template <typename RNG>
-        void Initialize(RNG& rng) {
-            Tail::Initialize(rng);
-            previous_layer_trainer_->Initialize(rng);
+        void initialize(RNG& rng) {
+            Tail::initialize(rng);
+            previous_layer_trainer_->initialize(rng);
         }
 
         // forward propagation
-        /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
+        /*const*/ LearnFloatType* propagate(const std::vector<Example>& batch) {
             batch_size_ = static_cast<IndexType>(batch.size());
-            auto output = Tail::Propagate(batch);
-            const auto head_output = previous_layer_trainer_->Propagate(batch);
+            auto output = Tail::propagate(batch);
+            const auto head_output = previous_layer_trainer_->propagate(batch);
 
 #if defined(USE_BLAS)
             cblas_saxpy(kOutputDimensions * batch_size_, 1.0,
@@ -66,11 +66,11 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void Backpropagate(const LearnFloatType* gradients,
+        void backpropagate(const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
-            Tail::Backpropagate(gradients, learning_rate);
-            previous_layer_trainer_->Backpropagate(gradients, learning_rate);
+            Tail::backpropagate(gradients, learning_rate);
+            previous_layer_trainer_->backpropagate(gradients, learning_rate);
         }
 
     private:
@@ -78,7 +78,7 @@ namespace Eval::NNUE {
         Trainer(LayerType* target_layer, FeatureTransformer* ft):
             Tail(target_layer, ft),
             batch_size_(0),
-            previous_layer_trainer_(Trainer<FirstPreviousLayer>::Create(
+            previous_layer_trainer_(Trainer<FirstPreviousLayer>::create(
                 &target_layer->previous_layer_, ft)),
             target_layer_(target_layer) {
         }
@@ -110,7 +110,7 @@ namespace Eval::NNUE {
 
     public:
         // factory function
-        static std::shared_ptr<Trainer> Create(
+        static std::shared_ptr<Trainer> create(
             LayerType* target_layer, FeatureTransformer* ft) {
 
             return std::shared_ptr<Trainer>(
@@ -118,24 +118,24 @@ namespace Eval::NNUE {
         }
 
         // Set options such as hyperparameters
-        void SendMessage(Message* message) {
-            previous_layer_trainer_->SendMessage(message);
+        void send_message(Message* message) {
+            previous_layer_trainer_->send_message(message);
         }
 
         // Initialize the parameters with random numbers
         template <typename RNG>
-        void Initialize(RNG& rng) {
-            previous_layer_trainer_->Initialize(rng);
+        void initialize(RNG& rng) {
+            previous_layer_trainer_->initialize(rng);
         }
 
         // forward propagation
-        /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
+        /*const*/ LearnFloatType* propagate(const std::vector<Example>& batch) {
             if (output_.size() < kOutputDimensions * batch.size()) {
                 output_.resize(kOutputDimensions * batch.size());
             }
 
             batch_size_ = static_cast<IndexType>(batch.size());
-            const auto output = previous_layer_trainer_->Propagate(batch);
+            const auto output = previous_layer_trainer_->propagate(batch);
 
 #if defined(USE_BLAS)
             cblas_scopy(kOutputDimensions * batch_size_, output, 1, &output_[0], 1);
@@ -152,17 +152,17 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void Backpropagate(const LearnFloatType* gradients,
+        void backpropagate(const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
-            previous_layer_trainer_->Backpropagate(gradients, learning_rate);
+            previous_layer_trainer_->backpropagate(gradients, learning_rate);
         }
 
     private:
         // constructor
         Trainer(LayerType* target_layer, FeatureTransformer* ft) :
             batch_size_(0),
-            previous_layer_trainer_(Trainer<PreviousLayer>::Create(
+            previous_layer_trainer_(Trainer<PreviousLayer>::create(
                 &target_layer->previous_layer_, ft)),
             target_layer_(target_layer) {
         }
diff --git a/src/uci.cpp b/src/uci.cpp
index 896f6db8..b5a0524c 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -53,7 +53,7 @@ void test_cmd(Position& pos, istringstream& is)
     std::string param;
     is >> param;
 
-    if (param == "nnue") Eval::NNUE::TestCommand(pos, is);
+    if (param == "nnue") Eval::NNUE::test_command(pos, is);
 }
 
 namespace {

From 5188c26b2081740fc668aced2a544822a3ce479b Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 17 Oct 2020 23:26:29 +0200
Subject: [PATCH 220/398] Allow execution of tasks on the global thread pool.

---
 src/thread.cpp | 27 +++++++++++++++++++++++++--
 src/thread.h   |  6 ++++++
 src/uci.cpp    |  6 ++++++
 3 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/src/thread.cpp b/src/thread.cpp
index c81ac43d..e4226769 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -80,6 +80,13 @@ void Thread::start_searching() {
   cv.notify_one(); // Wake up the thread in idle_loop()
 }
 
+void Thread::execute_task(std::function<void(Thread&)> t)
+{
+  std::lock_guard<std::mutex> lk(mutex);
+  task = std::move(t);
+  cv.notify_one(); // Wake up the thread in idle_loop()
+}
+
 
 /// Thread::wait_for_search_finished() blocks on the condition variable
 /// until the thread has finished searching.
@@ -109,14 +116,22 @@ void Thread::idle_loop() {
       std::unique_lock<std::mutex> lk(mutex);
       searching = false;
       cv.notify_one(); // Wake up anyone waiting for search finished
-      cv.wait(lk, [&]{ return searching; });
+      cv.wait(lk, [&]{ return searching || task; });
 
       if (exit)
           return;
 
       lk.unlock();
 
-      search();
+      if (task)
+      {
+        task(*this);
+        task = nullptr;
+      }
+      else
+      {
+        search();
+      }
   }
 }
 
@@ -162,6 +177,14 @@ void ThreadPool::clear() {
 }
 
 
+void ThreadPool::execute_parallel(std::function<void(Thread&)> task)
+{
+  for(Thread* th : *this)
+  {
+    th->execute_task(task);
+  }
+}
+
 /// ThreadPool::start_thinking() wakes up main thread waiting in idle_loop() and
 /// returns immediately. Main thread will wake up other threads and start the search.
 
diff --git a/src/thread.h b/src/thread.h
index 501a6042..8e9e6fba 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -24,6 +24,7 @@
 #include <mutex>
 #include <thread>
 #include <vector>
+#include <functional>
 
 #include "material.h"
 #include "movepick.h"
@@ -50,10 +51,12 @@ public:
   explicit Thread(size_t);
   virtual ~Thread();
   virtual void search();
+  virtual void execute_task(std::function<void(Thread&)> t);
   void clear();
   void idle_loop();
   void start_searching();
   void wait_for_search_finished();
+  size_t thread_idx() const { return idx; }
 
   Pawns::Table pawnsTable;
   Material::Table materialTable;
@@ -78,6 +81,7 @@ public:
   bool UseRule50;
   Depth ProbeDepth;
 
+  std::function<void(Thread&)> task;
 };
 
 
@@ -105,6 +109,8 @@ struct MainThread : public Thread {
 
 struct ThreadPool : public std::vector<Thread*> {
 
+  void execute_parallel(std::function<void(Thread&)> task);
+
   void start_thinking(Position&, StateListPtr&, const Search::LimitsType&, bool = false);
   void clear();
   void set(size_t);
diff --git a/src/uci.cpp b/src/uci.cpp
index b5a0524c..1aa9f95e 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -345,6 +345,12 @@ void UCI::loop(int argc, char* argv[]) {
       // Command to call qsearch(),search() directly for testing
       else if (token == "qsearch") qsearch_cmd(pos);
       else if (token == "search") search_cmd(pos, is);
+      else if (token == "tasktest")
+      {
+        Threads.execute_parallel([](auto& th) {
+          std::cout << th.thread_idx() << '\n';
+        });
+      }
 
       // test command
       else if (token == "test") test_cmd(pos, is);

From 97fb9a89e46f485c64c55d585981c46f032c81d0 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 18 Oct 2020 10:34:48 +0200
Subject: [PATCH 221/398] allow waiting for task completion.

---
 src/thread.cpp | 13 +++++++++++++
 src/thread.h   |  2 ++
 2 files changed, 15 insertions(+)

diff --git a/src/thread.cpp b/src/thread.cpp
index e4226769..874b09ee 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -98,6 +98,12 @@ void Thread::wait_for_search_finished() {
 }
 
 
+void Thread::wait_for_task_finished() {
+
+  std::unique_lock<std::mutex> lk(mutex);
+  cv.wait(lk, [&]{ return !task; });
+}
+
 /// Thread::idle_loop() is where the thread is parked, blocked on the
 /// condition variable, when it has no work to do.
 
@@ -293,3 +299,10 @@ void ThreadPool::wait_for_search_finished() const {
         if (th != front())
             th->wait_for_search_finished();
 }
+
+
+void ThreadPool::wait_for_tasks_finished() const {
+
+    for (Thread* th : *this)
+        th->wait_for_task_finished();
+}
diff --git a/src/thread.h b/src/thread.h
index 8e9e6fba..8be6eb5a 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -56,6 +56,7 @@ public:
   void idle_loop();
   void start_searching();
   void wait_for_search_finished();
+  void wait_for_task_finished();
   size_t thread_idx() const { return idx; }
 
   Pawns::Table pawnsTable;
@@ -121,6 +122,7 @@ struct ThreadPool : public std::vector<Thread*> {
   Thread* get_best_thread() const;
   void start_searching();
   void wait_for_search_finished() const;
+  void wait_for_tasks_finished() const;
 
   std::atomic_bool stop, increaseDepth;
 

From fd229c0768d80e7a71353e044556bbf74dd5c145 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 18 Oct 2020 22:35:16 +0200
Subject: [PATCH 222/398] Fix races and UBs

---
 src/thread.cpp | 28 ++++++++++++++++------------
 src/thread.h   | 11 +++++------
 2 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/src/thread.cpp b/src/thread.cpp
index 874b09ee..2ecd167a 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -35,6 +35,7 @@ ThreadPool Threads; // Global object
 Thread::Thread(size_t n) : idx(n), stdThread(&Thread::idle_loop, this) {
 
   wait_for_search_finished();
+  wait_for_worker_finished();
 }
 
 
@@ -80,10 +81,11 @@ void Thread::start_searching() {
   cv.notify_one(); // Wake up the thread in idle_loop()
 }
 
-void Thread::execute_task(std::function<void(Thread&)> t)
+void Thread::execute_with_worker(std::function<void(Thread&)> t)
 {
   std::lock_guard<std::mutex> lk(mutex);
-  task = std::move(t);
+  worker = std::move(t);
+  searching = true;
   cv.notify_one(); // Wake up the thread in idle_loop()
 }
 
@@ -98,10 +100,10 @@ void Thread::wait_for_search_finished() {
 }
 
 
-void Thread::wait_for_task_finished() {
+void Thread::wait_for_worker_finished() {
 
   std::unique_lock<std::mutex> lk(mutex);
-  cv.wait(lk, [&]{ return !task; });
+  cv.wait(lk, [&]{ return !searching; });
 }
 
 /// Thread::idle_loop() is where the thread is parked, blocked on the
@@ -121,18 +123,20 @@ void Thread::idle_loop() {
   {
       std::unique_lock<std::mutex> lk(mutex);
       searching = false;
+      worker = nullptr;
       cv.notify_one(); // Wake up anyone waiting for search finished
-      cv.wait(lk, [&]{ return searching || task; });
+      cv.wait(lk, [&]{ return searching; });
 
       if (exit)
           return;
 
+      auto wrk = std::move(worker);
+
       lk.unlock();
 
-      if (task)
+      if (wrk)
       {
-        task(*this);
-        task = nullptr;
+        wrk(*this);
       }
       else
       {
@@ -183,11 +187,11 @@ void ThreadPool::clear() {
 }
 
 
-void ThreadPool::execute_parallel(std::function<void(Thread&)> task)
+void ThreadPool::execute_with_workers(std::function<void(Thread&)> worker)
 {
   for(Thread* th : *this)
   {
-    th->execute_task(task);
+    th->execute_with_worker(std::move(worker));
   }
 }
 
@@ -301,8 +305,8 @@ void ThreadPool::wait_for_search_finished() const {
 }
 
 
-void ThreadPool::wait_for_tasks_finished() const {
+void ThreadPool::wait_for_workers_finished() const {
 
     for (Thread* th : *this)
-        th->wait_for_task_finished();
+        th->wait_for_worker_finished();
 }
diff --git a/src/thread.h b/src/thread.h
index 8be6eb5a..7474ea44 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -45,18 +45,19 @@ class Thread {
   std::condition_variable cv;
   size_t idx;
   bool exit = false, searching = true; // Set before starting std::thread
+  std::function<void(Thread&)> worker;
   NativeThread stdThread;
 
 public:
   explicit Thread(size_t);
   virtual ~Thread();
   virtual void search();
-  virtual void execute_task(std::function<void(Thread&)> t);
+  virtual void execute_with_worker(std::function<void(Thread&)> t);
   void clear();
   void idle_loop();
   void start_searching();
   void wait_for_search_finished();
-  void wait_for_task_finished();
+  void wait_for_worker_finished();
   size_t thread_idx() const { return idx; }
 
   Pawns::Table pawnsTable;
@@ -81,8 +82,6 @@ public:
   int Cardinality;
   bool UseRule50;
   Depth ProbeDepth;
-
-  std::function<void(Thread&)> task;
 };
 
 
@@ -110,7 +109,7 @@ struct MainThread : public Thread {
 
 struct ThreadPool : public std::vector<Thread*> {
 
-  void execute_parallel(std::function<void(Thread&)> task);
+  void execute_with_workers(std::function<void(Thread&)> worker);
 
   void start_thinking(Position&, StateListPtr&, const Search::LimitsType&, bool = false);
   void clear();
@@ -122,7 +121,7 @@ struct ThreadPool : public std::vector<Thread*> {
   Thread* get_best_thread() const;
   void start_searching();
   void wait_for_search_finished() const;
-  void wait_for_tasks_finished() const;
+  void wait_for_workers_finished() const;
 
   std::atomic_bool stop, increaseDepth;
 

From 71862e2ebbf91527e0ca18ae44757425797b1f9e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 19 Oct 2020 13:27:39 +0200
Subject: [PATCH 223/398] remove incorrect move in execute_with_workers

---
 src/thread.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/thread.cpp b/src/thread.cpp
index 2ecd167a..72333078 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -191,7 +191,7 @@ void ThreadPool::execute_with_workers(std::function<void(Thread&)> worker)
 {
   for(Thread* th : *this)
   {
-    th->execute_with_worker(std::move(worker));
+    th->execute_with_worker(worker);
   }
 }
 

From 74af28763718258f250500dfd19b5d68c12339b8 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 19 Oct 2020 15:27:33 +0200
Subject: [PATCH 224/398] Fix execute_with_workers test call in uci

---
 src/uci.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/uci.cpp b/src/uci.cpp
index 1aa9f95e..b05c7eeb 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -347,7 +347,7 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "search") search_cmd(pos, is);
       else if (token == "tasktest")
       {
-        Threads.execute_parallel([](auto& th) {
+        Threads.execute_with_workers([](auto& th) {
           std::cout << th.thread_idx() << '\n';
         });
       }

From f2ad307de313d18c56b147f8a682971cd8ca088a Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 20 Oct 2020 10:50:59 +0200
Subject: [PATCH 225/398] Clarify the behaviour of execute_with_worker[s]

---
 src/thread.cpp |  3 +--
 src/thread.h   | 11 ++++++++++-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/thread.cpp b/src/thread.cpp
index 72333078..e867048d 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -186,8 +186,7 @@ void ThreadPool::clear() {
   main()->previousTimeReduction = 1.0;
 }
 
-
-void ThreadPool::execute_with_workers(std::function<void(Thread&)> worker)
+void ThreadPool::execute_with_workers(const std::function<void(Thread&)>& worker)
 {
   for(Thread* th : *this)
   {
diff --git a/src/thread.h b/src/thread.h
index 7474ea44..c0a01770 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -52,7 +52,13 @@ public:
   explicit Thread(size_t);
   virtual ~Thread();
   virtual void search();
+
+  // The function object to be executed is taken by value to remove
+  // the need for separate lvalue and rvalue overloads.
+  // The worker thread needs to have ownership of the task
+  // to be executed because otherwise there's no way to manage its lifetime.
   virtual void execute_with_worker(std::function<void(Thread&)> t);
+
   void clear();
   void idle_loop();
   void start_searching();
@@ -109,7 +115,10 @@ struct MainThread : public Thread {
 
 struct ThreadPool : public std::vector<Thread*> {
 
-  void execute_with_workers(std::function<void(Thread&)> worker);
+  // Each thread gets its own copy of the `worker` function object.
+  // This means that each worker thread will have exclusive access
+  // to the state of the `worker` function object.
+  void execute_with_workers(const std::function<void(Thread&)>& worker);
 
   void start_thinking(Position&, StateListPtr&, const Search::LimitsType&, bool = false);
   void clear();

From ff06d1e0ad571a5d6f12de6b1b0f7b0a354d05d8 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 18 Oct 2020 11:23:58 +0200
Subject: [PATCH 226/398] Rewrite learner to be based on stockfish's thread
 pool. Reduce coupling along the way

---
 src/learn/learn.cpp | 1013 ++++++++++++++++++++-----------------------
 src/misc.h          |   12 +
 2 files changed, 472 insertions(+), 553 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index dfbba391..411e0016 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -20,7 +20,6 @@
 #include "learn.h"
 
 #include "convert.h"
-#include "multi_think.h"
 #include "sfen_stream.h"
 
 #include "misc.h"
@@ -95,6 +94,68 @@ namespace Learner
     // Using stockfish's WDL with win rate model instead of sigmoid
     static bool use_wdl = false;
 
+    namespace Detail {
+        template <bool AtomicV>
+        struct Loss
+        {
+            using T =
+                std::conditional_t<
+                    AtomicV,
+                    atomic<double>,
+                    double
+                >;
+
+            T cross_entropy_eval{0.0};
+            T cross_entropy_win{0.0};
+            T cross_entropy{0.0};
+            T entropy_eval{0.0};
+            T entropy_win{0.0};
+            T entropy{0.0};
+            T count{0.0};
+
+            template <bool OtherAtomicV>
+            Loss& operator += (const Loss<OtherAtomicV>& rhs)
+            {
+                cross_entropy_eval += rhs.cross_entropy_eval;
+                cross_entropy_win += rhs.cross_entropy_win;
+                cross_entropy += rhs.cross_entropy;
+                entropy_eval += rhs.entropy_eval;
+                entropy_win += rhs.entropy_win;
+                entropy += rhs.entropy;
+                count += rhs.count;
+
+                return *this;
+            }
+
+            void reset()
+            {
+                cross_entropy_eval = 0.0;
+                cross_entropy_win = 0.0;
+                cross_entropy = 0.0;
+                entropy_eval = 0.0;
+                entropy_win = 0.0;
+                entropy = 0.0;
+                count = 0.0;
+            }
+
+            void print(const std::string& prefix, ostream& s) const
+            {
+                s
+                    << "INFO: "
+                    << prefix << "_cross_entropy_eval = " << cross_entropy_eval / count
+                    << " , " << prefix << "_cross_entropy_win = " << cross_entropy_win / count
+                    << " , " << prefix << "_entropy_eval = " << entropy_eval / count
+                    << " , " << prefix << "_entropy_win = " << entropy_win / count
+                    << " , " << prefix << "_cross_entropy = " << cross_entropy / count
+                    << " , " << prefix << "_entropy = " << entropy / count
+                    << endl;
+            }
+        };
+    }
+
+    using Loss = Detail::Loss<false>;
+    using AtomicLoss = Detail::Loss<true>;
+
     // A function that converts the evaluation value to the winning rate [0,1]
     double winning_percentage(double value)
     {
@@ -243,16 +304,10 @@ namespace Learner
     // The individual cross entropy of the win/loss term and win
     // rate term of the elmo expression is returned
     // to the arguments cross_entropy_eval and cross_entropy_win.
-    void calc_cross_entropy(
+    Loss calc_cross_entropy(
         Value teacher_signal,
         Value shallow,
-        const PackedSfenValue& psv,
-        double& cross_entropy_eval,
-        double& cross_entropy_win,
-        double& cross_entropy,
-        double& entropy_eval,
-        double& entropy_win,
-        double& entropy)
+        const PackedSfenValue& psv)
     {
         // Teacher winning probability.
         const double q = winning_percentage(shallow, psv.gamePly);
@@ -264,19 +319,25 @@ namespace Learner
 
         const double m = (1.0 - lambda) * t + lambda * p;
 
-        cross_entropy_eval =
+        Loss loss{};
+
+        loss.cross_entropy_eval =
             (-p * std::log(q + epsilon) - (1.0 - p) * std::log(1.0 - q + epsilon));
-        cross_entropy_win =
+        loss.cross_entropy_win =
             (-t * std::log(q + epsilon) - (1.0 - t) * std::log(1.0 - q + epsilon));
-        entropy_eval =
+        loss.entropy_eval =
             (-p * std::log(p + epsilon) - (1.0 - p) * std::log(1.0 - p + epsilon));
-        entropy_win =
+        loss.entropy_win =
             (-t * std::log(t + epsilon) - (1.0 - t) * std::log(1.0 - t + epsilon));
 
-        cross_entropy =
+        loss.cross_entropy =
             (-m * std::log(q + epsilon) - (1.0 - m) * std::log(1.0 - q + epsilon));
-        entropy =
+        loss.entropy =
             (-m * std::log(m + epsilon) - (1.0 - m) * std::log(1.0 - m + epsilon));
+
+        loss.count = 1;
+
+        return loss;
     }
 
     // Other objective functions may be considered in the future...
@@ -288,12 +349,6 @@ namespace Learner
     // Sfen reader
     struct SfenReader
     {
-        // Number of phases used for calculation such as mse
-        // mini-batch size = 1M is standard, so 0.2% of that should be negligible in terms of time.
-        // Since search() is performed with depth = 1 in calculation of
-        // move match rate, simple comparison is not possible...
-        static constexpr uint64_t sfen_for_mse_size = 2000;
-
         // Number of phases buffered by each thread 0.1M phases. 4M phase at 40HT
         static constexpr size_t THREAD_BUFFER_SIZE = 10 * 1000;
 
@@ -303,11 +358,6 @@ namespace Learner
         // SFEN_READ_SIZE is a multiple of THREAD_BUFFER_SIZE.
         static constexpr const size_t SFEN_READ_SIZE = LEARN_SFEN_READ_SIZE;
 
-        // hash to limit the reading of the same situation
-        // Is there too many 64 million phases? Or Not really..
-        // It must be 2**N because it will be used as the mask to calculate hash_index.
-        static constexpr uint64_t READ_SFEN_HASH_SIZE = 64 * 1024 * 1024;
-
         // Do not use std::random_device().
         // Because it always the same integers on MinGW.
         SfenReader(int thread_num, const std::string& seed) :
@@ -315,15 +365,9 @@ namespace Learner
         {
             packed_sfens.resize(thread_num);
             total_read = 0;
-            total_done = 0;
-            last_done = 0;
-            next_update_weights = 0;
-            save_count = 0;
             end_of_files = false;
-            no_shuffle = false;
+            shuffle = true;
             stop_flag = false;
-
-            hash.resize(READ_SFEN_HASH_SIZE);
         }
 
         ~SfenReader()
@@ -333,30 +377,30 @@ namespace Learner
         }
 
         // Load the phase for calculation such as mse.
-        void read_for_mse()
+        PSVector read_for_mse(uint64_t count)
         {
-            auto th = Threads.main();
-            Position& pos = th->rootPos;
-            for (uint64_t i = 0; i < sfen_for_mse_size; ++i)
+            PSVector sfen_for_mse;
+            sfen_for_mse.reserve(count);
+
+            for (uint64_t i = 0; i < count; ++i)
             {
                 PackedSfenValue ps;
                 if (!read_to_thread_buffer(0, ps))
                 {
                     cout << "Error! read packed sfen , failed." << endl;
-                    break;
+                    return sfen_for_mse;
                 }
 
                 sfen_for_mse.push_back(ps);
-
-                // Get the hash key.
-                StateInfo si;
-                pos.set_from_packed_sfen(ps.sfen, &si, th);
-                sfen_for_mse_hash.insert(pos.key());
             }
+
+            return sfen_for_mse;
         }
 
-        void read_validation_set(const string& file_name, int eval_limit)
+        PSVector read_validation_set(const string& file_name, int eval_limit)
         {
+            PSVector sfen_for_mse;
+
             auto input = open_sfen_input_file(file_name);
 
             while(!input->eof())
@@ -379,6 +423,8 @@ namespace Learner
                     break;
                 }
             }
+
+            return sfen_for_mse;
         }
 
         // [ASYNC] Thread returns one aspect. Otherwise returns false.
@@ -465,8 +511,8 @@ namespace Learner
                         return false;
 
                     // Get the next file name.
-                    string filename = filenames.back();
-                    filenames.pop_back();
+                    string filename = filenames.front();
+                    filenames.pop_front();
 
                     sfen_input_stream = open_sfen_input_file(filename);
                     cout << "open filename = " << filename << endl;
@@ -515,7 +561,7 @@ namespace Learner
                 }
 
                 // Shuffle the read phase data.
-                if (!no_shuffle)
+                if (shuffle)
                 {
                     Algo::shuffle(sfens, prng);
                 }
@@ -553,45 +599,37 @@ namespace Learner
             }
         }
 
-        // Determine if it is a phase for calculating rmse.
-        // (The computational aspects of rmse should not be used for learning.)
-        bool is_for_rmse(Key key) const
+        void stop()
         {
-            return sfen_for_mse_hash.count(key) != 0;
+            stop_flag = true;
         }
 
-        // sfen files
-        vector<string> filenames;
+        void set_do_shuffle(bool v)
+        {
+            shuffle = v;
+        }
 
-        // number of phases read (file to memory buffer)
-        atomic<uint64_t> total_read;
-
-        // number of processed phases
-        atomic<uint64_t> total_done;
-
-        // number of cases processed so far
-        uint64_t last_done;
-
-        // If total_read exceeds this value, update_weights() and calculate mse.
-        std::atomic<uint64_t> next_update_weights;
-
-        uint64_t save_count;
-
-        // Do not shuffle when reading the phase.
-        bool no_shuffle;
-
-        std::atomic<bool> stop_flag;
-
-        vector<Key> hash;
-
-        // test phase for mse calculation
-        PSVector sfen_for_mse;
+        void add_file(const std::string& filename)
+        {
+            filenames.push_back(filename);
+        }
 
     protected:
 
         // worker thread reading file in background
         std::thread file_worker_thread;
 
+        // sfen files
+        deque<string> filenames;
+
+        std::atomic<bool> stop_flag;
+
+        // number of phases read (file to memory buffer)
+        atomic<uint64_t> total_read;
+
+        // Do not shuffle when reading the phase.
+        bool shuffle;
+
         // Random number to shuffle when reading the phase
         PRNG prng;
 
@@ -612,27 +650,25 @@ namespace Learner
         // Each worker thread fills its own packed_sfens[thread_id] from here.
         // * Lock and access the mutex.
         std::list<std::unique_ptr<PSVector>> packed_sfens_pool;
-
-        // Hold the hash key so that the mse calculation phase is not used for learning.
-        std::unordered_set<Key> sfen_for_mse_hash;
     };
 
     // Class to generate sfen with multiple threads
-    struct LearnerThink : public MultiThink
+    struct LearnerThink
     {
-        LearnerThink(SfenReader& sr_, const std::string& seed) :
-            MultiThink(seed),
-            sr(sr_),
-            stop_flag(false),
-            save_only_once(false)
-        {
-            learn_sum_cross_entropy_eval = 0.0;
-            learn_sum_cross_entropy_win = 0.0;
-            learn_sum_cross_entropy = 0.0;
-            learn_sum_entropy_eval = 0.0;
-            learn_sum_entropy_win = 0.0;
-            learn_sum_entropy = 0.0;
+        // Number of phases used for calculation such as mse
+        // mini-batch size = 1M is standard, so 0.2% of that should be negligible in terms of time.
+        // Since search() is performed with depth = 1 in calculation of
+        // move match rate, simple comparison is not possible...
+        static constexpr uint64_t sfen_for_mse_size = 2000;
 
+        LearnerThink(uint64_t thread_num, const std::string& seed) :
+            prng(seed),
+            sr(thread_num, std::to_string(prng.next_random_seed())),
+            learn_loss_sum{}
+        {
+            save_only_once = false;
+            save_count = 0;
+            loss_output_count = 0;
             newbob_decay = 1.0;
             newbob_num_trials = 2;
             auto_lr_drop = 0;
@@ -640,32 +676,27 @@ namespace Learner
             best_loss = std::numeric_limits<double>::infinity();
             latest_loss_sum = 0.0;
             latest_loss_count = 0;
+            total_done = 0;
         }
 
-        virtual void thread_worker(size_t thread_id);
-
-        // Start a thread that loads the phase file in the background.
-        void start_file_read_worker()
+        void set_do_shuffle(bool v)
         {
-            sr.start_file_read_worker();
+            sr.set_do_shuffle(v);
         }
 
-        Value get_shallow_value(Position& task_pos);
+        void add_file(const std::string& filename)
+        {
+            sr.add_file(filename);
+        }
 
-        // save merit function parameters to a file
-        bool save(bool is_final = false);
+        void learn();
 
-        // sfen reader
-        SfenReader& sr;
 
-        // Learning iteration counter
-        uint64_t epoch = 0;
+        std::string validation_set_file_name;
 
         // Mini batch size size. Be sure to set it on the side that uses this class.
         uint64_t mini_batch_size = LEARN_MINI_BATCH_SIZE;
 
-        std::atomic<bool> stop_flag;
-
         // Option to exclude early stage from learning
         int reduction_gameply;
 
@@ -677,342 +708,143 @@ namespace Learner
         // If true, do not dig the folder.
         bool save_only_once;
 
-        // --- loss calculation
-
-        // For calculation of learning data loss
-        atomic<double> learn_sum_cross_entropy_eval;
-        atomic<double> learn_sum_cross_entropy_win;
-        atomic<double> learn_sum_cross_entropy;
-        atomic<double> learn_sum_entropy_eval;
-        atomic<double> learn_sum_entropy_win;
-        atomic<double> learn_sum_entropy;
-
-        shared_timed_mutex nn_mutex;
         double newbob_decay;
         int newbob_num_trials;
         uint64_t auto_lr_drop;
-        uint64_t last_lr_drop;
-        double best_loss;
-        double latest_loss_sum;
-        uint64_t latest_loss_count;
+
         std::string best_nn_directory;
 
         uint64_t eval_save_interval;
         uint64_t loss_output_interval;
 
-        // Loss calculation.
-        // done: Number of phases targeted this time
-        void calc_loss(size_t thread_id, uint64_t done);
+    private:
+        void learn_worker(Thread& th, std::atomic<uint64_t>& counter, uint64_t limit);
 
-        // Define the loss calculation in ↑ as a task and execute it
-        TaskDispatcher task_dispatcher;
+        void update_weights(const PSVector& psv);
+
+        void calc_loss(const PSVector& psv);
+
+        void calc_loss_worker(
+            Thread& th,
+            std::atomic<uint64_t>& counter,
+            const PSVector& psv,
+            AtomicLoss& test_loss_sum,
+            atomic<double>& sum_norm,
+            atomic<int>& move_accord_count
+        );
+
+        Value get_shallow_value(Position& pos);
+
+        // save merit function parameters to a file
+        bool save(bool is_final = false);
+
+        PRNG prng;
+
+        // sfen reader
+        SfenReader sr;
+
+        uint64_t save_count;
+        uint64_t loss_output_count;
+
+        // Learning iteration counter
+        uint64_t epoch = 0;
+
+        std::atomic<bool> stop_flag;
+
+        uint64_t total_done;
+
+        uint64_t last_lr_drop;
+        double best_loss;
+        double latest_loss_sum;
+        uint64_t latest_loss_count;
+
+        // For calculation of learning data loss
+        AtomicLoss learn_loss_sum;
     };
 
-    Value LearnerThink::get_shallow_value(Position& task_pos)
+    void LearnerThink::learn()
     {
-        // Evaluation value for shallow search
-        // The value of evaluate() may be used, but when calculating loss, learn_cross_entropy and
-        // Use qsearch() because it is difficult to compare the values.
-        // EvalHash has been disabled in advance. (If not, the same value will be returned every time)
-        const auto [_, pv] = Search::qsearch(task_pos);
 
-        const auto rootColor = task_pos.side_to_move();
-
-        std::vector<StateInfo, AlignedAllocator<StateInfo>> states(pv.size());
-        for (size_t i = 0; i < pv.size(); ++i)
-        {
-            task_pos.do_move(pv[i], states[i]);
-        }
-
-        const Value shallow_value =
-            (rootColor == task_pos.side_to_move())
-            ? Eval::evaluate(task_pos)
-            : -Eval::evaluate(task_pos);
-
-        for (auto it = pv.rbegin(); it != pv.rend(); ++it)
-            task_pos.undo_move(*it);
-
-        return shallow_value;
-    }
-
-    void LearnerThink::calc_loss(size_t thread_id, uint64_t done)
-    {
-        // There is no point in hitting the replacement table,
-        // so at this timing the generation of the replacement table is updated.
-        // It doesn't matter if you have disabled the substitution table.
-        TT.new_search();
-        TimePoint elapsed = now() - Search::Limits.startTime + 1;
-
-        cout << "PROGRESS: " << now_string() << ", ";
-        cout << sr.total_done << " sfens, ";
-        cout << sr.total_done * 1000 / elapsed  << " sfens/second";
-        cout << ", iteration " << epoch;
-        cout << ", learning rate = " << global_learning_rate << ", ";
-
-        // For calculation of verification data loss
-        atomic<double> test_sum_cross_entropy_eval, test_sum_cross_entropy_win, test_sum_cross_entropy;
-        atomic<double> test_sum_entropy_eval, test_sum_entropy_win, test_sum_entropy;
-        test_sum_cross_entropy_eval = 0;
-        test_sum_cross_entropy_win = 0;
-        test_sum_cross_entropy = 0;
-        test_sum_entropy_eval = 0;
-        test_sum_entropy_win = 0;
-        test_sum_entropy = 0;
-
-        // norm for learning
-        atomic<double> sum_norm;
-        sum_norm = 0;
-
-        // The number of times the pv first move of deep
-        // search matches the pv first move of search(1).
-        atomic<int> move_accord_count;
-        move_accord_count = 0;
-
-        auto th = Threads[thread_id];
-        auto& pos = th->rootPos;
-        StateInfo si;
-        pos.set(StartFEN, false, &si, th);
-        cout << "startpos eval = " << Eval::evaluate(pos) << endl;
-
-        // It's better to parallelize here, but it's a bit
-        // troublesome because the search before slave has not finished.
-        // I created a mechanism to call task, so I will use it.
-
-        // The number of tasks to do.
-        atomic<int> task_count;
-        task_count = (int)sr.sfen_for_mse.size();
-        task_dispatcher.task_reserve(task_count);
-
-        // Create a task to search for the situation and give it to each thread.
-        for (const auto& ps : sr.sfen_for_mse)
-        {
-            // Assign work to each thread using TaskDispatcher.
-            // A task definition for that.
-            // It is not possible to capture pos used in ↑,
-            // so specify the variables you want to capture one by one.
-            auto task =
-                [
-                    this,
-                    &ps,
-                    &test_sum_cross_entropy_eval,
-                    &test_sum_cross_entropy_win,
-                    &test_sum_cross_entropy,
-                    &test_sum_entropy_eval,
-                    &test_sum_entropy_win,
-                    &test_sum_entropy,
-                    &sum_norm,
-                    &task_count,
-                    &move_accord_count
-                ](size_t task_thread_id)
-            {
-                auto task_th = Threads[task_thread_id];
-                auto& task_pos = task_th->rootPos;
-                StateInfo task_si;
-                if (task_pos.set_from_packed_sfen(ps.sfen, &task_si, task_th) != 0)
-                {
-                    // Unfortunately, as an sfen for rmse calculation, an invalid sfen was drawn.
-                    cout << "Error! : illegal packed sfen " << task_pos.fen() << endl;
-                }
-
-                const Value shallow_value = get_shallow_value(task_pos);
-
-                // Evaluation value of deep search
-                auto deep_value = (Value)ps.score;
-
-                // Note) This code does not consider when
-                //       eval_limit is specified in the learn command.
-
-                // --- calculation of cross entropy
-
-                // For the time being, regarding the win rate and loss terms only in the elmo method
-                // Calculate and display the cross entropy.
-
-                double test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy;
-                double test_entropy_eval, test_entropy_win, test_entropy;
-                calc_cross_entropy(
-                    deep_value,
-                    shallow_value,
-                    ps,
-                    test_cross_entropy_eval,
-                    test_cross_entropy_win,
-                    test_cross_entropy,
-                    test_entropy_eval,
-                    test_entropy_win,
-                    test_entropy);
-
-                // The total cross entropy need not be abs() by definition.
-                test_sum_cross_entropy_eval += test_cross_entropy_eval;
-                test_sum_cross_entropy_win += test_cross_entropy_win;
-                test_sum_cross_entropy += test_cross_entropy;
-                test_sum_entropy_eval += test_entropy_eval;
-                test_sum_entropy_win += test_entropy_win;
-                test_sum_entropy += test_entropy;
-                sum_norm += (double)abs(shallow_value);
-
-                // Determine if the teacher's move and the score of the shallow search match
-                {
-                    const auto [value, pv] = Search::search(task_pos, 1);
-                    if ((uint16_t)pv[0] == ps.move)
-                        move_accord_count.fetch_add(1, std::memory_order_relaxed);
-                }
-
-                // Reduced one task because I did it
-                --task_count;
-            };
-
-            // Throw the defined task to slave.
-            task_dispatcher.push_task_async(task);
-        }
-
-        // join yourself as a slave
-        task_dispatcher.on_idle(thread_id);
-
-        // wait for all tasks to complete
-        while (task_count)
-            sleep(1);
-
-        latest_loss_sum += test_sum_cross_entropy - test_sum_entropy;
-        latest_loss_count += sr.sfen_for_mse.size();
-
-        // learn_cross_entropy may be called train cross
-        // entropy in the world of machine learning,
-        // When omitting the acronym, it is nice to be able to
-        // distinguish it from test cross entropy(tce) by writing it as lce.
-
-        if (sr.sfen_for_mse.size() && done)
-        {
-            cout << "INFO: "
-                << "test_cross_entropy_eval = " << test_sum_cross_entropy_eval / sr.sfen_for_mse.size()
-                << " , test_cross_entropy_win = " << test_sum_cross_entropy_win / sr.sfen_for_mse.size()
-                << " , test_entropy_eval = " << test_sum_entropy_eval / sr.sfen_for_mse.size()
-                << " , test_entropy_win = " << test_sum_entropy_win / sr.sfen_for_mse.size()
-                << " , test_cross_entropy = " << test_sum_cross_entropy / sr.sfen_for_mse.size()
-                << " , test_entropy = " << test_sum_entropy / sr.sfen_for_mse.size()
-                << " , norm = " << sum_norm
-                << " , move accuracy = " << (move_accord_count * 100.0 / sr.sfen_for_mse.size()) << "%"
-                << endl;
-
-            if (done != static_cast<uint64_t>(-1))
-            {
-                cout << "INFO: "
-                    << "learn_cross_entropy_eval = " << learn_sum_cross_entropy_eval / done
-                    << " , learn_cross_entropy_win = " << learn_sum_cross_entropy_win / done
-                    << " , learn_entropy_eval = " << learn_sum_entropy_eval / done
-                    << " , learn_entropy_win = " << learn_sum_entropy_win / done
-                    << " , learn_cross_entropy = " << learn_sum_cross_entropy / done
-                    << " , learn_entropy = " << learn_sum_entropy / done
-                    << endl;
-            }
-        }
-        else
-        {
-            cout << "Error! : sr.sfen_for_mse.size() = " << sr.sfen_for_mse.size() << " ,  done = " << done << endl;
-        }
-
-        // Clear 0 for next time.
-        learn_sum_cross_entropy_eval = 0.0;
-        learn_sum_cross_entropy_win = 0.0;
-        learn_sum_cross_entropy = 0.0;
-        learn_sum_entropy_eval = 0.0;
-        learn_sum_entropy_win = 0.0;
-        learn_sum_entropy = 0.0;
-    }
-
-    void LearnerThink::thread_worker(size_t thread_id)
-    {
 #if defined(_OPENMP)
         omp_set_num_threads((int)Options["Threads"]);
 #endif
 
-        auto th = Threads[thread_id];
-        auto& pos = th->rootPos;
+        Eval::NNUE::verify_any_net_loaded();
 
-        while (true)
+        // Start a thread that loads the training data in the background
+        sr.start_file_read_worker();
+
+        const PSVector sfen_for_mse =
+            validation_set_file_name.empty()
+            ? sr.read_for_mse(sfen_for_mse_size)
+            : sr.read_validation_set(validation_set_file_name, eval_limit);
+
+        if (validation_set_file_name.empty()
+            && sfen_for_mse.size() != sfen_for_mse_size)
         {
-            // display mse (this is sometimes done only for thread 0)
-            // Immediately after being read from the file...
+            cout
+                << "Error reading sfen_for_mse. Read " << sfen_for_mse.size()
+                << " out of " << sfen_for_mse_size << '\n';
 
-            // Lock the evaluation function so that it is not used during updating.
-            shared_lock<shared_timed_mutex> read_lock(nn_mutex, defer_lock);
-            if (sr.next_update_weights <= sr.total_done ||
-                (thread_id != 0 && !read_lock.try_lock()))
-            {
-                if (thread_id != 0)
-                {
-                    // Wait except thread_id == 0.
+            sr.stop();
 
-                    if (stop_flag)
-                        break;
+            return;
+        }
 
-                    // I want to parallelize rmse calculation etc., so if task() is loaded, process it.
-                    task_dispatcher.on_idle(thread_id);
-                    continue;
-                }
-                else
-                {
-                    // Only thread_id == 0 performs the following update process.
+        if (newbob_decay != 1.0) {
 
-                    // The weight array is not updated for the first time.
-                    if (sr.next_update_weights == 0)
-                    {
-                        sr.next_update_weights += mini_batch_size;
-                        continue;
-                    }
+            calc_loss(sfen_for_mse);
 
-                    {
-                        // update parameters
+            best_loss = latest_loss_sum / latest_loss_count;
+            latest_loss_sum = 0.0;
+            latest_loss_count = 0;
 
-                        // Lock the evaluation function so that it is not used during updating.
-                        lock_guard<shared_timed_mutex> write_lock(nn_mutex);
-                        Eval::NNUE::update_parameters();
-                    }
+            cout << "initial loss: " << best_loss << endl;
+        }
 
-                    ++epoch;
+        stop_flag = false;
 
-                    // However, the elapsed time during update_weights() and calc_rmse() is ignored.
-                    if (++sr.save_count * mini_batch_size >= eval_save_interval)
-                    {
-                        sr.save_count = 0;
+        for(;;)
+        {
+            std::atomic<uint64_t> counter{0};
 
-                        // During this time, as the gradient calculation proceeds,
-                        // the value becomes too large and I feel annoyed, so stop other threads.
-                        const bool converged = save();
-                        if (converged)
-                        {
-                            stop_flag = true;
-                            sr.stop_flag = true;
-                            break;
-                        }
-                    }
+            Threads.execute_with_workers([this, &counter](auto& th){
+                learn_worker(th, counter, mini_batch_size);
+            });
 
-                    // Calculate rmse. This is done for samples of 10,000 phases.
-                    // If you do with 40 cores, update_weights every 1 million phases
-                    static uint64_t loss_output_count = 0;
-                    if (++loss_output_count * mini_batch_size >= loss_output_interval)
-                    {
-                        loss_output_count = 0;
+            total_done += mini_batch_size;
 
-                        // Number of cases processed this time
-                        uint64_t done = sr.total_done - sr.last_done;
+            Threads.wait_for_workers_finished();
 
-                        // loss calculation
-                        calc_loss(thread_id, done);
+            if (stop_flag)
+                break;
 
-                        Eval::NNUE::check_health();
+            update_weights(sfen_for_mse);
 
-                        // Make a note of how far you have totaled.
-                        sr.last_done = sr.total_done;
-                    }
+            if (stop_flag)
+                break;
+        }
 
-                    // Next time, I want you to do this series of
-                    // processing again when you process only mini_batch_size.
-                    sr.next_update_weights += mini_batch_size;
+        sr.stop();
 
-                    // Since I was waiting for the update of this
-                    // sr.next_update_weights except the main thread,
-                    // Once this value is updated, it will start moving again.
-                }
-            }
+        Eval::NNUE::finalize_net();
+
+        save(true);
+    }
+
+    void LearnerThink::learn_worker(Thread& th, std::atomic<uint64_t>& counter, uint64_t limit)
+    {
+        const auto thread_id = th.thread_idx();
+        auto& pos = th.rootPos;
+
+        Loss local_loss_sum{};
+        std::vector<StateInfo, AlignedAllocator<StateInfo>> state(MAX_PLY);
+
+        while(!stop_flag)
+        {
+            const auto iter = counter.fetch_add(1);
+            if (iter >= limit)
+                break;
 
             PackedSfenValue ps;
 
@@ -1020,16 +852,12 @@ namespace Learner
 
             if (!sr.read_to_thread_buffer(thread_id, ps))
             {
-                // ran out of thread pool for my thread.
-                // Because there are almost no phases left,
-                // Terminate all other threads.
-
+                // If we ran out of data we stop completely
+                // because there's nothing left to do.
                 stop_flag = true;
                 break;
             }
 
-            // The evaluation value exceeds the learning target value.
-            // Ignore this aspect information.
             if (eval_limit < abs(ps.score))
                 goto RETRY_READ;
 
@@ -1041,123 +869,242 @@ namespace Learner
                 goto RETRY_READ;
 
             StateInfo si;
-            if (pos.set_from_packed_sfen(ps.sfen, &si, th) != 0)
+            if (pos.set_from_packed_sfen(ps.sfen, &si, &th) != 0)
             {
-                // I got a strange sfen. Should be debugged!
-                // Since it is an illegal sfen, it may not be
-                // displayed with pos.sfen(), but it is better than not.
+                // Malformed sfen
                 cout << "Error! : illigal packed sfen = " << pos.fen() << endl;
                 goto RETRY_READ;
             }
 
-            // I can read it, so try displaying it.
-            //      cout << pos << value << endl;
-
             const auto rootColor = pos.side_to_move();
 
-            int ply = 0;
-            StateInfo state[MAX_PLY]; // PV of qsearch cannot be so long.
-
-            if (!pos.pseudo_legal((Move)ps.move) || !pos.legal((Move)ps.move))
-            {
-                goto RETRY_READ;
-            }
-
-            pos.do_move((Move)ps.move, state[ply++]);
-
-            // There is a possibility that all the pieces are blocked and stuck.
-            // Also, the declaration win phase is excluded from
-            // learning because you cannot go to leaf with PV moves.
-            // (shouldn't write out such teacher aspect itself,
-            // but may have written it out with an old generation routine)
-            // Skip the position if there are no legal moves (=checkmated or stalemate).
-            if (MoveList<LEGAL>(pos).size() == 0)
-                goto RETRY_READ;
-
-            // Evaluation value of shallow search (qsearch)
-            const auto [_, pv] = Search::qsearch(pos);
-
-            // Evaluation value of deep search
-            const auto deep_value = (Value)ps.score;
-
-            // I feel that the mini batch has a better gradient.
-            // Go to the leaf node as it is, add only to the gradient array,
-            // and later try AdaGrad at the time of rmse aggregation.
-
-
-            // If the initial PV is different, it is better not to use it for learning.
-            // If it is the result of searching a completely different place, it may become noise.
-            // It may be better not to study where the difference in evaluation values ​​is too large.
-
-
-            // A helper function that adds the gradient to the current phase.
+            // A function that adds the current `pos` and `ps`
+            // to the training set.
             auto pos_add_grad = [&]() {
-                // Use the value of evaluate in leaf as shallow_value.
-                // Using the return value of qsearch() as shallow_value,
-                // If PV is interrupted in the middle, the phase where
-                // evaluate() is called to calculate the gradient,
-                // and I don't think this is a very desirable property,
-                // as the aspect that gives that gradient will be different.
-                // I have turned off the substitution table, but since
-                // the pv array has not been updated due to one stumbling block etc...
+
+                // Evaluation value of deep search
+                const auto deep_value = (Value)ps.score;
 
                 const Value shallow_value =
                     (rootColor == pos.side_to_move())
                     ? Eval::evaluate(pos)
                     : -Eval::evaluate(pos);
 
-                // Calculate loss for training data
-                double learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy;
-                double learn_entropy_eval, learn_entropy_win, learn_entropy;
-                calc_cross_entropy(
+                const auto loss = calc_cross_entropy(
                     deep_value,
                     shallow_value,
-                    ps,
-                    learn_cross_entropy_eval,
-                    learn_cross_entropy_win,
-                    learn_cross_entropy,
-                    learn_entropy_eval,
-                    learn_entropy_win,
-                    learn_entropy);
+                    ps);
 
-                learn_sum_cross_entropy_eval += learn_cross_entropy_eval;
-                learn_sum_cross_entropy_win += learn_cross_entropy_win;
-                learn_sum_cross_entropy += learn_cross_entropy;
-                learn_sum_entropy_eval += learn_entropy_eval;
-                learn_sum_entropy_win += learn_entropy_win;
-                learn_sum_entropy += learn_entropy;
+                local_loss_sum += loss;
 
                 Eval::NNUE::add_example(pos, rootColor, ps, 1.0);
-
-                // Since the processing is completed, the counter of the processed number is incremented
-                sr.total_done++;
             };
 
-            bool illegal_move = false;
-            for (auto m : pv)
-            {
-                // I shouldn't be an illegal player.
-                // An illegal move sometimes comes here...
-                if (!pos.pseudo_legal(m) || !pos.legal(m))
-                {
-                    //cout << pos << m << endl;
-                    //assert(false);
-                    illegal_move = true;
-                    break;
-                }
-
-                pos.do_move(m, state[ply++]);
-            }
-
-            if (illegal_move)
+            if (!pos.pseudo_legal((Move)ps.move) || !pos.legal((Move)ps.move))
             {
                 goto RETRY_READ;
             }
 
+            int ply = 0;
+            pos.do_move((Move)ps.move, state[ply++]);
+
+            // We want to position being trained on not to be terminal
+            if (MoveList<LEGAL>(pos).size() == 0)
+                goto RETRY_READ;
+
+            // Evaluation value of shallow search (qsearch)
+            const auto [_, pv] = Search::qsearch(pos);
+
+            for (auto m : pv)
+            {
+                pos.do_move(m, state[ply++]);
+            }
+
             // Since we have reached the end phase of PV, add the slope here.
             pos_add_grad();
         }
 
+        learn_loss_sum += local_loss_sum;
+    }
+
+    void LearnerThink::update_weights(const PSVector& psv)
+    {
+        // I'm not sure this fencing is correct. But either way there
+        // should be no real issues happening since
+        // the read/write phases are isolated.
+        atomic_thread_fence(memory_order_seq_cst);
+        Eval::NNUE::update_parameters();
+        atomic_thread_fence(memory_order_seq_cst);
+
+        ++epoch;
+
+        if (++save_count * mini_batch_size >= eval_save_interval)
+        {
+            save_count = 0;
+
+            const bool converged = save();
+            if (converged)
+            {
+                stop_flag = true;
+                return;
+            }
+        }
+
+        if (++loss_output_count * mini_batch_size >= loss_output_interval)
+        {
+            loss_output_count = 0;
+
+            // loss calculation
+            calc_loss(psv);
+
+            Eval::NNUE::check_health();
+        }
+    }
+
+    void LearnerThink::calc_loss(const PSVector& psv)
+    {
+        TT.new_search();
+        TimePoint elapsed = now() - Search::Limits.startTime + 1;
+
+        cout << "PROGRESS: " << now_string() << ", ";
+        cout << total_done << " sfens, ";
+        cout << total_done * 1000 / elapsed  << " sfens/second";
+        cout << ", iteration " << epoch;
+        cout << ", learning rate = " << global_learning_rate << ", ";
+
+        // For calculation of verification data loss
+        AtomicLoss test_loss_sum{};
+
+        // norm for learning
+        atomic<double> sum_norm{0.0};
+
+        // The number of times the pv first move of deep
+        // search matches the pv first move of search(1).
+        atomic<int> move_accord_count{0};
+
+        auto mainThread = Threads.main();
+        mainThread->execute_with_worker([](auto& th){
+            auto& pos = th.rootPos;
+            StateInfo si;
+            pos.set(StartFEN, false, &si, &th);
+            cout << "startpos eval = " << Eval::evaluate(pos) << endl;
+        });
+        mainThread->wait_for_worker_finished();
+
+        // The number of tasks to do.
+        atomic<uint64_t> counter{0};
+        Threads.execute_with_workers([&](auto& th){
+            calc_loss_worker(
+                th,
+                counter,
+                psv,
+                test_loss_sum,
+                sum_norm,
+                move_accord_count
+            );
+        });
+        Threads.wait_for_workers_finished();
+
+        latest_loss_sum += test_loss_sum.cross_entropy - test_loss_sum.entropy;
+        latest_loss_count += psv.size();
+
+        if (psv.size() && test_loss_sum.count > 0.0)
+        {
+            cout << "INFO: norm = " << sum_norm
+                << " , move accuracy = " << (move_accord_count * 100.0 / psv.size()) << "%"
+                << endl;
+
+            test_loss_sum.print("test", cout);
+
+            if (learn_loss_sum.count > 0.0)
+            {
+                learn_loss_sum.print("learn", cout);
+            }
+        }
+        else
+        {
+            cout << "Error! : psv.size() = " << psv.size() << " ,  done = " << test_loss_sum.count << endl;
+        }
+
+        learn_loss_sum.reset();
+    }
+
+    void LearnerThink::calc_loss_worker(
+        Thread& th,
+        std::atomic<uint64_t>& counter,
+        const PSVector& psv,
+        AtomicLoss& test_loss_sum,
+        atomic<double>& sum_norm,
+        atomic<int>& move_accord_count
+    )
+    {
+        Loss local_loss_sum{};
+        auto& pos = th.rootPos;
+
+        for(;;)
+        {
+            const auto task_id = counter.fetch_add(1);
+            if (task_id >= psv.size())
+            {
+                break;
+            }
+
+            const auto& ps = psv[task_id];
+
+            StateInfo si;
+            if (pos.set_from_packed_sfen(ps.sfen, &si, &th) != 0)
+            {
+                cout << "Error! : illegal packed sfen " << pos.fen() << endl;
+                continue;
+            }
+
+            const Value shallow_value = get_shallow_value(pos);
+
+            // Evaluation value of deep search
+            const auto deep_value = (Value)ps.score;
+
+            const auto loss = calc_cross_entropy(
+                deep_value,
+                shallow_value,
+                ps);
+
+            local_loss_sum += loss;
+            sum_norm += (double)abs(shallow_value);
+
+            // Determine if the teacher's move and the score of the shallow search match
+            const auto [value, pv] = Search::search(pos, 1);
+            if (pv.size() > 0 && (uint16_t)pv[0] == ps.move)
+                move_accord_count.fetch_add(1, std::memory_order_relaxed);
+        }
+
+        test_loss_sum += local_loss_sum;
+    }
+
+    Value LearnerThink::get_shallow_value(Position& pos)
+    {
+        // Evaluation value for shallow search
+        // The value of evaluate() may be used, but when calculating loss, learn_cross_entropy and
+        // Use qsearch() because it is difficult to compare the values.
+        // EvalHash has been disabled in advance. (If not, the same value will be returned every time)
+        const auto [_, pv] = Search::qsearch(pos);
+
+        const auto rootColor = pos.side_to_move();
+
+        std::vector<StateInfo, AlignedAllocator<StateInfo>> states(pv.size());
+        for (size_t i = 0; i < pv.size(); ++i)
+        {
+            pos.do_move(pv[i], states[i]);
+        }
+
+        const Value shallow_value =
+            (rootColor == pos.side_to_move())
+            ? Eval::evaluate(pos)
+            : -Eval::evaluate(pos);
+
+        for (auto it = pv.rbegin(); it != pv.rend(); ++it)
+            pos.undo_move(*it);
+
+        return shallow_value;
     }
 
     // Write evaluation function file.
@@ -1189,7 +1136,7 @@ namespace Learner
                 latest_loss_sum = 0.0;
                 latest_loss_count = 0;
                 cout << "loss: " << latest_loss;
-                auto tot = sr.total_done.load();
+                auto tot = total_done;
                 if (auto_lr_drop)
                 {
                     cout << " < best (" << best_loss << "), accepted" << endl;
@@ -1681,6 +1628,7 @@ namespace Learner
             else if (option == "seed") is >> seed;
             else if (option == "set_recommended_uci_options")
             {
+                UCI::setoption("Use NNUE", "pure");
                 UCI::setoption("MultiPV", "1");
                 UCI::setoption("Contempt", "0");
                 UCI::setoption("Skill Level", "20");
@@ -1707,8 +1655,7 @@ namespace Learner
         cout << "Warning! OpenMP disabled." << endl;
 #endif
 
-        SfenReader sr(thread_num, seed);
-        LearnerThink learn_think(sr, seed);
+        LearnerThink learn_think(thread_num, seed);
 
         // Display learning game file
         if (target_dir != "")
@@ -1807,17 +1754,6 @@ namespace Learner
         cout << "save_only_once    : " << (save_only_once ? "true" : "false") << endl;
         cout << "no_shuffle        : " << (no_shuffle ? "true" : "false") << endl;
 
-        // Insert the file name for the number of loops.
-        for (int i = 0; i < loop; ++i)
-        {
-            // sfen reader, I'll read it in reverse
-            // order so I'll reverse it here. I'm sorry.
-            for (auto it = filenames.rbegin(); it != filenames.rend(); ++it)
-            {
-                sr.filenames.push_back(Path::combine(base_dir, *it));
-            }
-        }
-
         cout << "Loss Function     : " << LOSS_FUNCTION << endl;
         cout << "mini-batch size   : " << mini_batch_size << endl;
 
@@ -1876,7 +1812,7 @@ namespace Learner
         // Reflect other option settings.
         learn_think.eval_limit = eval_limit;
         learn_think.save_only_once = save_only_once;
-        learn_think.sr.no_shuffle = no_shuffle;
+        learn_think.set_do_shuffle(!no_shuffle);
         learn_think.reduction_gameply = reduction_gameply;
 
         learn_think.newbob_decay = newbob_decay;
@@ -1886,49 +1822,20 @@ namespace Learner
         learn_think.eval_save_interval = eval_save_interval;
         learn_think.loss_output_interval = loss_output_interval;
 
-        // Start a thread that loads the phase file in the background
-        // (If this is not started, mse cannot be calculated.)
-        learn_think.start_file_read_worker();
-
         learn_think.mini_batch_size = mini_batch_size;
+        learn_think.validation_set_file_name = validation_set_file_name;
 
-        if (validation_set_file_name.empty())
+        // Insert the file name for the number of loops.
+        for (int i = 0; i < loop; ++i)
         {
-            // Get about 10,000 data for mse calculation.
-            sr.read_for_mse();
+            for(auto& file : filenames)
+            {
+                learn_think.add_file(Path::combine(base_dir, file));
+            }
         }
-        else
-        {
-            sr.read_validation_set(validation_set_file_name, eval_limit);
-        }
-
-        cout << "Forcing Use NNUE pure.\n";
-        UCI::setoption("Use NNUE", "pure");
-
-        Eval::NNUE::verify_any_net_loaded();
-
-        // Calculate rmse once at this point (timing of 0 sfen)
-        // sr.calc_rmse();
-
-        if (newbob_decay != 1.0) {
-            learn_think.calc_loss(0, -1);
-            learn_think.best_loss = learn_think.latest_loss_sum / learn_think.latest_loss_count;
-            learn_think.latest_loss_sum = 0.0;
-            learn_think.latest_loss_count = 0;
-            cout << "initial loss: " << learn_think.best_loss << endl;
-        }
-
-        // -----------------------------------
-        // start learning evaluation function parameters
-        // -----------------------------------
 
         // Start learning.
-        learn_think.go_think();
-
-        Eval::NNUE::finalize_net();
-
-        // Save once at the end.
-        learn_think.save(true);
+        learn_think.learn();
     }
 
 } // namespace Learner
diff --git a/src/misc.h b/src/misc.h
index 320eea76..dca959cd 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -128,6 +128,18 @@ public:
 
   void set_seed(uint64_t seed) { s = seed; }
 
+  uint64_t next_random_seed()
+  {
+    uint64_t seed = 0;
+    for(int i = 0; i < 64; ++i)
+    {
+      const auto off = rand64() % 64;
+      seed |= (rand64() & (uint64_t(1) << off)) >> off;
+      seed <<= 1;
+    }
+    return seed;
+  }
+
   void set_seed_from_time()
   {
       set_seed(std::chrono::system_clock::now().time_since_epoch().count());

From 8f3e64a6d5d48b5d94c7e4083914ab4c5d5b3aa0 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 18 Oct 2020 13:36:32 +0200
Subject: [PATCH 227/398] move sfen reader to separate file

---
 src/learn/learn.cpp     | 311 +-------------------------------------
 src/learn/sfen_reader.h | 326 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 329 insertions(+), 308 deletions(-)
 create mode 100644 src/learn/sfen_reader.h

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 411e0016..af53791c 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -20,7 +20,7 @@
 #include "learn.h"
 
 #include "convert.h"
-#include "sfen_stream.h"
+#include "sfen_reader.h"
 
 #include "misc.h"
 #include "position.h"
@@ -51,6 +51,7 @@
 #include <shared_mutex>
 #include <sstream>
 #include <unordered_set>
+#include <iostream>
 
 #if defined (_OPENMP)
 #include <omp.h>
@@ -346,312 +347,6 @@ namespace Learner
         return calc_grad((Value)psv.score, shallow, psv);
     }
 
-    // Sfen reader
-    struct SfenReader
-    {
-        // Number of phases buffered by each thread 0.1M phases. 4M phase at 40HT
-        static constexpr size_t THREAD_BUFFER_SIZE = 10 * 1000;
-
-        // Buffer for reading files (If this is made larger,
-        // the shuffle becomes larger and the phases may vary.
-        // If it is too large, the memory consumption will increase.
-        // SFEN_READ_SIZE is a multiple of THREAD_BUFFER_SIZE.
-        static constexpr const size_t SFEN_READ_SIZE = LEARN_SFEN_READ_SIZE;
-
-        // Do not use std::random_device().
-        // Because it always the same integers on MinGW.
-        SfenReader(int thread_num, const std::string& seed) :
-            prng(seed)
-        {
-            packed_sfens.resize(thread_num);
-            total_read = 0;
-            end_of_files = false;
-            shuffle = true;
-            stop_flag = false;
-        }
-
-        ~SfenReader()
-        {
-            if (file_worker_thread.joinable())
-                file_worker_thread.join();
-        }
-
-        // Load the phase for calculation such as mse.
-        PSVector read_for_mse(uint64_t count)
-        {
-            PSVector sfen_for_mse;
-            sfen_for_mse.reserve(count);
-
-            for (uint64_t i = 0; i < count; ++i)
-            {
-                PackedSfenValue ps;
-                if (!read_to_thread_buffer(0, ps))
-                {
-                    cout << "Error! read packed sfen , failed." << endl;
-                    return sfen_for_mse;
-                }
-
-                sfen_for_mse.push_back(ps);
-            }
-
-            return sfen_for_mse;
-        }
-
-        PSVector read_validation_set(const string& file_name, int eval_limit)
-        {
-            PSVector sfen_for_mse;
-
-            auto input = open_sfen_input_file(file_name);
-
-            while(!input->eof())
-            {
-                std::optional<PackedSfenValue> p_opt = input->next();
-                if (p_opt.has_value())
-                {
-                    auto& p = *p_opt;
-
-                    if (eval_limit < abs(p.score))
-                        continue;
-
-                    if (!use_draw_games_in_validation && p.game_result == 0)
-                        continue;
-
-                    sfen_for_mse.push_back(p);
-                }
-                else
-                {
-                    break;
-                }
-            }
-
-            return sfen_for_mse;
-        }
-
-        // [ASYNC] Thread returns one aspect. Otherwise returns false.
-        bool read_to_thread_buffer(size_t thread_id, PackedSfenValue& ps)
-        {
-            // If there are any positions left in the thread buffer
-            // then retrieve one and return it.
-            auto& thread_ps = packed_sfens[thread_id];
-
-            // Fill the read buffer if there is no remaining buffer,
-            // but if it doesn't even exist, finish.
-            // If the buffer is empty, fill it.
-            if ((thread_ps == nullptr || thread_ps->empty())
-                && !read_to_thread_buffer_impl(thread_id))
-                return false;
-
-            // read_to_thread_buffer_impl() returned true,
-            // Since the filling of the thread buffer with the
-            // phase has been completed successfully
-            // thread_ps->rbegin() is alive.
-
-            ps = thread_ps->back();
-            thread_ps->pop_back();
-
-            // If you've run out of buffers, call delete yourself to free this buffer.
-            if (thread_ps->empty())
-            {
-                thread_ps.reset();
-            }
-
-            return true;
-        }
-
-        // [ASYNC] Read some aspects into thread buffer.
-        bool read_to_thread_buffer_impl(size_t thread_id)
-        {
-            while (true)
-            {
-                {
-                    std::unique_lock<std::mutex> lk(mutex);
-                    // If you can fill from the file buffer, that's fine.
-                    if (packed_sfens_pool.size() != 0)
-                    {
-                        // It seems that filling is possible, so fill and finish.
-
-                        packed_sfens[thread_id] = std::move(packed_sfens_pool.front());
-                        packed_sfens_pool.pop_front();
-
-                        total_read += THREAD_BUFFER_SIZE;
-
-                        return true;
-                    }
-                }
-
-                // The file to read is already gone. No more use.
-                if (end_of_files)
-                    return false;
-
-                // Waiting for file worker to fill packed_sfens_pool.
-                // The mutex isn't locked, so it should fill up soon.
-                // Poor man's condition variable.
-                sleep(1);
-            }
-
-        }
-
-        // Start a thread that loads the phase file in the background.
-        void start_file_read_worker()
-        {
-            file_worker_thread = std::thread([&] {
-                this->file_read_worker();
-                });
-        }
-
-        void file_read_worker()
-        {
-            auto open_next_file = [&]() {
-                // no more
-                for(;;)
-                {
-                    sfen_input_stream.reset();
-
-                    if (filenames.empty())
-                        return false;
-
-                    // Get the next file name.
-                    string filename = filenames.front();
-                    filenames.pop_front();
-
-                    sfen_input_stream = open_sfen_input_file(filename);
-                    cout << "open filename = " << filename << endl;
-
-                    // in case the file is empty or was deleted.
-                    if (!sfen_input_stream->eof())
-                        return true;
-                }
-            };
-
-            if (sfen_input_stream == nullptr && !open_next_file())
-            {
-                cout << "..end of files." << endl;
-                end_of_files = true;
-                return;
-            }
-
-            while (true)
-            {
-                // Wait for the buffer to run out.
-                // This size() is read only, so you don't need to lock it.
-                while (!stop_flag && packed_sfens_pool.size() >= SFEN_READ_SIZE / THREAD_BUFFER_SIZE)
-                    sleep(100);
-
-                if (stop_flag)
-                    return;
-
-                PSVector sfens;
-                sfens.reserve(SFEN_READ_SIZE);
-
-                // Read from the file into the file buffer.
-                while (sfens.size() < SFEN_READ_SIZE)
-                {
-                    std::optional<PackedSfenValue> p = sfen_input_stream->next();
-                    if (p.has_value())
-                    {
-                        sfens.push_back(*p);
-                    }
-                    else if(!open_next_file())
-                    {
-                        // There was no next file. Abort.
-                        cout << "..end of files." << endl;
-                        end_of_files = true;
-                        return;
-                    }
-                }
-
-                // Shuffle the read phase data.
-                if (shuffle)
-                {
-                    Algo::shuffle(sfens, prng);
-                }
-
-                // Divide this by THREAD_BUFFER_SIZE. There should be size pieces.
-                // SFEN_READ_SIZE shall be a multiple of THREAD_BUFFER_SIZE.
-                assert((SFEN_READ_SIZE % THREAD_BUFFER_SIZE) == 0);
-
-                auto size = size_t(SFEN_READ_SIZE / THREAD_BUFFER_SIZE);
-                std::vector<std::unique_ptr<PSVector>> buffers;
-                buffers.reserve(size);
-
-                for (size_t i = 0; i < size; ++i)
-                {
-                    // Delete this pointer on the receiving side.
-                    auto buf = std::make_unique<PSVector>();
-                    buf->resize(THREAD_BUFFER_SIZE);
-                    memcpy(
-                        buf->data(),
-                        &sfens[i * THREAD_BUFFER_SIZE],
-                        sizeof(PackedSfenValue) * THREAD_BUFFER_SIZE);
-
-                    buffers.emplace_back(std::move(buf));
-                }
-
-                {
-                    std::unique_lock<std::mutex> lk(mutex);
-
-                    // The mutex lock is required because the%
-                    // contents of packed_sfens_pool are changed.
-
-                    for (auto& buf : buffers)
-                        packed_sfens_pool.emplace_back(std::move(buf));
-                }
-            }
-        }
-
-        void stop()
-        {
-            stop_flag = true;
-        }
-
-        void set_do_shuffle(bool v)
-        {
-            shuffle = v;
-        }
-
-        void add_file(const std::string& filename)
-        {
-            filenames.push_back(filename);
-        }
-
-    protected:
-
-        // worker thread reading file in background
-        std::thread file_worker_thread;
-
-        // sfen files
-        deque<string> filenames;
-
-        std::atomic<bool> stop_flag;
-
-        // number of phases read (file to memory buffer)
-        atomic<uint64_t> total_read;
-
-        // Do not shuffle when reading the phase.
-        bool shuffle;
-
-        // Random number to shuffle when reading the phase
-        PRNG prng;
-
-        // Did you read the files and reached the end?
-        atomic<bool> end_of_files;
-
-        // handle of sfen file
-        std::unique_ptr<BasicSfenInputStream> sfen_input_stream;
-
-        // sfen for each thread
-        // (When the thread is used up, the thread should call delete to release it.)
-        std::vector<std::unique_ptr<PSVector>> packed_sfens;
-
-        // Mutex when accessing packed_sfens_pool
-        std::mutex mutex;
-
-        // pool of sfen. The worker thread read from the file is added here.
-        // Each worker thread fills its own packed_sfens[thread_id] from here.
-        // * Lock and access the mutex.
-        std::list<std::unique_ptr<PSVector>> packed_sfens_pool;
-    };
-
     // Class to generate sfen with multiple threads
     struct LearnerThink
     {
@@ -777,7 +472,7 @@ namespace Learner
         const PSVector sfen_for_mse =
             validation_set_file_name.empty()
             ? sr.read_for_mse(sfen_for_mse_size)
-            : sr.read_validation_set(validation_set_file_name, eval_limit);
+            : sr.read_validation_set(validation_set_file_name, eval_limit, use_draw_games_in_validation);
 
         if (validation_set_file_name.empty()
             && sfen_for_mse.size() != sfen_for_mse_size)
diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
new file mode 100644
index 00000000..2645bb6c
--- /dev/null
+++ b/src/learn/sfen_reader.h
@@ -0,0 +1,326 @@
+#include "sfen_stream.h"
+
+#include "packed_sfen.h"
+
+#include "misc.h"
+
+#include <string>
+#include <vector>
+#include <deque>
+#include <memory>
+#include <mutex>
+#include <list>
+#include <atomic>
+#include <optional>
+#include <iostream>
+#include <cstdint>
+#include <thread>
+
+namespace Learner{
+
+    // Sfen reader
+    struct SfenReader
+    {
+        // Number of phases buffered by each thread 0.1M phases. 4M phase at 40HT
+        static constexpr size_t THREAD_BUFFER_SIZE = 10 * 1000;
+
+        // Buffer for reading files (If this is made larger,
+        // the shuffle becomes larger and the phases may vary.
+        // If it is too large, the memory consumption will increase.
+        // SFEN_READ_SIZE is a multiple of THREAD_BUFFER_SIZE.
+        static constexpr const size_t SFEN_READ_SIZE = LEARN_SFEN_READ_SIZE;
+
+        // Do not use std::random_device().
+        // Because it always the same integers on MinGW.
+        SfenReader(int thread_num, const std::string& seed) :
+            prng(seed)
+        {
+            packed_sfens.resize(thread_num);
+            total_read = 0;
+            end_of_files = false;
+            shuffle = true;
+            stop_flag = false;
+        }
+
+        ~SfenReader()
+        {
+            if (file_worker_thread.joinable())
+                file_worker_thread.join();
+        }
+
+        // Load the phase for calculation such as mse.
+        PSVector read_for_mse(uint64_t count)
+        {
+            PSVector sfen_for_mse;
+            sfen_for_mse.reserve(count);
+
+            for (uint64_t i = 0; i < count; ++i)
+            {
+                PackedSfenValue ps;
+                if (!read_to_thread_buffer(0, ps))
+                {
+                    std::cout << "Error! read packed sfen , failed." << std::endl;
+                    return sfen_for_mse;
+                }
+
+                sfen_for_mse.push_back(ps);
+            }
+
+            return sfen_for_mse;
+        }
+
+        PSVector read_validation_set(const std::string& file_name, int eval_limit, bool use_draw_games)
+        {
+            PSVector sfen_for_mse;
+
+            auto input = open_sfen_input_file(file_name);
+
+            while(!input->eof())
+            {
+                std::optional<PackedSfenValue> p_opt = input->next();
+                if (p_opt.has_value())
+                {
+                    auto& p = *p_opt;
+
+                    if (eval_limit < abs(p.score))
+                        continue;
+
+                    if (!use_draw_games && p.game_result == 0)
+                        continue;
+
+                    sfen_for_mse.push_back(p);
+                }
+                else
+                {
+                    break;
+                }
+            }
+
+            return sfen_for_mse;
+        }
+
+        // [ASYNC] Thread returns one aspect. Otherwise returns false.
+        bool read_to_thread_buffer(size_t thread_id, PackedSfenValue& ps)
+        {
+            // If there are any positions left in the thread buffer
+            // then retrieve one and return it.
+            auto& thread_ps = packed_sfens[thread_id];
+
+            // Fill the read buffer if there is no remaining buffer,
+            // but if it doesn't even exist, finish.
+            // If the buffer is empty, fill it.
+            if ((thread_ps == nullptr || thread_ps->empty())
+                && !read_to_thread_buffer_impl(thread_id))
+                return false;
+
+            // read_to_thread_buffer_impl() returned true,
+            // Since the filling of the thread buffer with the
+            // phase has been completed successfully
+            // thread_ps->rbegin() is alive.
+
+            ps = thread_ps->back();
+            thread_ps->pop_back();
+
+            // If you've run out of buffers, call delete yourself to free this buffer.
+            if (thread_ps->empty())
+            {
+                thread_ps.reset();
+            }
+
+            return true;
+        }
+
+        // [ASYNC] Read some aspects into thread buffer.
+        bool read_to_thread_buffer_impl(size_t thread_id)
+        {
+            while (true)
+            {
+                {
+                    std::unique_lock<std::mutex> lk(mutex);
+                    // If you can fill from the file buffer, that's fine.
+                    if (packed_sfens_pool.size() != 0)
+                    {
+                        // It seems that filling is possible, so fill and finish.
+
+                        packed_sfens[thread_id] = std::move(packed_sfens_pool.front());
+                        packed_sfens_pool.pop_front();
+
+                        total_read += THREAD_BUFFER_SIZE;
+
+                        return true;
+                    }
+                }
+
+                // The file to read is already gone. No more use.
+                if (end_of_files)
+                    return false;
+
+                // Waiting for file worker to fill packed_sfens_pool.
+                // The mutex isn't locked, so it should fill up soon.
+                // Poor man's condition variable.
+                sleep(1);
+            }
+
+        }
+
+        // Start a thread that loads the phase file in the background.
+        void start_file_read_worker()
+        {
+            file_worker_thread = std::thread([&] {
+                this->file_read_worker();
+                });
+        }
+
+        void file_read_worker()
+        {
+            auto open_next_file = [&]() {
+                // no more
+                for(;;)
+                {
+                    sfen_input_stream.reset();
+
+                    if (filenames.empty())
+                        return false;
+
+                    // Get the next file name.
+                    std::string filename = filenames.front();
+                    filenames.pop_front();
+
+                    sfen_input_stream = open_sfen_input_file(filename);
+                    std::cout << "open filename = " << filename << std::endl;
+
+                    // in case the file is empty or was deleted.
+                    if (!sfen_input_stream->eof())
+                        return true;
+                }
+            };
+
+            if (sfen_input_stream == nullptr && !open_next_file())
+            {
+                std::cout << "..end of files." << std::endl;
+                end_of_files = true;
+                return;
+            }
+
+            while (true)
+            {
+                // Wait for the buffer to run out.
+                // This size() is read only, so you don't need to lock it.
+                while (!stop_flag && packed_sfens_pool.size() >= SFEN_READ_SIZE / THREAD_BUFFER_SIZE)
+                    sleep(100);
+
+                if (stop_flag)
+                    return;
+
+                PSVector sfens;
+                sfens.reserve(SFEN_READ_SIZE);
+
+                // Read from the file into the file buffer.
+                while (sfens.size() < SFEN_READ_SIZE)
+                {
+                    std::optional<PackedSfenValue> p = sfen_input_stream->next();
+                    if (p.has_value())
+                    {
+                        sfens.push_back(*p);
+                    }
+                    else if(!open_next_file())
+                    {
+                        // There was no next file. Abort.
+                        std::cout << "..end of files." << std::endl;
+                        end_of_files = true;
+                        return;
+                    }
+                }
+
+                // Shuffle the read phase data.
+                if (shuffle)
+                {
+                    Algo::shuffle(sfens, prng);
+                }
+
+                // Divide this by THREAD_BUFFER_SIZE. There should be size pieces.
+                // SFEN_READ_SIZE shall be a multiple of THREAD_BUFFER_SIZE.
+                assert((SFEN_READ_SIZE % THREAD_BUFFER_SIZE) == 0);
+
+                auto size = size_t(SFEN_READ_SIZE / THREAD_BUFFER_SIZE);
+                std::vector<std::unique_ptr<PSVector>> buffers;
+                buffers.reserve(size);
+
+                for (size_t i = 0; i < size; ++i)
+                {
+                    // Delete this pointer on the receiving side.
+                    auto buf = std::make_unique<PSVector>();
+                    buf->resize(THREAD_BUFFER_SIZE);
+                    memcpy(
+                        buf->data(),
+                        &sfens[i * THREAD_BUFFER_SIZE],
+                        sizeof(PackedSfenValue) * THREAD_BUFFER_SIZE);
+
+                    buffers.emplace_back(std::move(buf));
+                }
+
+                {
+                    std::unique_lock<std::mutex> lk(mutex);
+
+                    // The mutex lock is required because the%
+                    // contents of packed_sfens_pool are changed.
+
+                    for (auto& buf : buffers)
+                        packed_sfens_pool.emplace_back(std::move(buf));
+                }
+            }
+        }
+
+        void stop()
+        {
+            stop_flag = true;
+        }
+
+        void set_do_shuffle(bool v)
+        {
+            shuffle = v;
+        }
+
+        void add_file(const std::string& filename)
+        {
+            filenames.push_back(filename);
+        }
+
+    protected:
+
+        // worker thread reading file in background
+        std::thread file_worker_thread;
+
+        // sfen files
+        std::deque<std::string> filenames;
+
+        std::atomic<bool> stop_flag;
+
+        // number of phases read (file to memory buffer)
+        std::atomic<uint64_t> total_read;
+
+        // Do not shuffle when reading the phase.
+        bool shuffle;
+
+        // Random number to shuffle when reading the phase
+        PRNG prng;
+
+        // Did you read the files and reached the end?
+        std::atomic<bool> end_of_files;
+
+        // handle of sfen file
+        std::unique_ptr<BasicSfenInputStream> sfen_input_stream;
+
+        // sfen for each thread
+        // (When the thread is used up, the thread should call delete to release it.)
+        std::vector<std::unique_ptr<PSVector>> packed_sfens;
+
+        // Mutex when accessing packed_sfens_pool
+        std::mutex mutex;
+
+        // pool of sfen. The worker thread read from the file is added here.
+        // Each worker thread fills its own packed_sfens[thread_id] from here.
+        // * Lock and access the mutex.
+        std::list<std::unique_ptr<PSVector>> packed_sfens_pool;
+    };
+}

From 11b28ad3b5c455ab7db9b6c1276a23457079a453 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 18 Oct 2020 14:25:39 +0200
Subject: [PATCH 228/398] Don't treat unknown options in learn as file names.
 Add targetfile to specify individual files.

---
 src/learn/learn.cpp | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index af53791c..f6f4b3f4 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -1245,6 +1245,12 @@ namespace Learner
 
             // Specify the folder in which the game record is stored and make it the rooting target.
             else if (option == "targetdir") is >> target_dir;
+            else if (option == "targetfile")
+            {
+                std::string filename;
+                is >> filename;
+                filenames.push_back(filename);
+            }
 
             // Specify the number of loops
             else if (option == "loop")      is >> loop;
@@ -1333,9 +1339,10 @@ namespace Learner
                 UCI::setoption("PruneAtShallowDepth", "false");
                 UCI::setoption("EnableTranspositionTable", "false");
             }
-            // Otherwise, it's a filename.
             else
-                filenames.push_back(option);
+            {
+                cout << "Unknown option: " << option << ". Ignoring.\n";
+            }
         }
 
         if (loss_output_interval == 0)

From 886467e09f815fda97bbea9090b045bb6fb803f3 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 22 Oct 2020 11:16:41 +0200
Subject: [PATCH 229/398] Fix crash when trying to read a non existing .binpack
 file.

---
 src/extra/nnue_data_binpack_format.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index 826b2959..b9e45c3e 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -6141,6 +6141,11 @@ namespace binpack
 
         [[nodiscard]] bool hasNextChunk()
         {
+            if (!m_file)
+            {
+                return false;
+            }
+
             m_file.peek();
             return !m_file.eof();
         }

From af138d19379effc9862691639d0f7c4f393ae7ff Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 22 Oct 2020 11:20:31 +0200
Subject: [PATCH 230/398] Fix crashes when trying to open a file of unknown
 type. Increase robustness of error handling.

---
 src/learn/sfen_reader.h | 22 ++++++++++++++++++----
 src/learn/sfen_stream.h |  1 -
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
index 2645bb6c..38c2532c 100644
--- a/src/learn/sfen_reader.h
+++ b/src/learn/sfen_reader.h
@@ -187,11 +187,25 @@ namespace Learner{
                     filenames.pop_front();
 
                     sfen_input_stream = open_sfen_input_file(filename);
-                    std::cout << "open filename = " << filename << std::endl;
 
-                    // in case the file is empty or was deleted.
-                    if (!sfen_input_stream->eof())
-                        return true;
+                    if (sfen_input_stream == nullptr)
+                    {
+                        std::cout << "File does not exist: " << filename << '\n';
+                    }
+                    else
+                    {
+                        std::cout << "Opened file for reading: " << filename << '\n';
+
+                        // in case the file is empty or was deleted.
+                        if (sfen_input_stream->eof())
+                        {
+                            std::cout << "File empty, nothing to read.\n";
+                        }
+                        else
+                        {
+                            return true;
+                        }
+                    }
                 }
             };
 
diff --git a/src/learn/sfen_stream.h b/src/learn/sfen_stream.h
index 4d44901b..d25dd41d 100644
--- a/src/learn/sfen_stream.h
+++ b/src/learn/sfen_stream.h
@@ -191,7 +191,6 @@ namespace Learner {
         else if (has_extension(filename, BinpackSfenInputStream::extension))
             return std::make_unique<BinpackSfenInputStream>(filename);
 
-        assert(false);
         return nullptr;
     }
 

From 7b4a769cca7bba6971460fd96149dd1c4f29d374 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 22 Oct 2020 11:26:59 +0200
Subject: [PATCH 231/398] Fix base_dir not being applied to singular filenames.

---
 src/learn/learn.cpp | 40 +++++++++++++++++++++++++++++-----------
 1 file changed, 29 insertions(+), 11 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index f6f4b3f4..b945e06c 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -157,6 +157,32 @@ namespace Learner
     using Loss = Detail::Loss<false>;
     using AtomicLoss = Detail::Loss<true>;
 
+    static void append_files_from_dir(
+        std::vector<std::string>& filenames,
+        const std::string& base_dir,
+        const std::string& target_dir)
+    {
+        string kif_base_dir = Path::combine(base_dir, target_dir);
+
+        namespace sys = std::filesystem;
+        sys::path p(kif_base_dir); // Origin of enumeration
+        std::for_each(sys::directory_iterator(p), sys::directory_iterator(),
+            [&](const sys::path& path) {
+                if (sys::is_regular_file(path))
+                    filenames.push_back(Path::combine(target_dir, path.filename().generic_string()));
+            });
+    }
+
+    static void rebase_files(
+        std::vector<std::string>& filenames,
+        const std::string& base_dir)
+    {
+        for (auto& file : filenames)
+        {
+            file = Path::combine(base_dir, file);
+        }
+    }
+
     // A function that converts the evaluation value to the winning rate [0,1]
     double winning_percentage(double value)
     {
@@ -1359,18 +1385,10 @@ namespace Learner
 
         LearnerThink learn_think(thread_num, seed);
 
-        // Display learning game file
-        if (target_dir != "")
+        rebase_files(filenames, base_dir);
+        if (!target_dir.empty())
         {
-            string kif_base_dir = Path::combine(base_dir, target_dir);
-
-            namespace sys = std::filesystem;
-            sys::path p(kif_base_dir); // Origin of enumeration
-            std::for_each(sys::directory_iterator(p), sys::directory_iterator(),
-                [&](const sys::path& path) {
-                    if (sys::is_regular_file(path))
-                        filenames.push_back(Path::combine(target_dir, path.filename().generic_string()));
-                });
+            append_files_from_dir(filenames, base_dir, target_dir);
         }
 
         cout << "learn from ";

From f5dfad5d72e164b57b787c0224046d641b3ade84 Mon Sep 17 00:00:00 2001
From: xoto10 <buylow001@gmail.com>
Date: Wed, 21 Oct 2020 14:52:13 +0100
Subject: [PATCH 232/398] Reduce big time spikes by reducing PV re-searches.

Save time by reducing PV re-searches above original depth. Instead use 5% extra time on every move.

STC 10+0.1 th 1 :
LLR: 2.93 (-2.94,2.94) {-0.25,1.25}
Total: 90688 W: 9702 L: 9436 D: 71550
Ptnml(0-2): 408, 7252, 29792, 7450, 442
https://tests.stockfishchess.org/tests/view/5f8df807bacb75a4f9a47223

LTC 60+0.6 th 1 :
LLR: 2.97 (-2.94,2.94) {0.25,1.25}
Total: 97856 W: 4602 L: 4303 D: 88951
Ptnml(0-2): 53, 3757, 41057, 3960, 101
https://tests.stockfishchess.org/tests/view/5f8ec4872c92c7fe3a8c602d

closes https://github.com/official-stockfish/Stockfish/pull/3192

Bench 3943959
---
 src/search.cpp  | 4 +++-
 src/timeman.cpp | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index ab58ca64..65ed9b73 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -565,6 +565,7 @@ namespace {
 
     constexpr bool PvNode = NT == PV;
     const bool rootNode = PvNode && ss->ply == 0;
+    const Depth maxNextDepth = rootNode ? depth : depth + 1;
 
     // Check if we have an upcoming move which draws by repetition, or
     // if the opponent had an alternative move earlier to this position.
@@ -1259,7 +1260,8 @@ moves_loop: // When in check, search starts from here
           (ss+1)->pv = pv;
           (ss+1)->pv[0] = MOVE_NONE;
 
-          value = -search<PV>(pos, ss+1, -beta, -alpha, newDepth, false);
+          value = -search<PV>(pos, ss+1, -beta, -alpha,
+                              std::min(maxNextDepth, newDepth), false);
       }
 
       // Step 18. Undo move
diff --git a/src/timeman.cpp b/src/timeman.cpp
index 6d9c95ef..da08f12d 100644
--- a/src/timeman.cpp
+++ b/src/timeman.cpp
@@ -75,7 +75,7 @@ void TimeManagement::init(Search::LimitsType& limits, Color us, int ply) {
   // game time for the current move, so also cap to 20% of available game time.
   if (limits.movestogo == 0)
   {
-      optScale = std::min(0.008 + std::pow(ply + 3.0, 0.5) / 250.0,
+      optScale = std::min(0.0084 + std::pow(ply + 3.0, 0.5) * 0.0042,
                            0.2 * limits.time[us] / double(timeLeft));
       maxScale = std::min(7.0, 4.0 + ply / 12.0);
   }

From 258af8ae44fc15407996e0a21a80ee8b9cfa12cb Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sun, 18 Oct 2020 15:01:19 +0200
Subject: [PATCH 233/398] Add net as dependency of config

cleaner output and error message if the server is down and the net is not available.

closes https://github.com/official-stockfish/Stockfish/pull/3188

No functional change
---
 src/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 54868b39..87203547 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -711,7 +711,7 @@ endif
         config-sanity icc-profile-use icc-profile-make gcc-profile-use gcc-profile-make \
         clang-profile-use clang-profile-make
 
-build: config-sanity net
+build: net config-sanity
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) all
 
 profile-build: net config-sanity objclean profileclean
@@ -784,7 +784,7 @@ default:
 
 all: $(EXE) .depend
 
-config-sanity:
+config-sanity: net
 	@echo ""
 	@echo "Config:"
 	@echo "debug: '$(debug)'"

From 2046d5da30b2cd505b69bddb40062b0d37b43bc7 Mon Sep 17 00:00:00 2001
From: syzygy1 <3028851+syzygy1@users.noreply.github.com>
Date: Tue, 20 Oct 2020 21:06:06 +0200
Subject: [PATCH 234/398] More incremental accumulator updates

This patch was inspired by c065abd which updates the accumulator,
if possible, based on the accumulator of two plies back if
the accumulator of the preceding ply is not available.

With this patch we look back even further in the position history
in an attempt to reduce the number of complete recomputations.
When we find a usable accumulator for the position N plies back,
we also update the accumulator of the position N-1 plies back
because that accumulator is most likely to be helpful later
when evaluating positions in sibling branches.
By not updating all intermediate accumulators immediately,
we avoid doing too much work that is not certain to be useful.
Overall, roughly 2-3% speedup.

This patch makes the code more specific to the net architecture,
changing input features of the net will require additional changes
to the incremental update code as discussed in the PR #3193 and #3191.

Passed STC:
https://tests.stockfishchess.org/tests/view/5f9056712c92c7fe3a8c60d0
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 10040 W: 1116 L: 968 D: 7956
Ptnml(0-2): 42, 722, 3365, 828, 63

closes https://github.com/official-stockfish/Stockfish/pull/3193

No functional change.
---
 src/nnue/features/feature_set.h     | 108 -----------
 src/nnue/nnue_accumulator.h         |   5 +-
 src/nnue/nnue_feature_transformer.h | 288 ++++++++++++++--------------
 src/position.cpp                    |  17 +-
 4 files changed, 157 insertions(+), 261 deletions(-)

diff --git a/src/nnue/features/feature_set.h b/src/nnue/features/feature_set.h
index 26198114..975824b6 100644
--- a/src/nnue/features/feature_set.h
+++ b/src/nnue/features/feature_set.h
@@ -43,90 +43,6 @@ namespace Eval::NNUE::Features {
   template <typename Derived>
   class FeatureSetBase {
 
-   public:
-    // Get a list of indices for active features
-    template <typename IndexListType>
-    static void AppendActiveIndices(
-        const Position& pos, TriggerEvent trigger, IndexListType active[2]) {
-
-      for (Color perspective : { WHITE, BLACK }) {
-        Derived::CollectActiveIndices(
-            pos, trigger, perspective, &active[perspective]);
-      }
-    }
-
-    // Get a list of indices for recently changed features
-    template <typename PositionType, typename IndexListType>
-    static void AppendChangedIndices(
-        const PositionType& pos, TriggerEvent trigger,
-        IndexListType removed[2], IndexListType added[2], bool reset[2]) {
-
-      auto collect_for_one = [&](const DirtyPiece& dp) {
-        for (Color perspective : { WHITE, BLACK }) {
-          switch (trigger) {
-            case TriggerEvent::kFriendKingMoved:
-              reset[perspective] = dp.piece[0] == make_piece(perspective, KING);
-              break;
-            default:
-              assert(false);
-              break;
-          }
-          if (reset[perspective]) {
-            Derived::CollectActiveIndices(
-                pos, trigger, perspective, &added[perspective]);
-          } else {
-            Derived::CollectChangedIndices(
-                pos, dp, trigger, perspective,
-                &removed[perspective], &added[perspective]);
-          }
-        }
-      };
-
-      auto collect_for_two = [&](const DirtyPiece& dp1, const DirtyPiece& dp2) {
-        for (Color perspective : { WHITE, BLACK }) {
-          switch (trigger) {
-            case TriggerEvent::kFriendKingMoved:
-              reset[perspective] = dp1.piece[0] == make_piece(perspective, KING)
-                                || dp2.piece[0] == make_piece(perspective, KING);
-              break;
-            default:
-              assert(false);
-              break;
-          }
-          if (reset[perspective]) {
-            Derived::CollectActiveIndices(
-                pos, trigger, perspective, &added[perspective]);
-          } else {
-            Derived::CollectChangedIndices(
-                pos, dp1, trigger, perspective,
-                &removed[perspective], &added[perspective]);
-            Derived::CollectChangedIndices(
-                pos, dp2, trigger, perspective,
-                &removed[perspective], &added[perspective]);
-          }
-        }
-      };
-
-      if (pos.state()->previous->accumulator.computed_accumulation) {
-        const auto& prev_dp = pos.state()->dirtyPiece;
-        if (prev_dp.dirty_num == 0) return;
-        collect_for_one(prev_dp);
-      } else {
-        const auto& prev_dp = pos.state()->previous->dirtyPiece;
-        if (prev_dp.dirty_num == 0) {
-          const auto& prev2_dp = pos.state()->dirtyPiece;
-          if (prev2_dp.dirty_num == 0) return;
-          collect_for_one(prev2_dp);
-        } else {
-          const auto& prev2_dp = pos.state()->dirtyPiece;
-          if (prev2_dp.dirty_num == 0) {
-            collect_for_one(prev_dp);
-          } else {
-            collect_for_two(prev_dp, prev2_dp);
-          }
-        }
-      }
-    }
   };
 
   // Class template that represents the feature set
@@ -146,30 +62,6 @@ namespace Eval::NNUE::Features {
         CompileTimeList<TriggerEvent, FeatureType::kRefreshTrigger>;
     static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
 
-   private:
-    // Get a list of indices for active features
-    static void CollectActiveIndices(
-        const Position& pos, const TriggerEvent trigger, const Color perspective,
-        IndexList* const active) {
-      if (FeatureType::kRefreshTrigger == trigger) {
-        FeatureType::AppendActiveIndices(pos, perspective, active);
-      }
-    }
-
-    // Get a list of indices for recently changed features
-    static void CollectChangedIndices(
-        const Position& pos, const DirtyPiece& dp, const TriggerEvent trigger, const Color perspective,
-        IndexList* const removed, IndexList* const added) {
-
-      if (FeatureType::kRefreshTrigger == trigger) {
-        FeatureType::AppendChangedIndices(pos, dp, perspective, removed, added);
-      }
-    }
-
-    // Make the base class and the class template that recursively uses itself a friend
-    friend class FeatureSetBase<FeatureSet>;
-    template <typename... FeatureTypes>
-    friend class FeatureSet;
   };
 
 }  // namespace Eval::NNUE::Features
diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h
index 26370710..a357d835 100644
--- a/src/nnue/nnue_accumulator.h
+++ b/src/nnue/nnue_accumulator.h
@@ -25,11 +25,14 @@
 
 namespace Eval::NNUE {
 
+  // The accumulator of a StateInfo without parent is set to the INIT state
+  enum AccumulatorState { EMPTY, COMPUTED, INIT };
+
   // Class that holds the result of affine transformation of input features
   struct alignas(kCacheLineSize) Accumulator {
     std::int16_t
         accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
-    bool computed_accumulation;
+    AccumulatorState state[2];
   };
 
 }  // namespace Eval::NNUE
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 2f86d20a..f145c848 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -32,7 +32,7 @@ namespace Eval::NNUE {
   // If vector instructions are enabled, we update and refresh the
   // accumulator tile by tile such that each tile fits in the CPU's
   // vector registers.
-  #define TILING
+  #define VECTOR
 
   #ifdef USE_AVX512
   typedef __m512i vec_t;
@@ -75,7 +75,7 @@ namespace Eval::NNUE {
   static constexpr IndexType kNumRegs = 16;
 
   #else
-  #undef TILING
+  #undef VECTOR
 
   #endif
 
@@ -86,7 +86,7 @@ namespace Eval::NNUE {
     // Number of output dimensions for one side
     static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
 
-    #ifdef TILING
+    #ifdef VECTOR
     static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;
     static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
     #endif
@@ -119,32 +119,11 @@ namespace Eval::NNUE {
       return !stream.fail();
     }
 
-    // Proceed with the difference calculation if possible
-    bool UpdateAccumulatorIfPossible(const Position& pos) const {
-
-      const auto now = pos.state();
-      if (now->accumulator.computed_accumulation)
-        return true;
-
-      const auto prev = now->previous;
-      if (prev) {
-        if (prev->accumulator.computed_accumulation) {
-          UpdateAccumulator(pos);
-          return true;
-        } else if (prev->previous && prev->previous->accumulator.computed_accumulation) {
-          UpdateAccumulator(pos);
-          return true;
-        }
-      }
-
-      return false;
-    }
-
     // Convert input features
     void Transform(const Position& pos, OutputType* output) const {
 
-      if (!UpdateAccumulatorIfPossible(pos))
-        RefreshAccumulator(pos);
+      UpdateAccumulator(pos, WHITE);
+      UpdateAccumulator(pos, BLACK);
 
       const auto& accumulation = pos.state()->accumulator.accumulation;
 
@@ -240,27 +219,142 @@ namespace Eval::NNUE {
     }
 
    private:
-    // Calculate cumulative value without using difference calculation
-    void RefreshAccumulator(const Position& pos) const {
+    void UpdateAccumulator(const Position& pos, const Color c) const {
 
-      auto& accumulator = pos.state()->accumulator;
-      IndexType i = 0;
-      Features::IndexList active_indices[2];
-      RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
-                                       active_indices);
-      for (Color perspective : { WHITE, BLACK }) {
-  #ifdef TILING
-        for (unsigned j = 0; j < kHalfDimensions / kTileHeight; ++j) {
+  #ifdef VECTOR
+      // Gcc-10.2 unnecessarily spills AVX2 registers if this array
+      // is defined in the VECTOR code below, once in each branch
+      vec_t acc[kNumRegs];
+  #endif
+
+      // Look for a usable accumulator of an earlier position. We keep track
+      // of the estimated gain in terms of features to be added/subtracted.
+      StateInfo *st = pos.state(), *next = nullptr;
+      int gain = popcount(pos.pieces()) - 2;
+      while (st->accumulator.state[c] == EMPTY)
+      {
+        auto& dp = st->dirtyPiece;
+        // The first condition tests whether an incremental update is
+        // possible at all: if this side's king has moved, it is not possible.
+        static_assert(std::is_same_v<RawFeatures::SortedTriggerSet,
+              Features::CompileTimeList<Features::TriggerEvent, Features::TriggerEvent::kFriendKingMoved>>,
+              "Current code assumes that only kFriendlyKingMoved refresh trigger is being used.");
+        if (   dp.piece[0] == make_piece(c, KING)
+            || (gain -= dp.dirty_num + 1) < 0)
+          break;
+        next = st;
+        st = st->previous;
+      }
+
+      if (st->accumulator.state[c] == COMPUTED)
+      {
+        if (next == nullptr)
+          return;
+
+        // Update incrementally in two steps. First, we update the "next"
+        // accumulator. Then, we update the current accumulator (pos.state()).
+
+        // Gather all features to be updated. This code assumes HalfKP features
+        // only and doesn't support refresh triggers.
+        static_assert(std::is_same_v<Features::FeatureSet<Features::HalfKP<Features::Side::kFriend>>,
+                                     RawFeatures>);
+        Features::IndexList removed[2], added[2];
+        Features::HalfKP<Features::Side::kFriend>::AppendChangedIndices(pos,
+            next->dirtyPiece, c, &removed[0], &added[0]);
+        for (StateInfo *st2 = pos.state(); st2 != next; st2 = st2->previous)
+          Features::HalfKP<Features::Side::kFriend>::AppendChangedIndices(pos,
+              st2->dirtyPiece, c, &removed[1], &added[1]);
+
+        // Mark the accumulators as computed.
+        next->accumulator.state[c] = COMPUTED;
+        pos.state()->accumulator.state[c] = COMPUTED;
+
+        // Now update the accumulators listed in info[], where the last element is a sentinel.
+        StateInfo *info[3] =
+          { next, next == pos.state() ? nullptr : pos.state(), nullptr };
+  #ifdef VECTOR
+        for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j)
+        {
+          // Load accumulator
+          auto accTile = reinterpret_cast<vec_t*>(
+            &st->accumulator.accumulation[c][0][j * kTileHeight]);
+          for (IndexType k = 0; k < kNumRegs; ++k)
+            acc[k] = vec_load(&accTile[k]);
+
+          for (IndexType i = 0; info[i]; ++i)
+          {
+            // Difference calculation for the deactivated features
+            for (const auto index : removed[i])
+            {
+              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = vec_sub_16(acc[k], column[k]);
+            }
+
+            // Difference calculation for the activated features
+            for (const auto index : added[i])
+            {
+              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+              for (IndexType k = 0; k < kNumRegs; ++k)
+                acc[k] = vec_add_16(acc[k], column[k]);
+            }
+
+            // Store accumulator
+            accTile = reinterpret_cast<vec_t*>(
+              &info[i]->accumulator.accumulation[c][0][j * kTileHeight]);
+            for (IndexType k = 0; k < kNumRegs; ++k)
+              vec_store(&accTile[k], acc[k]);
+          }
+        }
+
+  #else
+        for (IndexType i = 0; info[i]; ++i)
+        {
+          std::memcpy(info[i]->accumulator.accumulation[c][0],
+              st->accumulator.accumulation[c][0],
+              kHalfDimensions * sizeof(BiasType));
+          st = info[i];
+
+          // Difference calculation for the deactivated features
+          for (const auto index : removed[i])
+          {
+            const IndexType offset = kHalfDimensions * index;
+
+            for (IndexType j = 0; j < kHalfDimensions; ++j)
+              st->accumulator.accumulation[c][0][j] -= weights_[offset + j];
+          }
+
+          // Difference calculation for the activated features
+          for (const auto index : added[i])
+          {
+            const IndexType offset = kHalfDimensions * index;
+
+            for (IndexType j = 0; j < kHalfDimensions; ++j)
+              st->accumulator.accumulation[c][0][j] += weights_[offset + j];
+          }
+        }
+  #endif
+      }
+      else
+      {
+        // Refresh the accumulator
+        auto& accumulator = pos.state()->accumulator;
+        accumulator.state[c] = COMPUTED;
+        Features::IndexList active;
+        Features::HalfKP<Features::Side::kFriend>::AppendActiveIndices(pos, c, &active);
+
+  #ifdef VECTOR
+        for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j)
+        {
           auto biasesTile = reinterpret_cast<const vec_t*>(
               &biases_[j * kTileHeight]);
-          auto accTile = reinterpret_cast<vec_t*>(
-              &accumulator.accumulation[perspective][i][j * kTileHeight]);
-          vec_t acc[kNumRegs];
-
-          for (unsigned k = 0; k < kNumRegs; ++k)
+          for (IndexType k = 0; k < kNumRegs; ++k)
             acc[k] = biasesTile[k];
 
-          for (const auto index : active_indices[perspective]) {
+          for (const auto index : active)
+          {
             const IndexType offset = kHalfDimensions * index + j * kTileHeight;
             auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
 
@@ -268,18 +362,22 @@ namespace Eval::NNUE {
               acc[k] = vec_add_16(acc[k], column[k]);
           }
 
+          auto accTile = reinterpret_cast<vec_t*>(
+              &accumulator.accumulation[c][0][j * kTileHeight]);
           for (unsigned k = 0; k < kNumRegs; k++)
             vec_store(&accTile[k], acc[k]);
         }
+
   #else
-        std::memcpy(accumulator.accumulation[perspective][i], biases_,
+        std::memcpy(accumulator.accumulation[c][0], biases_,
             kHalfDimensions * sizeof(BiasType));
 
-        for (const auto index : active_indices[perspective]) {
+        for (const auto index : active)
+        {
           const IndexType offset = kHalfDimensions * index;
 
           for (IndexType j = 0; j < kHalfDimensions; ++j)
-            accumulator.accumulation[perspective][i][j] += weights_[offset + j];
+            accumulator.accumulation[c][0][j] += weights_[offset + j];
         }
   #endif
       }
@@ -287,106 +385,6 @@ namespace Eval::NNUE {
   #if defined(USE_MMX)
       _mm_empty();
   #endif
-
-      accumulator.computed_accumulation = true;
-    }
-
-    // Calculate cumulative value using difference calculation
-    void UpdateAccumulator(const Position& pos) const {
-
-      Accumulator* prev_accumulator;
-      assert(pos.state()->previous);
-      if (pos.state()->previous->accumulator.computed_accumulation) {
-        prev_accumulator = &pos.state()->previous->accumulator;
-      }
-      else {
-        assert(pos.state()->previous->previous);
-        assert(pos.state()->previous->previous->accumulator.computed_accumulation);
-        prev_accumulator = &pos.state()->previous->previous->accumulator;
-      }
-
-      auto& accumulator = pos.state()->accumulator;
-      IndexType i = 0;
-      Features::IndexList removed_indices[2], added_indices[2];
-      bool reset[2] = { false, false };
-      RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
-                                        removed_indices, added_indices, reset);
-
-  #ifdef TILING
-      for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
-        for (Color perspective : { WHITE, BLACK }) {
-          auto accTile = reinterpret_cast<vec_t*>(
-              &accumulator.accumulation[perspective][i][j * kTileHeight]);
-          vec_t acc[kNumRegs];
-
-          if (reset[perspective]) {
-            auto biasesTile = reinterpret_cast<const vec_t*>(
-                &biases_[j * kTileHeight]);
-            for (unsigned k = 0; k < kNumRegs; ++k)
-              acc[k] = biasesTile[k];
-          } else {
-            auto prevAccTile = reinterpret_cast<const vec_t*>(
-                &prev_accumulator->accumulation[perspective][i][j * kTileHeight]);
-            for (IndexType k = 0; k < kNumRegs; ++k)
-              acc[k] = vec_load(&prevAccTile[k]);
-
-            // Difference calculation for the deactivated features
-            for (const auto index : removed_indices[perspective]) {
-              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
-              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
-
-              for (IndexType k = 0; k < kNumRegs; ++k)
-                acc[k] = vec_sub_16(acc[k], column[k]);
-            }
-          }
-          { // Difference calculation for the activated features
-            for (const auto index : added_indices[perspective]) {
-              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
-              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
-
-              for (IndexType k = 0; k < kNumRegs; ++k)
-                acc[k] = vec_add_16(acc[k], column[k]);
-            }
-          }
-
-          for (IndexType k = 0; k < kNumRegs; ++k)
-            vec_store(&accTile[k], acc[k]);
-        }
-      }
-  #if defined(USE_MMX)
-      _mm_empty();
-  #endif
-
-  #else
-      for (Color perspective : { WHITE, BLACK }) {
-
-        if (reset[perspective]) {
-          std::memcpy(accumulator.accumulation[perspective][i], biases_,
-                      kHalfDimensions * sizeof(BiasType));
-        } else {
-          std::memcpy(accumulator.accumulation[perspective][i],
-                      prev_accumulator->accumulation[perspective][i],
-                      kHalfDimensions * sizeof(BiasType));
-          // Difference calculation for the deactivated features
-          for (const auto index : removed_indices[perspective]) {
-            const IndexType offset = kHalfDimensions * index;
-
-            for (IndexType j = 0; j < kHalfDimensions; ++j)
-              accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
-          }
-        }
-        { // Difference calculation for the activated features
-          for (const auto index : added_indices[perspective]) {
-            const IndexType offset = kHalfDimensions * index;
-
-            for (IndexType j = 0; j < kHalfDimensions; ++j)
-              accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-          }
-        }
-      }
-  #endif
-
-      accumulator.computed_accumulation = true;
     }
 
     using BiasType = std::int16_t;
diff --git a/src/position.cpp b/src/position.cpp
index e6a760d2..b707293d 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -279,6 +279,8 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th
   chess960 = isChess960;
   thisThread = th;
   set_state(st);
+  st->accumulator.state[WHITE] = Eval::NNUE::INIT;
+  st->accumulator.state[BLACK] = Eval::NNUE::INIT;
 
   assert(pos_is_ok());
 
@@ -703,7 +705,8 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   ++st->pliesFromNull;
 
   // Used by NNUE
-  st->accumulator.computed_accumulation = false;
+  st->accumulator.state[WHITE] = Eval::NNUE::EMPTY;
+  st->accumulator.state[BLACK] = Eval::NNUE::EMPTY;
   auto& dp = st->dirtyPiece;
   dp.dirty_num = 1;
 
@@ -996,16 +999,16 @@ void Position::do_null_move(StateInfo& newSt) {
   assert(!checkers());
   assert(&newSt != st);
 
-  if (Eval::useNNUE)
-  {
-      std::memcpy(&newSt, st, sizeof(StateInfo));
-  }
-  else
-      std::memcpy(&newSt, st, offsetof(StateInfo, accumulator));
+  std::memcpy(&newSt, st, offsetof(StateInfo, accumulator));
 
   newSt.previous = st;
   st = &newSt;
 
+  st->dirtyPiece.dirty_num = 0;
+  st->dirtyPiece.piece[0] = NO_PIECE; // Avoid checks in UpdateAccumulator()
+  st->accumulator.state[WHITE] = Eval::NNUE::EMPTY;
+  st->accumulator.state[BLACK] = Eval::NNUE::EMPTY;
+
   if (st->epSquare != SQ_NONE)
   {
       st->key ^= Zobrist::enpassant[file_of(st->epSquare)];

From 9564a52523b6001ea4d0e34fa17b8835c4a7b116 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 18 Oct 2020 14:29:12 +0200
Subject: [PATCH 235/398] Remove whole file shuffling as it does not change
 learning behaviour, only works for bin, and is considered harmful for
 binpack.

---
 src/learn/learn.cpp | 283 --------------------------------------------
 1 file changed, 283 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index b945e06c..2cab54b7 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -904,252 +904,6 @@ namespace Learner
         return false;
     }
 
-    // Shuffle_files(), shuffle_files_quick() subcontracting, writing part.
-    // output_file_name: Name of the file to write
-    // prng: random number generator
-    // sfen_file_streams: fstream of each teacher phase file
-    // sfen_count_in_file: The number of teacher positions present in each file.
-    void shuffle_write(
-        const string& output_file_name,
-        PRNG& prng,
-        vector<fstream>& sfen_file_streams,
-        vector<uint64_t>& sfen_count_in_file)
-    {
-        uint64_t total_sfen_count = 0;
-        for (auto c : sfen_count_in_file)
-            total_sfen_count += c;
-
-        // number of exported phases
-        uint64_t write_sfen_count = 0;
-
-        // Output the progress on the screen for each phase.
-        const uint64_t buffer_size = 10000000;
-
-        auto print_status = [&]()
-        {
-            // Output progress every 10M phase or when all writing is completed
-            if (((write_sfen_count % buffer_size) == 0) ||
-                (write_sfen_count == total_sfen_count))
-            {
-                cout << write_sfen_count << " / " << total_sfen_count << endl;
-            }
-        };
-
-        cout << endl << "write : " << output_file_name << endl;
-
-        fstream fs(output_file_name, ios::out | ios::binary);
-
-        // total teacher positions
-        uint64_t sfen_count_left = total_sfen_count;
-
-        while (sfen_count_left != 0)
-        {
-            auto r = prng.rand(sfen_count_left);
-
-            // Aspects stored in fs[0] file ... Aspects stored in fs[1] file ...
-            //Think of it as a series like, and determine in which file r is pointing.
-            // The contents of the file are shuffled, so you can take the next element from that file.
-            // Each file has a_count[x] phases, so this process can be written as follows.
-
-            uint64_t i = 0;
-            while (sfen_count_in_file[i] <= r)
-                r -= sfen_count_in_file[i++];
-
-            // This confirms n. Before you forget it, reduce the remaining number.
-
-            --sfen_count_in_file[i];
-            --sfen_count_left;
-
-            PackedSfenValue psv;
-            // It's better to read and write all at once until the performance is not so good...
-            if (sfen_file_streams[i].read((char*)&psv, sizeof(PackedSfenValue)))
-            {
-                fs.write((char*)&psv, sizeof(PackedSfenValue));
-                ++write_sfen_count;
-                print_status();
-            }
-        }
-
-        print_status();
-        fs.close();
-
-        cout << "done!" << endl;
-    }
-
-    // Subcontracting the teacher shuffle "learn shuffle" command.
-    // output_file_name: name of the output file where the shuffled teacher positions will be written
-    void shuffle_files(const vector<string>& filenames, const string& output_file_name, uint64_t buffer_size, const std::string& seed)
-    {
-        // The destination folder is
-        // tmp/ for temporary writing
-
-        // Temporary file is written to tmp/ folder for each buffer_size phase.
-        // For example, if buffer_size = 20M, you need a buffer of 20M*40bytes = 800MB.
-        // In a PC with a small memory, it would be better to reduce this.
-        // However, if the number of files increases too much,
-        // it will not be possible to open at the same time due to OS restrictions.
-        // There should have been a limit of 512 per process on Windows, so you can open here as 500,
-        // The current setting is 500 files x 20M = 10G = 10 billion phases.
-
-        PSVector buf(buffer_size);
-
-        // ↑ buffer, a marker that indicates how much you have used
-        uint64_t buf_write_marker = 0;
-
-        // File name to write (incremental counter because it is a serial number)
-        uint64_t write_file_count = 0;
-
-        // random number to shuffle
-        // Do not use std::random_device().  Because it always the same integers on MinGW.
-        PRNG prng(seed);
-
-        // generate the name of the temporary file
-        auto make_filename = [](uint64_t i)
-        {
-            return "tmp/" + to_string(i) + ".bin";
-        };
-
-        // Exported files in tmp/ folder, number of teacher positions stored in each
-        vector<uint64_t> a_count;
-
-        auto write_buffer = [&](uint64_t size)
-        {
-            Algo::shuffle(buf, prng);
-
-            // write to a file
-            fstream fs;
-            fs.open(make_filename(write_file_count++), ios::out | ios::binary);
-            fs.write(reinterpret_cast<char*>(buf.data()), size * sizeof(PackedSfenValue));
-            fs.close();
-            a_count.push_back(size);
-
-            buf_write_marker = 0;
-            cout << ".";
-        };
-
-        std::filesystem::create_directory("tmp");
-
-        // Shuffle and export as a 10M phase shredded file.
-        for (auto filename : filenames)
-        {
-            fstream fs(filename, ios::in | ios::binary);
-            cout << endl << "open file = " << filename;
-            while (fs.read(reinterpret_cast<char*>(&buf[buf_write_marker]), sizeof(PackedSfenValue)))
-                if (++buf_write_marker == buffer_size)
-                    write_buffer(buffer_size);
-
-            // Read in units of sizeof(PackedSfenValue),
-            // Ignore the last remaining fraction. (Fails in fs.read, so exit while)
-            // (The remaining fraction seems to be half-finished data
-            // that was created because it was stopped halfway during teacher generation.)
-        }
-
-        if (buf_write_marker != 0)
-            write_buffer(buf_write_marker);
-
-        // Only shuffled files have been written write_file_count.
-        // As a second pass, if you open all of them at the same time,
-        // select one at random and load one phase at a time
-        // Now you have shuffled.
-
-        // Original file for shirt full + tmp file + file to write
-        // requires 3 times the storage capacity of the original file.
-        // 1 billion SSD is not enough for shuffling because it is 400GB for 10 billion phases.
-        // If you want to delete (or delete by hand) the
-        // original file at this point after writing to tmp,
-        // The storage capacity is about twice that of the original file.
-        // So, maybe we should have an option to delete the original file.
-
-        // Files are opened at the same time. It is highly possible that this will exceed FOPEN_MAX.
-        // In that case, rather than adjusting buffer_size to reduce the number of files.
-
-        vector<fstream> afs;
-        for (uint64_t i = 0; i < write_file_count; ++i)
-            afs.emplace_back(fstream(make_filename(i), ios::in | ios::binary));
-
-        // Throw to the subcontract function and end.
-        shuffle_write(output_file_name, prng, afs, a_count);
-    }
-
-    // Subcontracting the teacher shuffle "learn shuffleq" command.
-    // This is written in 1 pass.
-    // output_file_name: name of the output file where the shuffled teacher positions will be written
-    void shuffle_files_quick(const vector<string>& filenames, const string& output_file_name, const std::string& seed)
-    {
-        // random number to shuffle
-        // Do not use std::random_device().  Because it always the same integers on MinGW.
-        PRNG prng(seed);
-
-        // number of files
-        const size_t file_count = filenames.size();
-
-        // Number of teacher positions stored in each file in filenames
-        vector<uint64_t> sfen_count_in_file(file_count);
-
-        // Count the number of teacher aspects in each file.
-        vector<fstream> sfen_file_streams(file_count);
-
-        for (size_t i = 0; i < file_count; ++i)
-        {
-            auto filename = filenames[i];
-            auto& fs = sfen_file_streams[i];
-
-            fs.open(filename, ios::in | ios::binary);
-            const uint64_t file_size = get_file_size(fs);
-            const uint64_t sfen_count = file_size / sizeof(PackedSfenValue);
-            sfen_count_in_file[i] = sfen_count;
-
-            // Output the number of sfen stored in each file.
-            cout << filename << " = " << sfen_count << " sfens." << endl;
-        }
-
-        // Since we know the file size of each file,
-        // open them all at once (already open),
-        // Select one at a time and load one phase at a time
-        // Now you have shuffled.
-
-        // Throw to the subcontract function and end.
-        shuffle_write(output_file_name, prng, sfen_file_streams, sfen_count_in_file);
-    }
-
-    // Subcontracting the teacher shuffle "learn shufflem" command.
-    // Read the whole memory and write it out with the specified file name.
-    void shuffle_files_on_memory(const vector<string>& filenames, const string output_file_name, const std::string& seed)
-    {
-        PSVector buf;
-
-        for (auto filename : filenames)
-        {
-            std::cout << "read : " << filename << std::endl;
-            read_file_to_memory(filename, [&buf](uint64_t size) {
-                assert((size % sizeof(PackedSfenValue)) == 0);
-                // Expand the buffer and read after the last end.
-                uint64_t last = buf.size();
-                buf.resize(last + size / sizeof(PackedSfenValue));
-                return (void*)&buf[last];
-                });
-        }
-
-        // shuffle from buf[0] to buf[size-1]
-        // Do not use std::random_device().  Because it always the same integers on MinGW.
-        PRNG prng(seed);
-        uint64_t size = (uint64_t)buf.size();
-        std::cout << "shuffle buf.size() = " << size << std::endl;
-
-        Algo::shuffle(buf, prng);
-
-        std::cout << "write : " << output_file_name << endl;
-
-        // If the file to be written exceeds 2GB, it cannot be
-        // written in one shot with fstream::write, so use wrapper.
-        write_memory_to_file(
-            output_file_name,
-            (void*)&buf[0],
-            sizeof(PackedSfenValue) * buf.size());
-
-        std::cout << "..shuffle_on_memory done." << std::endl;
-    }
-
     static void set_learning_search_limits()
     {
         // About Search::Limits
@@ -1192,13 +946,6 @@ namespace Learner
         // --- Function that only shuffles the teacher aspect
 
         // normal shuffle
-        bool shuffle_normal = false;
-        uint64_t buffer_size = 20000000;
-        // fast shuffling assuming each file is shuffled
-        bool shuffle_quick = false;
-        // A function to read the entire file in memory and shuffle it.
-        // (Requires file size memory)
-        bool shuffle_on_memory = false;
         // Conversion of packed sfen. In plain, it consists of sfen(string),
         // evaluation value (integer), move (eg 7g7f, string), result (loss-1, win 1, draw 0)
         bool use_convert_plain = false;
@@ -1318,13 +1065,6 @@ namespace Learner
 
             else if (option == "reduction_gameply") is >> reduction_gameply;
 
-            // shuffle related
-            else if (option == "shuffle")   shuffle_normal = true;
-            else if (option == "buffer_size") is >> buffer_size;
-            else if (option == "shuffleq")  shuffle_quick = true;
-            else if (option == "shufflem")  shuffle_on_memory = true;
-            else if (option == "output_file_name") is >> output_file_name;
-
             else if (option == "eval_limit") is >> eval_limit;
             else if (option == "save_only_once") save_only_once = true;
             else if (option == "no_shuffle") no_shuffle = true;
@@ -1404,29 +1144,6 @@ namespace Learner
         cout << "base dir        : " << base_dir << endl;
         cout << "target dir      : " << target_dir << endl;
 
-        // shuffle mode
-        if (shuffle_normal)
-        {
-            cout << "buffer_size     : " << buffer_size << endl;
-            cout << "shuffle mode.." << endl;
-            shuffle_files(filenames, output_file_name, buffer_size, seed);
-            return;
-        }
-
-        if (shuffle_quick)
-        {
-            cout << "quick shuffle mode.." << endl;
-            shuffle_files_quick(filenames, output_file_name, seed);
-            return;
-        }
-
-        if (shuffle_on_memory)
-        {
-            cout << "shuffle on memory.." << endl;
-            shuffle_files_on_memory(filenames, output_file_name, seed);
-            return;
-        }
-
         if (use_convert_plain)
         {
             Eval::NNUE::init();

From f7530de20def38f858bc00cf3608d0247ea8c925 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 22 Oct 2020 15:39:43 +0200
Subject: [PATCH 236/398] Fix assertion in trainer

---
 src/nnue/trainer/trainer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/nnue/trainer/trainer.h b/src/nnue/trainer/trainer.h
index 85666576..763bd5c8 100644
--- a/src/nnue/trainer/trainer.h
+++ b/src/nnue/trainer/trainer.h
@@ -38,7 +38,7 @@ namespace Eval::NNUE {
 
         TrainingFeature& operator+=(const TrainingFeature& other) {
             assert(other.get_index() == get_index());
-            assert(other.get_index() + get_count() < (1 << kCountBits));
+            assert(other.get_count() + get_count() < (1 << kCountBits));
             index_and_count_ += other.get_count();
             return *this;
         }

From c7ac3688a7bc87b0984cadb50778952ca1149ccd Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 18 Oct 2020 14:51:38 +0200
Subject: [PATCH 237/398] Move the old convert stuff from learn to their own
 commands.

---
 src/learn/convert.cpp | 212 ++++++++++++++++++++++++++++++++++++++++++
 src/learn/convert.h   |  29 ++----
 src/learn/learn.cpp   |  84 ++---------------
 src/uci.cpp           |   3 +
 4 files changed, 228 insertions(+), 100 deletions(-)

diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
index 59111dcf..a7528b02 100644
--- a/src/learn/convert.cpp
+++ b/src/learn/convert.cpp
@@ -10,6 +10,8 @@
 
 #include "extra/nnue_data_binpack_format.h"
 
+#include "nnue/evaluate_nnue.h"
+
 #include "syzygy/tbprobe.h"
 
 #include <sstream>
@@ -600,4 +602,214 @@ namespace Learner
 
         convert(args);
     }
+
+    static void append_files_from_dir(
+        std::vector<std::string>& filenames,
+        const std::string& base_dir,
+        const std::string& target_dir)
+    {
+        string kif_base_dir = Path::combine(base_dir, target_dir);
+
+        namespace sys = std::filesystem;
+        sys::path p(kif_base_dir); // Origin of enumeration
+        std::for_each(sys::directory_iterator(p), sys::directory_iterator(),
+            [&](const sys::path& path) {
+                if (sys::is_regular_file(path))
+                    filenames.push_back(Path::combine(target_dir, path.filename().generic_string()));
+            });
+    }
+
+    static void rebase_files(
+        std::vector<std::string>& filenames,
+        const std::string& base_dir)
+    {
+        for (auto& file : filenames)
+        {
+            file = Path::combine(base_dir, file);
+        }
+    }
+
+    void convert_bin_from_pgn_extract(std::istringstream& is)
+    {
+        std::vector<std::string> filenames;
+
+        string base_dir;
+        string target_dir;
+
+        bool pgn_eval_side_to_move = false;
+        bool convert_no_eval_fens_as_score_zero = false;
+
+        string output_file_name = "shuffled_sfen.bin";
+
+        while (true)
+        {
+            string option;
+            is >> option;
+
+            if (option == "")
+                break;
+
+            if (option == "targetdir") is >> target_dir;
+            else if (option == "targetfile")
+            {
+                std::string filename;
+                is >> filename;
+                filenames.push_back(filename);
+            }
+
+            else if (option == "basedir")   is >> base_dir;
+
+            else if (option == "pgn_eval_side_to_move") is >> pgn_eval_side_to_move;
+            else if (option == "convert_no_eval_fens_as_score_zero") is >> convert_no_eval_fens_as_score_zero;
+            else if (option == "output_file_name") is >> output_file_name;
+            else
+            {
+                cout << "Unknown option: " << option << ". Ignoring.\n";
+            }
+        }
+
+        if (!target_dir.empty())
+        {
+            append_files_from_dir(filenames, base_dir, target_dir);
+        }
+        rebase_files(filenames, base_dir);
+
+        Eval::NNUE::init();
+
+        cout << "convert_bin_from_pgn-extract.." << endl;
+        convert_bin_from_pgn_extract(
+            filenames,
+            output_file_name,
+            pgn_eval_side_to_move,
+            convert_no_eval_fens_as_score_zero);
+    }
+
+    void convert_bin(std::istringstream& is)
+    {
+        std::vector<std::string> filenames;
+
+        string base_dir;
+        string target_dir;
+
+        int ply_minimum = 0;
+        int ply_maximum = 114514;
+        bool interpolate_eval = 0;
+        bool check_invalid_fen = false;
+        bool check_illegal_move = false;
+
+        bool pgn_eval_side_to_move = false;
+        bool convert_no_eval_fens_as_score_zero = false;
+
+        double src_score_min_value = 0.0;
+        double src_score_max_value = 1.0;
+        double dest_score_min_value = 0.0;
+        double dest_score_max_value = 1.0;
+
+        string output_file_name = "shuffled_sfen.bin";
+
+        while (true)
+        {
+            string option;
+            is >> option;
+
+            if (option == "")
+                break;
+
+            if (option == "targetdir") is >> target_dir;
+            else if (option == "targetfile")
+            {
+                std::string filename;
+                is >> filename;
+                filenames.push_back(filename);
+            }
+
+            else if (option == "basedir")   is >> base_dir;
+
+            else if (option == "ply_minimum") is >> ply_minimum;
+            else if (option == "ply_maximum") is >> ply_maximum;
+            else if (option == "interpolate_eval") is >> interpolate_eval;
+            else if (option == "check_invalid_fen") is >> check_invalid_fen;
+            else if (option == "check_illegal_move") is >> check_illegal_move;
+            else if (option == "pgn_eval_side_to_move") is >> pgn_eval_side_to_move;
+            else if (option == "convert_no_eval_fens_as_score_zero") is >> convert_no_eval_fens_as_score_zero;
+            else if (option == "src_score_min_value") is >> src_score_min_value;
+            else if (option == "src_score_max_value") is >> src_score_max_value;
+            else if (option == "dest_score_min_value") is >> dest_score_min_value;
+            else if (option == "dest_score_max_value") is >> dest_score_max_value;
+            else if (option == "output_file_name") is >> output_file_name;
+            else
+            {
+                cout << "Unknown option: " << option << ". Ignoring.\n";
+            }
+        }
+
+        if (!target_dir.empty())
+        {
+            append_files_from_dir(filenames, base_dir, target_dir);
+        }
+        rebase_files(filenames, base_dir);
+
+        Eval::NNUE::init();
+
+        cout << "convert_bin.." << endl;
+            convert_bin(
+                filenames,
+                output_file_name,
+                ply_minimum,
+                ply_maximum,
+                interpolate_eval,
+                src_score_min_value,
+                src_score_max_value,
+                dest_score_min_value,
+                dest_score_max_value,
+                check_invalid_fen,
+                check_illegal_move
+            );
+    }
+
+    void convert_plain(std::istringstream& is)
+    {
+        std::vector<std::string> filenames;
+
+        string base_dir;
+        string target_dir;
+
+        string output_file_name = "shuffled_sfen.bin";
+
+        while (true)
+        {
+            string option;
+            is >> option;
+
+            if (option == "")
+                break;
+
+            if (option == "targetdir") is >> target_dir;
+            else if (option == "targetfile")
+            {
+                std::string filename;
+                is >> filename;
+                filenames.push_back(filename);
+            }
+
+            else if (option == "basedir")   is >> base_dir;
+
+            else if (option == "output_file_name") is >> output_file_name;
+            else
+            {
+                cout << "Unknown option: " << option << ". Ignoring.\n";
+            }
+        }
+
+        if (!target_dir.empty())
+        {
+            append_files_from_dir(filenames, base_dir, target_dir);
+        }
+        rebase_files(filenames, base_dir);
+
+        Eval::NNUE::init();
+
+        cout << "convert_plain.." << endl;
+        convert_plain(filenames, output_file_name);
+    }
 }
diff --git a/src/learn/convert.h b/src/learn/convert.h
index a41885d9..227f0799 100644
--- a/src/learn/convert.h
+++ b/src/learn/convert.h
@@ -6,30 +6,13 @@
 #include <sstream>
 
 namespace Learner {
-    void convert_bin_from_pgn_extract(
-        const std::vector<std::string>& filenames,
-        const std::string& output_file_name,
-        const bool pgn_eval_side_to_move,
-        const bool convert_no_eval_fens_as_score_zero);
-
-    void convert_bin(
-        const std::vector<std::string>& filenames,
-        const std::string& output_file_name,
-        const int ply_minimum,
-        const int ply_maximum,
-        const int interpolate_eval,
-        const int src_score_min_value,
-        const int src_score_max_value,
-        const int dest_score_min_value,
-        const int dest_score_max_value,
-        const bool check_invalid_fen,
-        const bool check_illegal_move);
-
-    void convert_plain(
-        const std::vector<std::string>& filenames,
-        const std::string& output_file_name);
-
     void convert(std::istringstream& is);
+
+    void convert_bin_from_pgn_extract(std::istringstream& is);
+
+    void convert_bin(std::istringstream& is);
+
+    void convert_plain(std::istringstream& is);
 }
 
 #endif
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 2cab54b7..32aa986f 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -19,7 +19,6 @@
 
 #include "learn.h"
 
-#include "convert.h"
 #include "sfen_reader.h"
 
 #include "misc.h"
@@ -940,29 +939,8 @@ namespace Learner
 
         // Game file storage folder (get game file with relative path from here)
         string base_dir;
-
         string target_dir;
 
-        // --- Function that only shuffles the teacher aspect
-
-        // normal shuffle
-        // Conversion of packed sfen. In plain, it consists of sfen(string),
-        // evaluation value (integer), move (eg 7g7f, string), result (loss-1, win 1, draw 0)
-        bool use_convert_plain = false;
-        // convert plain format teacher to Yaneura King's bin
-        bool use_convert_bin = false;
-        int ply_minimum = 0;
-        int ply_maximum = 114514;
-        bool interpolate_eval = 0;
-        bool check_invalid_fen = false;
-        bool check_illegal_move = false;
-        // convert teacher in pgn-extract format to Yaneura King's bin
-        bool use_convert_bin_from_pgn_extract = false;
-        bool pgn_eval_side_to_move = false;
-        bool convert_no_eval_fens_as_score_zero = false;
-        // File name to write in those cases (default is "shuffled_sfen.bin")
-        string output_file_name = "shuffled_sfen.bin";
-
         // If the absolute value of the evaluation value
         // in the deep search of the teacher phase exceeds this value,
         // that phase is discarded.
@@ -1079,19 +1057,11 @@ namespace Learner
             else if (option == "loss_output_interval") is >> loss_output_interval;
             else if (option == "validation_set_file_name") is >> validation_set_file_name;
 
-            // Rabbit convert related
-            else if (option == "convert_plain") use_convert_plain = true;
-            else if (option == "convert_bin") use_convert_bin = true;
-            else if (option == "interpolate_eval") is >> interpolate_eval;
-            else if (option == "check_invalid_fen") is >> check_invalid_fen;
-            else if (option == "check_illegal_move") is >> check_illegal_move;
-            else if (option == "convert_bin_from_pgn-extract") use_convert_bin_from_pgn_extract = true;
-            else if (option == "pgn_eval_side_to_move") is >> pgn_eval_side_to_move;
-            else if (option == "convert_no_eval_fens_as_score_zero") is >> convert_no_eval_fens_as_score_zero;
             else if (option == "src_score_min_value") is >> src_score_min_value;
             else if (option == "src_score_max_value") is >> src_score_max_value;
             else if (option == "dest_score_min_value") is >> dest_score_min_value;
             else if (option == "dest_score_max_value") is >> dest_score_max_value;
+
             else if (option == "seed") is >> seed;
             else if (option == "set_recommended_uci_options")
             {
@@ -1123,8 +1093,8 @@ namespace Learner
         cout << "Warning! OpenMP disabled." << endl;
 #endif
 
-        LearnerThink learn_think(thread_num, seed);
-
+        // Right now we only have the individual files.
+        // We need to apply base_dir here
         rebase_files(filenames, base_dir);
         if (!target_dir.empty())
         {
@@ -1144,48 +1114,6 @@ namespace Learner
         cout << "base dir        : " << base_dir << endl;
         cout << "target dir      : " << target_dir << endl;
 
-        if (use_convert_plain)
-        {
-            Eval::NNUE::init();
-            cout << "convert_plain.." << endl;
-            convert_plain(filenames, output_file_name);
-            return;
-        }
-
-        if (use_convert_bin)
-        {
-            Eval::NNUE::init();
-            cout << "convert_bin.." << endl;
-            convert_bin(
-                filenames,
-                output_file_name,
-                ply_minimum,
-                ply_maximum,
-                interpolate_eval,
-                src_score_min_value,
-                src_score_max_value,
-                dest_score_min_value,
-                dest_score_max_value,
-                check_invalid_fen,
-                check_illegal_move);
-
-            return;
-
-        }
-
-        if (use_convert_bin_from_pgn_extract)
-        {
-            Eval::NNUE::init();
-            cout << "convert_bin_from_pgn-extract.." << endl;
-            convert_bin_from_pgn_extract(
-                filenames,
-                output_file_name,
-                pgn_eval_side_to_move,
-                convert_no_eval_fens_as_score_zero);
-
-            return;
-        }
-
         cout << "loop              : " << loop << endl;
         cout << "eval_limit        : " << eval_limit << endl;
         cout << "save_only_once    : " << (save_only_once ? "true" : "false") << endl;
@@ -1226,6 +1154,8 @@ namespace Learner
 
         cout << "init.." << endl;
 
+        LearnerThink learn_think(thread_num, seed);
+
         Threads.main()->ponder = false;
 
         set_learning_search_limits();
@@ -1244,8 +1174,6 @@ namespace Learner
                 Path::combine(Options["EvalSaveDir"], "original");
         }
 
-        cout << "init done." << endl;
-
         // Reflect other option settings.
         learn_think.eval_limit = eval_limit;
         learn_think.save_only_once = save_only_once;
@@ -1271,6 +1199,8 @@ namespace Learner
             }
         }
 
+        cout << "init done." << endl;
+
         // Start learning.
         learn_think.learn();
     }
diff --git a/src/uci.cpp b/src/uci.cpp
index b05c7eeb..398fd01a 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -341,6 +341,9 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "gensfen") Learner::gen_sfen(pos, is);
       else if (token == "learn") Learner::learn(pos, is);
       else if (token == "convert") Learner::convert(is);
+      else if (token == "convert_bin") Learner::convert_bin(is);
+      else if (token == "convert_plain") Learner::convert_plain(is);
+      else if (token == "convert_bin_from_pgn_extract") Learner::convert_bin_from_pgn_extract(is);
 
       // Command to call qsearch(),search() directly for testing
       else if (token == "qsearch") qsearch_cmd(pos);

From e4e9f7e39b7bf6c2aa627584fafd3afd7c5f74d5 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 23 Oct 2020 17:14:59 +0200
Subject: [PATCH 238/398] Reduce bench depth for testing with valgrind to
 prevent timeouts in CI.

---
 tests/instrumented.sh | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/tests/instrumented.sh b/tests/instrumented.sh
index 03e9c9de..788d8741 100755
--- a/tests/instrumented.sh
+++ b/tests/instrumented.sh
@@ -16,6 +16,9 @@ case $1 in
     exeprefix='valgrind --error-exitcode=42'
     postfix='1>/dev/null'
     threads="1"
+    bench_depth=5
+    go_depth=10
+    tt_size=16
   ;;
   --valgrind-thread)
     echo "valgrind-thread testing started"
@@ -23,6 +26,9 @@ case $1 in
     exeprefix='valgrind --fair-sched=try --error-exitcode=42'
     postfix='1>/dev/null'
     threads="2"
+    bench_depth=5
+    go_depth=10
+    tt_size=16
   ;;
   --sanitizer-undefined)
     echo "sanitizer-undefined testing started"
@@ -30,6 +36,9 @@ case $1 in
     exeprefix=''
     postfix='2>&1 | grep -A50 "runtime error:"'
     threads="1"
+    bench_depth=8
+    go_depth=20
+    tt_size=128
   ;;
   --sanitizer-thread)
     echo "sanitizer-thread testing started"
@@ -37,6 +46,9 @@ case $1 in
     exeprefix=''
     postfix='2>&1 | grep -A50 "WARNING: ThreadSanitizer:"'
     threads="2"
+    bench_depth=8
+    go_depth=20
+    tt_size=128
 
 cat << EOF > tsan.supp
 race:TTEntry::move
@@ -70,7 +82,7 @@ for args in "eval" \
             "go depth 10" \
             "go movetime 1000" \
             "go wtime 8000 btime 8000 winc 500 binc 500" \
-            "bench 128 $threads 8 default depth"
+            "bench $tt_size $threads $bench_depth default depth"
 do
 
    echo "$prefix $exeprefix ./stockfish $args $postfix"
@@ -98,7 +110,7 @@ cat << EOF > game.exp
  expect "bestmove"
 
  send "position fen 5rk1/1K4p1/8/8/3B4/8/8/8 b - - 0 1\n"
- send "go depth 20\n"
+ send "go depth $go_depth\n"
  expect "bestmove"
 
  send "quit\n"
@@ -121,7 +133,7 @@ cat << EOF > syzygy.exp
  send "uci\n"
  send "setoption name SyzygyPath value ../tests/syzygy/\n"
  expect "info string Found 35 tablebases" {} timeout {exit 1}
- send "bench 128 1 8 default depth\n"
+ send "bench $tt_size 1 $bench_depth default depth\n"
  send "quit\n"
  expect eof
 

From e4a38c18dd75d5109f6c5ba93071f68d197d5ed4 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 23 Oct 2020 22:03:01 +0200
Subject: [PATCH 239/398] Don't test syzygi

---
 tests/instrumented.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/instrumented.sh b/tests/instrumented.sh
index 788d8741..07ecbb9c 100755
--- a/tests/instrumented.sh
+++ b/tests/instrumented.sh
@@ -142,7 +142,7 @@ cat << EOF > syzygy.exp
  exit \$value
 EOF
 
-for exp in game.exp syzygy.exp
+for exp in game.exp
 do
 
   echo "$prefix expect $exp $postfix"

From 0636e1256d09edde22df3bc75207d75c24b6f2fa Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 18 Oct 2020 14:51:38 +0200
Subject: [PATCH 240/398] Add cyclic mode to the sfen reader. Make sfen reader
 take all files at construction

---
 src/learn/learn.cpp     | 44 ++++++++++++++++++--------------
 src/learn/sfen_reader.h | 56 ++++++++++++++++++++++++++++++-----------
 2 files changed, 66 insertions(+), 34 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 32aa986f..57dbeb63 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -381,9 +381,13 @@ namespace Learner
         // move match rate, simple comparison is not possible...
         static constexpr uint64_t sfen_for_mse_size = 2000;
 
-        LearnerThink(uint64_t thread_num, const std::string& seed) :
+        LearnerThink(
+            const std::vector<std::string>& filenames,
+            uint64_t thread_num,
+            const std::string& seed
+        ) :
             prng(seed),
-            sr(thread_num, std::to_string(prng.next_random_seed())),
+            sr(filenames, SfenReaderMode::Sequential, thread_num, std::to_string(prng.next_random_seed())),
             learn_loss_sum{}
         {
             save_only_once = false;
@@ -404,11 +408,6 @@ namespace Learner
             sr.set_do_shuffle(v);
         }
 
-        void add_file(const std::string& filename)
-        {
-            sr.add_file(filename);
-        }
-
         void learn();
 
 
@@ -1095,11 +1094,26 @@ namespace Learner
 
         // Right now we only have the individual files.
         // We need to apply base_dir here
-        rebase_files(filenames, base_dir);
         if (!target_dir.empty())
         {
             append_files_from_dir(filenames, base_dir, target_dir);
         }
+        rebase_files(filenames, base_dir);
+
+        // Insert the file name for the number of loops.
+        {
+            std::vector<std::string> filenamesTimesLoop;
+
+            for (int i = 0; i < loop; ++i)
+            {
+                for(auto& file : filenames)
+                {
+                    filenamesTimesLoop.emplace_back(file);
+                }
+            }
+
+            filenames = std::move(filenamesTimesLoop);
+        }
 
         cout << "learn from ";
         for (auto s : filenames)
@@ -1154,8 +1168,6 @@ namespace Learner
 
         cout << "init.." << endl;
 
-        LearnerThink learn_think(thread_num, seed);
-
         Threads.main()->ponder = false;
 
         set_learning_search_limits();
@@ -1164,6 +1176,9 @@ namespace Learner
         Eval::NNUE::initialize_training(seed);
         Eval::NNUE::set_batch_size(nn_batch_size);
         Eval::NNUE::set_options(nn_options);
+
+        LearnerThink learn_think(filenames, thread_num, seed);
+
         if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
             // Save the current net to [EvalSaveDir]\original.
             Eval::NNUE::save_eval("original");
@@ -1190,15 +1205,6 @@ namespace Learner
         learn_think.mini_batch_size = mini_batch_size;
         learn_think.validation_set_file_name = validation_set_file_name;
 
-        // Insert the file name for the number of loops.
-        for (int i = 0; i < loop; ++i)
-        {
-            for(auto& file : filenames)
-            {
-                learn_think.add_file(Path::combine(base_dir, file));
-            }
-        }
-
         cout << "init done." << endl;
 
         // Start learning.
diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
index 38c2532c..1ba9bd3b 100644
--- a/src/learn/sfen_reader.h
+++ b/src/learn/sfen_reader.h
@@ -18,6 +18,12 @@
 
 namespace Learner{
 
+    enum struct SfenReaderMode
+    {
+        Sequential,
+        Cyclic
+    };
+
     // Sfen reader
     struct SfenReader
     {
@@ -32,7 +38,14 @@ namespace Learner{
 
         // Do not use std::random_device().
         // Because it always the same integers on MinGW.
-        SfenReader(int thread_num, const std::string& seed) :
+        SfenReader(
+            const std::vector<std::string>& filenames_,
+            SfenReaderMode mode_,
+            int thread_num,
+            const std::string& seed
+        ) :
+            filenames(filenames_.begin(), filenames_.end()),
+            mode(mode_),
             prng(seed)
         {
             packed_sfens.resize(thread_num);
@@ -173,6 +186,9 @@ namespace Learner{
 
         void file_read_worker()
         {
+            std::string currentFilename;
+            uint64_t numEntriesReadFromCurrentFile = 0;
+
             auto open_next_file = [&]() {
                 // no more
                 for(;;)
@@ -183,18 +199,20 @@ namespace Learner{
                         return false;
 
                     // Get the next file name.
-                    std::string filename = filenames.front();
+                    currentFilename = filenames.front();
                     filenames.pop_front();
 
-                    sfen_input_stream = open_sfen_input_file(filename);
+                    numEntriesReadFromCurrentFile = 0;
+
+                    sfen_input_stream = open_sfen_input_file(currentFilename);
 
                     if (sfen_input_stream == nullptr)
                     {
-                        std::cout << "File does not exist: " << filename << '\n';
+                        std::cout << "File does not exist: " << currentFilename << '\n';
                     }
                     else
                     {
-                        std::cout << "Opened file for reading: " << filename << '\n';
+                        std::cout << "Opened file for reading: " << currentFilename << '\n';
 
                         // in case the file is empty or was deleted.
                         if (sfen_input_stream->eof())
@@ -236,13 +254,24 @@ namespace Learner{
                     if (p.has_value())
                     {
                         sfens.push_back(*p);
+                        ++numEntriesReadFromCurrentFile;
                     }
-                    else if(!open_next_file())
+                    else
                     {
-                        // There was no next file. Abort.
-                        std::cout << "..end of files." << std::endl;
-                        end_of_files = true;
-                        return;
+                        if (mode == SfenReaderMode::Cyclic
+                            && numEntriesReadFromCurrentFile > 0)
+                        {
+                            // The file contained data so we add it again to the end of the queue.
+                            filenames.emplace_back(currentFilename);
+                        }
+
+                        if(!open_next_file())
+                        {
+                            // There was no next file. Abort.
+                            std::cout << "..end of files." << std::endl;
+                            end_of_files = true;
+                            return;
+                        }
                     }
                 }
 
@@ -295,11 +324,6 @@ namespace Learner{
             shuffle = v;
         }
 
-        void add_file(const std::string& filename)
-        {
-            filenames.push_back(filename);
-        }
-
     protected:
 
         // worker thread reading file in background
@@ -316,6 +340,8 @@ namespace Learner{
         // Do not shuffle when reading the phase.
         bool shuffle;
 
+        SfenReaderMode mode;
+
         // Random number to shuffle when reading the phase
         PRNG prng;
 

From c58aa9696ad3c579fe7f610505fe6b0903062182 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 22 Oct 2020 14:43:38 +0200
Subject: [PATCH 241/398] Start sfen reader worker thread in the constructor.

---
 src/learn/learn.cpp     |  3 ---
 src/learn/sfen_reader.h | 12 ++++--------
 2 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 57dbeb63..cc51b04e 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -490,9 +490,6 @@ namespace Learner
 
         Eval::NNUE::verify_any_net_loaded();
 
-        // Start a thread that loads the training data in the background
-        sr.start_file_read_worker();
-
         const PSVector sfen_for_mse =
             validation_set_file_name.empty()
             ? sr.read_for_mse(sfen_for_mse_size)
diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
index 1ba9bd3b..78bf4ee8 100644
--- a/src/learn/sfen_reader.h
+++ b/src/learn/sfen_reader.h
@@ -53,6 +53,10 @@ namespace Learner{
             end_of_files = false;
             shuffle = true;
             stop_flag = false;
+
+            file_worker_thread = std::thread([&] {
+                this->file_read_worker();
+            });
         }
 
         ~SfenReader()
@@ -176,14 +180,6 @@ namespace Learner{
 
         }
 
-        // Start a thread that loads the phase file in the background.
-        void start_file_read_worker()
-        {
-            file_worker_thread = std::thread([&] {
-                this->file_read_worker();
-                });
-        }
-
         void file_read_worker()
         {
             std::string currentFilename;

From ad3d1b42e4f5de24608053e0d27031c4c52887d9 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 22 Oct 2020 14:46:19 +0200
Subject: [PATCH 242/398] Make sfen reader only stop when it's destroyed. Now
 it is fully RAII.

---
 src/learn/learn.cpp     | 4 ----
 src/learn/sfen_reader.h | 7 ++-----
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index cc51b04e..3e4f9495 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -502,8 +502,6 @@ namespace Learner
                 << "Error reading sfen_for_mse. Read " << sfen_for_mse.size()
                 << " out of " << sfen_for_mse_size << '\n';
 
-            sr.stop();
-
             return;
         }
 
@@ -541,8 +539,6 @@ namespace Learner
                 break;
         }
 
-        sr.stop();
-
         Eval::NNUE::finalize_net();
 
         save(true);
diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
index 78bf4ee8..0ef9765b 100644
--- a/src/learn/sfen_reader.h
+++ b/src/learn/sfen_reader.h
@@ -61,6 +61,8 @@ namespace Learner{
 
         ~SfenReader()
         {
+            stop_flag = true;
+
             if (file_worker_thread.joinable())
                 file_worker_thread.join();
         }
@@ -310,11 +312,6 @@ namespace Learner{
             }
         }
 
-        void stop()
-        {
-            stop_flag = true;
-        }
-
         void set_do_shuffle(bool v)
         {
             shuffle = v;

From fc3788f630b4524bfd50ade8ed46b0f007fa1b5a Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 22 Oct 2020 15:45:33 +0200
Subject: [PATCH 243/398] Use cyclic sfen reader for learning, change loop
 option to epochs.

---
 src/learn/learn.cpp | 52 ++++++++++++++-------------------------------
 1 file changed, 16 insertions(+), 36 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 3e4f9495..66a27b28 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -387,7 +387,7 @@ namespace Learner
             const std::string& seed
         ) :
             prng(seed),
-            sr(filenames, SfenReaderMode::Sequential, thread_num, std::to_string(prng.next_random_seed())),
+            sr(filenames, SfenReaderMode::Cyclic, thread_num, std::to_string(prng.next_random_seed())),
             learn_loss_sum{}
         {
             save_only_once = false;
@@ -408,7 +408,7 @@ namespace Learner
             sr.set_do_shuffle(v);
         }
 
-        void learn();
+        void learn(uint64_t epochs);
 
 
         std::string validation_set_file_name;
@@ -439,9 +439,9 @@ namespace Learner
     private:
         void learn_worker(Thread& th, std::atomic<uint64_t>& counter, uint64_t limit);
 
-        void update_weights(const PSVector& psv);
+        void update_weights(const PSVector& psv, uint64_t epoch);
 
-        void calc_loss(const PSVector& psv);
+        void calc_loss(const PSVector& psv, uint64_t epoch);
 
         void calc_loss_worker(
             Thread& th,
@@ -465,9 +465,6 @@ namespace Learner
         uint64_t save_count;
         uint64_t loss_output_count;
 
-        // Learning iteration counter
-        uint64_t epoch = 0;
-
         std::atomic<bool> stop_flag;
 
         uint64_t total_done;
@@ -481,7 +478,7 @@ namespace Learner
         AtomicLoss learn_loss_sum;
     };
 
-    void LearnerThink::learn()
+    void LearnerThink::learn(uint64_t epochs)
     {
 
 #if defined(_OPENMP)
@@ -507,7 +504,7 @@ namespace Learner
 
         if (newbob_decay != 1.0) {
 
-            calc_loss(sfen_for_mse);
+            calc_loss(sfen_for_mse, 0);
 
             best_loss = latest_loss_sum / latest_loss_count;
             latest_loss_sum = 0.0;
@@ -518,7 +515,7 @@ namespace Learner
 
         stop_flag = false;
 
-        for(;;)
+        for(uint64_t epoch = 1; epoch <= epochs; ++epoch)
         {
             std::atomic<uint64_t> counter{0};
 
@@ -533,7 +530,7 @@ namespace Learner
             if (stop_flag)
                 break;
 
-            update_weights(sfen_for_mse);
+            update_weights(sfen_for_mse, epoch);
 
             if (stop_flag)
                 break;
@@ -639,7 +636,7 @@ namespace Learner
         learn_loss_sum += local_loss_sum;
     }
 
-    void LearnerThink::update_weights(const PSVector& psv)
+    void LearnerThink::update_weights(const PSVector& psv, uint64_t epoch)
     {
         // I'm not sure this fencing is correct. But either way there
         // should be no real issues happening since
@@ -648,8 +645,6 @@ namespace Learner
         Eval::NNUE::update_parameters();
         atomic_thread_fence(memory_order_seq_cst);
 
-        ++epoch;
-
         if (++save_count * mini_batch_size >= eval_save_interval)
         {
             save_count = 0;
@@ -667,13 +662,13 @@ namespace Learner
             loss_output_count = 0;
 
             // loss calculation
-            calc_loss(psv);
+            calc_loss(psv, epoch);
 
             Eval::NNUE::check_health();
         }
     }
 
-    void LearnerThink::calc_loss(const PSVector& psv)
+    void LearnerThink::calc_loss(const PSVector& psv, uint64_t epoch)
     {
         TT.new_search();
         TimePoint elapsed = now() - Search::Limits.startTime + 1;
@@ -926,8 +921,8 @@ namespace Learner
         // mini_batch_size 1M aspect by default. This can be increased.
         auto mini_batch_size = LEARN_MINI_BATCH_SIZE;
 
-        // Number of loops (read the game record file this number of times)
-        int loop = 1;
+        // Number of epochs
+        uint64_t epochs = 1;
 
         // Game file storage folder (get game file with relative path from here)
         string base_dir;
@@ -996,7 +991,7 @@ namespace Learner
             }
 
             // Specify the number of loops
-            else if (option == "loop")      is >> loop;
+            else if (option == "epochs")      is >> epochs;
 
             // Game file storage folder (get game file with relative path from here)
             else if (option == "basedir")   is >> base_dir;
@@ -1093,21 +1088,6 @@ namespace Learner
         }
         rebase_files(filenames, base_dir);
 
-        // Insert the file name for the number of loops.
-        {
-            std::vector<std::string> filenamesTimesLoop;
-
-            for (int i = 0; i < loop; ++i)
-            {
-                for(auto& file : filenames)
-                {
-                    filenamesTimesLoop.emplace_back(file);
-                }
-            }
-
-            filenames = std::move(filenamesTimesLoop);
-        }
-
         cout << "learn from ";
         for (auto s : filenames)
             cout << s << " , ";
@@ -1121,7 +1101,7 @@ namespace Learner
         cout << "base dir        : " << base_dir << endl;
         cout << "target dir      : " << target_dir << endl;
 
-        cout << "loop              : " << loop << endl;
+        cout << "epochs            : " << epochs << endl;
         cout << "eval_limit        : " << eval_limit << endl;
         cout << "save_only_once    : " << (save_only_once ? "true" : "false") << endl;
         cout << "no_shuffle        : " << (no_shuffle ? "true" : "false") << endl;
@@ -1201,7 +1181,7 @@ namespace Learner
         cout << "init done." << endl;
 
         // Start learning.
-        learn_think.learn();
+        learn_think.learn(epochs);
     }
 
 } // namespace Learner

From 31f94a18b3368a533b874e5e5b65970725f30597 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 22 Oct 2020 15:57:46 +0200
Subject: [PATCH 244/398] Update readme and docs after change from loop to
 epochs.

---
 README.md           | 2 +-
 docs/learn.md       | 2 +-
 src/learn/learn.cpp | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 84898792..5fa8179e 100644
--- a/README.md
+++ b/README.md
@@ -78,7 +78,7 @@ setoption name SkipLoadingEval value true
 setoption name Use NNUE value pure
 setoption name Threads value x
 isready
-learn targetdir trainingdata loop 100 batchsize 1000000 use_draw_in_training 1 use_draw_in_validation 1 lr 1 lambda 1 eval_limit 32000 nn_batch_size 1000 newbob_decay 0.5 eval_save_interval 250000000 loss_output_interval 1000000 validation_set_file_name validationdata\val.binpack
+learn targetdir trainingdata epochs 10000 batchsize 1000000 use_draw_in_training 1 use_draw_in_validation 1 lr 1 lambda 1 eval_limit 32000 nn_batch_size 1000 newbob_decay 0.5 eval_save_interval 250000000 loss_output_interval 1000000 validation_set_file_name validationdata\val.binpack
 ```
 
 This will utilize training data files in the "trainingdata" directory and validation data from file "validationdata\val.bin". Produced nets are saved in the "evalsave" folder.
diff --git a/docs/learn.md b/docs/learn.md
index 3a580134..dc55ec1f 100644
--- a/docs/learn.md
+++ b/docs/learn.md
@@ -20,7 +20,7 @@ Currently the following options are available:
 
 `targetdir` - path to the direction from which training data will be read. All files in this directory are read sequentially. If not specified then only the list of files from positional arguments will be used. If specified then files from the given directory will be used after the explicitly specified files.
 
-`loop` - the number of times to loop over all training data.
+`epochs` - the number of weight update cycles (epochs) to train the network for. One such cycle is `batchsize` positions. If not specified then the training will loop forever.
 
 `basedir` - the base directory for the paths. Default: "" (current directory)
 
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 66a27b28..328f646a 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -922,7 +922,7 @@ namespace Learner
         auto mini_batch_size = LEARN_MINI_BATCH_SIZE;
 
         // Number of epochs
-        uint64_t epochs = 1;
+        uint64_t epochs = std::numeric_limits<uint64_t>::max();
 
         // Game file storage folder (get game file with relative path from here)
         string base_dir;

From 8fb208598b496cf79b8f457b064eb9800ca59cf3 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 23 Oct 2020 22:19:50 +0200
Subject: [PATCH 245/398] pass shuffle flag in the constructor

---
 src/learn/learn.cpp     | 11 +++--------
 src/learn/sfen_reader.h |  8 ++------
 2 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 328f646a..fa447a77 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -383,11 +383,12 @@ namespace Learner
 
         LearnerThink(
             const std::vector<std::string>& filenames,
+            bool shuffle,
             uint64_t thread_num,
             const std::string& seed
         ) :
             prng(seed),
-            sr(filenames, SfenReaderMode::Cyclic, thread_num, std::to_string(prng.next_random_seed())),
+            sr(filenames, shuffle, SfenReaderMode::Cyclic, thread_num, std::to_string(prng.next_random_seed())),
             learn_loss_sum{}
         {
             save_only_once = false;
@@ -403,11 +404,6 @@ namespace Learner
             total_done = 0;
         }
 
-        void set_do_shuffle(bool v)
-        {
-            sr.set_do_shuffle(v);
-        }
-
         void learn(uint64_t epochs);
 
 
@@ -1150,7 +1146,7 @@ namespace Learner
         Eval::NNUE::set_batch_size(nn_batch_size);
         Eval::NNUE::set_options(nn_options);
 
-        LearnerThink learn_think(filenames, thread_num, seed);
+        LearnerThink learn_think(filenames, !no_shuffle, thread_num, seed);
 
         if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
             // Save the current net to [EvalSaveDir]\original.
@@ -1165,7 +1161,6 @@ namespace Learner
         // Reflect other option settings.
         learn_think.eval_limit = eval_limit;
         learn_think.save_only_once = save_only_once;
-        learn_think.set_do_shuffle(!no_shuffle);
         learn_think.reduction_gameply = reduction_gameply;
 
         learn_think.newbob_decay = newbob_decay;
diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
index 0ef9765b..d39fef4e 100644
--- a/src/learn/sfen_reader.h
+++ b/src/learn/sfen_reader.h
@@ -40,6 +40,7 @@ namespace Learner{
         // Because it always the same integers on MinGW.
         SfenReader(
             const std::vector<std::string>& filenames_,
+            bool do_shuffle,
             SfenReaderMode mode_,
             int thread_num,
             const std::string& seed
@@ -51,7 +52,7 @@ namespace Learner{
             packed_sfens.resize(thread_num);
             total_read = 0;
             end_of_files = false;
-            shuffle = true;
+            shuffle = do_shuffle;
             stop_flag = false;
 
             file_worker_thread = std::thread([&] {
@@ -312,11 +313,6 @@ namespace Learner{
             }
         }
 
-        void set_do_shuffle(bool v)
-        {
-            shuffle = v;
-        }
-
     protected:
 
         // worker thread reading file in background

From d31169bab5f9545b147701ad55bc68984180ed71 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 13:44:19 +0200
Subject: [PATCH 246/398] Update CI to use epochs instead of loops.

---
 tests/instrumented_learn.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
index 267a3bb6..ff1a8a72 100755
--- a/tests/instrumented_learn.sh
+++ b/tests/instrumented_learn.sh
@@ -127,7 +127,7 @@ cat << EOF > learn01.exp
  send "setoption name Use NNUE value true\n"
  send "setoption name Threads value $threads\n"
  send "isready\n"
- send "learn targetdir training_data loop 2 batchsize 100 use_draw_in_training 1 use_draw_in_validation 1 lr 1 eval_limit 32000 nn_batch_size 30 newbob_decay 0.5 eval_save_interval 30 loss_output_interval 10 validation_set_file_name validation_data/validation_data.bin\n"
+ send "learn targetdir training_data epochs 1 batchsize 100 use_draw_in_training 1 use_draw_in_validation 1 lr 1 eval_limit 32000 nn_batch_size 30 newbob_decay 0.5 eval_save_interval 30 loss_output_interval 10 validation_set_file_name validation_data/validation_data.bin\n"
 
  expect "save_eval() finished."
 

From 371acaa0b56391919075dcabfac5e33ca830495d Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 14:51:28 +0200
Subject: [PATCH 247/398] Allow changing sfen reader buffer sizes for the learn
 command.

---
 src/learn/learn.cpp     | 27 ++++++++++++++++++++++++---
 src/learn/learn.h       |  6 ------
 src/learn/sfen_reader.h | 35 +++++++++++++++++++++--------------
 3 files changed, 45 insertions(+), 23 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index fa447a77..7de359ef 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -385,10 +385,19 @@ namespace Learner
             const std::vector<std::string>& filenames,
             bool shuffle,
             uint64_t thread_num,
-            const std::string& seed
+            const std::string& seed,
+            size_t read_size,
+            size_t buffer_size
         ) :
             prng(seed),
-            sr(filenames, shuffle, SfenReaderMode::Cyclic, thread_num, std::to_string(prng.next_random_seed())),
+            sr(
+                filenames,
+                shuffle,
+                SfenReaderMode::Cyclic,
+                thread_num,
+                std::to_string(prng.next_random_seed()),
+                read_size,
+                buffer_size),
             learn_loss_sum{}
         {
             save_only_once = false;
@@ -958,6 +967,9 @@ namespace Learner
         uint64_t eval_save_interval = LEARN_EVAL_SAVE_INTERVAL;
         uint64_t loss_output_interval = 1'000'000;
 
+        size_t sfen_read_size = SfenReader::DEFAULT_SFEN_READ_SIZE;
+        size_t thread_buffer_size = SfenReader::DEFAULT_THREAD_BUFFER_SIZE;
+
         string validation_set_file_name;
         string seed;
 
@@ -1045,6 +1057,9 @@ namespace Learner
             else if (option == "dest_score_min_value") is >> dest_score_min_value;
             else if (option == "dest_score_max_value") is >> dest_score_max_value;
 
+            else if (option == "sfen_read_size") is >> sfen_read_size;
+            else if (option == "thread_buffer_size") is >> thread_buffer_size;
+
             else if (option == "seed") is >> seed;
             else if (option == "set_recommended_uci_options")
             {
@@ -1146,7 +1161,13 @@ namespace Learner
         Eval::NNUE::set_batch_size(nn_batch_size);
         Eval::NNUE::set_options(nn_options);
 
-        LearnerThink learn_think(filenames, !no_shuffle, thread_num, seed);
+        LearnerThink learn_think(
+            filenames,
+            !no_shuffle,
+            thread_num,
+            seed,
+            sfen_read_size,
+            thread_buffer_size);
 
         if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
             // Save the current net to [EvalSaveDir]\original.
diff --git a/src/learn/learn.h b/src/learn/learn.h
index 3ba75ce3..5efeb516 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -54,12 +54,6 @@ namespace Learner
 
     constexpr std::size_t LEARN_MINI_BATCH_SIZE = 1000 * 1000 * 1;
 
-    // The number of phases to read from the file at one time. After reading this much, shuffle.
-    // It is better to have a certain size, but this number x 40 bytes x 3 times as much memory is consumed. 400MB*3 is consumed in the 10M phase.
-    // Must be a multiple of THREAD_BUFFER_SIZE(=10000).
-
-    constexpr std::size_t LEARN_SFEN_READ_SIZE = 1000 * 1000 * 10;
-
     // Saving interval of evaluation function at learning. Save each time you learn this number of phases.
     // Needless to say, the longer the saving interval, the shorter the learning time.
     // Folder name is incremented for each save like 0/, 1/, 2/...
diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
index d39fef4e..71767bc6 100644
--- a/src/learn/sfen_reader.h
+++ b/src/learn/sfen_reader.h
@@ -28,13 +28,13 @@ namespace Learner{
     struct SfenReader
     {
         // Number of phases buffered by each thread 0.1M phases. 4M phase at 40HT
-        static constexpr size_t THREAD_BUFFER_SIZE = 10 * 1000;
+        static constexpr size_t DEFAULT_THREAD_BUFFER_SIZE = 10 * 1000;
 
         // Buffer for reading files (If this is made larger,
         // the shuffle becomes larger and the phases may vary.
         // If it is too large, the memory consumption will increase.
         // SFEN_READ_SIZE is a multiple of THREAD_BUFFER_SIZE.
-        static constexpr const size_t SFEN_READ_SIZE = LEARN_SFEN_READ_SIZE;
+        static constexpr const size_t DEFAULT_SFEN_READ_SIZE = 1000 * 1000 * 10;
 
         // Do not use std::random_device().
         // Because it always the same integers on MinGW.
@@ -43,10 +43,14 @@ namespace Learner{
             bool do_shuffle,
             SfenReaderMode mode_,
             int thread_num,
-            const std::string& seed
+            const std::string& seed,
+            size_t read_size = DEFAULT_SFEN_READ_SIZE,
+            size_t buffer_size = DEFAULT_THREAD_BUFFER_SIZE
         ) :
             filenames(filenames_.begin(), filenames_.end()),
             mode(mode_),
+            sfen_read_size(read_size),
+            thread_buffer_size(buffer_size),
             prng(seed)
         {
             packed_sfens.resize(thread_num);
@@ -165,7 +169,7 @@ namespace Learner{
                         packed_sfens[thread_id] = std::move(packed_sfens_pool.front());
                         packed_sfens_pool.pop_front();
 
-                        total_read += THREAD_BUFFER_SIZE;
+                        total_read += thread_buffer_size;
 
                         return true;
                     }
@@ -237,17 +241,17 @@ namespace Learner{
             {
                 // Wait for the buffer to run out.
                 // This size() is read only, so you don't need to lock it.
-                while (!stop_flag && packed_sfens_pool.size() >= SFEN_READ_SIZE / THREAD_BUFFER_SIZE)
+                while (!stop_flag && packed_sfens_pool.size() >= sfen_read_size / thread_buffer_size)
                     sleep(100);
 
                 if (stop_flag)
                     return;
 
                 PSVector sfens;
-                sfens.reserve(SFEN_READ_SIZE);
+                sfens.reserve(sfen_read_size);
 
                 // Read from the file into the file buffer.
-                while (sfens.size() < SFEN_READ_SIZE)
+                while (sfens.size() < sfen_read_size)
                 {
                     std::optional<PackedSfenValue> p = sfen_input_stream->next();
                     if (p.has_value())
@@ -280,11 +284,11 @@ namespace Learner{
                     Algo::shuffle(sfens, prng);
                 }
 
-                // Divide this by THREAD_BUFFER_SIZE. There should be size pieces.
-                // SFEN_READ_SIZE shall be a multiple of THREAD_BUFFER_SIZE.
-                assert((SFEN_READ_SIZE % THREAD_BUFFER_SIZE) == 0);
+                // Divide this by thread_buffer_size. There should be size pieces.
+                // sfen_read_size shall be a multiple of thread_buffer_size.
+                assert((sfen_read_size % thread_buffer_size) == 0);
 
-                auto size = size_t(SFEN_READ_SIZE / THREAD_BUFFER_SIZE);
+                auto size = size_t(sfen_read_size / thread_buffer_size);
                 std::vector<std::unique_ptr<PSVector>> buffers;
                 buffers.reserve(size);
 
@@ -292,11 +296,11 @@ namespace Learner{
                 {
                     // Delete this pointer on the receiving side.
                     auto buf = std::make_unique<PSVector>();
-                    buf->resize(THREAD_BUFFER_SIZE);
+                    buf->resize(thread_buffer_size);
                     memcpy(
                         buf->data(),
-                        &sfens[i * THREAD_BUFFER_SIZE],
-                        sizeof(PackedSfenValue) * THREAD_BUFFER_SIZE);
+                        &sfens[i * thread_buffer_size],
+                        sizeof(PackedSfenValue) * thread_buffer_size);
 
                     buffers.emplace_back(std::move(buf));
                 }
@@ -331,6 +335,9 @@ namespace Learner{
 
         SfenReaderMode mode;
 
+        size_t sfen_read_size;
+        size_t thread_buffer_size;
+
         // Random number to shuffle when reading the phase
         PRNG prng;
 

From 47a82bfc912516fbbad325a24134647af0a4e81d Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 14:52:50 +0200
Subject: [PATCH 248/398] Document new options.

---
 docs/learn.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/learn.md b/docs/learn.md
index dc55ec1f..f815284c 100644
--- a/docs/learn.md
+++ b/docs/learn.md
@@ -74,6 +74,10 @@ Currently the following options are available:
 
 `validation_set_file_name` - path to the file with training data to be used for validation (loss computation and move accuracy)
 
+`sfen_read_size` - the number of sfens to always keep in the buffer. Default: 10000000 (10M)
+
+`thread_buffer_size` - the number of sfens to copy at once to each thread requesting more sfens for learning. Default: 10000
+
 `seed` - seed for the PRNG. Can be either a number or a string. If it's a string then its hash will be used. If not specified then the current time will be used.
 
 ## Legacy subcommands and parameters

From 3bf397a569f958e1fbbb4d96c4f8f89af53b9a41 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 14:54:00 +0200
Subject: [PATCH 249/398] Update instrumented_learn for the current codebase.

---
 tests/instrumented_learn.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
index ff1a8a72..4ce3dc1c 100755
--- a/tests/instrumented_learn.sh
+++ b/tests/instrumented_learn.sh
@@ -80,7 +80,7 @@ cat << EOF > gensfen01.exp
  send "isready\n"
  send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.bin sfen_format bin\n"
  expect "gensfen finished."
- send "learn training_data/training_data.bin convert_plain output_file_name training_data.txt\n"
+ send "convert_plain targetfile training_data/training_data.bin output_file_name training_data.txt\n"
  expect "all done"
  send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.binpack sfen_format binpack\n"
  expect "gensfen finished."
@@ -127,7 +127,7 @@ cat << EOF > learn01.exp
  send "setoption name Use NNUE value true\n"
  send "setoption name Threads value $threads\n"
  send "isready\n"
- send "learn targetdir training_data epochs 1 batchsize 100 use_draw_in_training 1 use_draw_in_validation 1 lr 1 eval_limit 32000 nn_batch_size 30 newbob_decay 0.5 eval_save_interval 30 loss_output_interval 10 validation_set_file_name validation_data/validation_data.bin\n"
+ send "learn targetdir training_data epochs 1 sfen_read_size 100 thread_buffer_size 10 batchsize 100 use_draw_in_training 1 use_draw_in_validation 1 lr 1 eval_limit 32000 nn_batch_size 30 newbob_decay 0.5 eval_save_interval 30 loss_output_interval 10 validation_set_file_name validation_data/validation_data.bin\n"
 
  expect "save_eval() finished."
 

From be3937c37bfc6e69a26b307d2179c16409ccadc8 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 10:52:49 +0200
Subject: [PATCH 250/398] Print layers and their indices during training
 initialization.

---
 src/nnue/evaluate_nnue.cpp          |  6 ++
 src/nnue/evaluate_nnue.h            |  2 +
 src/nnue/evaluate_nnue_learner.cpp  |  7 +++
 src/nnue/layers/affine_transform.h  | 21 ++++++-
 src/nnue/layers/clipped_relu.h      | 19 +++++-
 src/nnue/layers/input_slice.h       | 93 ++++++++++++++++-------------
 src/nnue/layers/sum.h               | 38 ++++++++++--
 src/nnue/nnue_feature_transformer.h | 17 +++++-
 8 files changed, 152 insertions(+), 51 deletions(-)

diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index 67398f81..9da8b1e6 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -71,6 +71,12 @@ namespace Eval::NNUE {
             ",Network=" + Network::get_structure_string();
     }
 
+    std::string get_layers_info() {
+        return
+            FeatureTransformer::get_layers_info()
+            + '\n' + Network::get_layers_info();
+    }
+
     UseNNUEMode useNNUE;
     std::string eval_file_loaded = "None";
 
diff --git a/src/nnue/evaluate_nnue.h b/src/nnue/evaluate_nnue.h
index d0f61644..100e693c 100644
--- a/src/nnue/evaluate_nnue.h
+++ b/src/nnue/evaluate_nnue.h
@@ -81,6 +81,8 @@ namespace Eval::NNUE {
     // Get a string that represents the structure of the evaluation function
     std::string get_architecture_string();
 
+    std::string get_layers_info();
+
     // read the header
     bool read_header(std::istream& stream,
         std::uint32_t* hash_value, std::string* architecture);
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index e0236781..54525fe4 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -58,6 +58,13 @@ namespace Eval::NNUE {
         std::cout << "Initializing NN training for "
                   << get_architecture_string() << std::endl;
 
+        std::cout << std::endl;
+
+        std::cout << "Layers:\n"
+                  << get_layers_info() << std::endl;
+
+        std::cout << std::endl;
+
         assert(feature_transformer);
         assert(network);
         trainer = Trainer<Network>::create(network.get(), feature_transformer.get());
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index 6efaecbc..e734580e 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -57,6 +57,8 @@ namespace Eval::NNUE::Layers {
         static constexpr std::size_t kBufferSize =
             PreviousLayer::kBufferSize + kSelfBufferSize;
 
+        static constexpr int kLayerIndex = PreviousLayer::kLayerIndex + 1;
+
         // Hash value embedded in the evaluation file
         static constexpr std::uint32_t get_hash_value() {
             std::uint32_t hash_value = 0xCC03DAE4u;
@@ -66,14 +68,27 @@ namespace Eval::NNUE::Layers {
             return hash_value;
         }
 
-        // A string that represents the structure from the input layer to this layer
-        static std::string get_structure_string() {
+        static std::string get_name() {
             return "AffineTransform[" +
                 std::to_string(kOutputDimensions) + "<-" +
-                std::to_string(kInputDimensions) + "](" +
+                std::to_string(kInputDimensions) + "]";
+        }
+
+        // A string that represents the structure from the input layer to this layer
+        static std::string get_structure_string() {
+            return get_name() + "(" +
                 PreviousLayer::get_structure_string() + ")";
         }
 
+        static std::string get_layers_info() {
+            std::string info = PreviousLayer::get_layers_info();
+            info += '\n';
+            info += std::to_string(kLayerIndex);
+            info += ": ";
+            info += get_name();
+            return info;
+        }
+
        // Read network parameters
         bool read_parameters(std::istream& stream) {
             if (!previous_layer_.read_parameters(stream))
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index 889effa7..5fbd66cc 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -54,6 +54,8 @@ namespace Eval::NNUE::Layers {
         static constexpr std::size_t kBufferSize =
             PreviousLayer::kBufferSize + kSelfBufferSize;
 
+        static constexpr int kLayerIndex = PreviousLayer::kLayerIndex + 1;
+
         // Hash value embedded in the evaluation file
         static constexpr std::uint32_t get_hash_value() {
             std::uint32_t hash_value = 0x538D24C7u;
@@ -61,13 +63,26 @@ namespace Eval::NNUE::Layers {
             return hash_value;
         }
 
+        static std::string get_name() {
+            return "ClippedReLU[" +
+                std::to_string(kOutputDimensions) + "]";
+        }
+
         // A string that represents the structure from the input layer to this layer
         static std::string get_structure_string() {
-            return "ClippedReLU[" +
-                std::to_string(kOutputDimensions) + "](" +
+            return get_name() + "(" +
                 PreviousLayer::get_structure_string() + ")";
         }
 
+        static std::string get_layers_info() {
+            std::string info = PreviousLayer::get_layers_info();
+            info += '\n';
+            info += std::to_string(kLayerIndex);
+            info += ": ";
+            info += get_name();
+            return info;
+        }
+
         // Read network parameters
         bool read_parameters(std::istream& stream) {
             return previous_layer_.read_parameters(stream);
diff --git a/src/nnue/layers/input_slice.h b/src/nnue/layers/input_slice.h
index b69028ab..56c738af 100644
--- a/src/nnue/layers/input_slice.h
+++ b/src/nnue/layers/input_slice.h
@@ -28,56 +28,69 @@
 
 namespace Eval::NNUE::Layers {
 
-  // Input layer
-  template <IndexType OutputDimensions, IndexType Offset = 0>
-  class InputSlice {
-  public:
-      // Need to maintain alignment
-      static_assert(Offset % kMaxSimdWidth == 0, "");
+    // Input layer
+    template <IndexType OutputDimensions, IndexType Offset = 0>
+    class InputSlice {
+    public:
+        // Need to maintain alignment
+        static_assert(Offset % kMaxSimdWidth == 0, "");
 
-      // Output type
-      using OutputType = TransformedFeatureType;
+        // Output type
+        using OutputType = TransformedFeatureType;
 
-      // Output dimensionality
-      static constexpr IndexType kOutputDimensions = OutputDimensions;
+        // Output dimensionality
+        static constexpr IndexType kOutputDimensions = OutputDimensions;
 
-      // Size of forward propagation buffer used from the input layer to this layer
-      static constexpr std::size_t kBufferSize = 0;
+        // Size of forward propagation buffer used from the input layer to this layer
+        static constexpr std::size_t kBufferSize = 0;
 
-      // Hash value embedded in the evaluation file
-      static constexpr std::uint32_t get_hash_value() {
-          std::uint32_t hash_value = 0xEC42E90Du;
-          hash_value ^= kOutputDimensions ^ (Offset << 10);
-          return hash_value;
-      }
+        static constexpr int kLayerIndex = 1;
 
-      // A string that represents the structure from the input layer to this layer
-      static std::string get_structure_string() {
-          return "InputSlice[" + std::to_string(kOutputDimensions) + "(" +
-              std::to_string(Offset) + ":" +
-              std::to_string(Offset + kOutputDimensions) + ")]";
-      }
+        // Hash value embedded in the evaluation file
+        static constexpr std::uint32_t get_hash_value() {
+            std::uint32_t hash_value = 0xEC42E90Du;
+            hash_value ^= kOutputDimensions ^ (Offset << 10);
+            return hash_value;
+        }
 
-      // Read network parameters
-      bool read_parameters(std::istream& /*stream*/) {
-          return true;
-      }
+        static std::string get_name() {
+            return "InputSlice[" + std::to_string(kOutputDimensions) + "(" +
+                std::to_string(Offset) + ":" +
+                std::to_string(Offset + kOutputDimensions) + ")]";
+        }
 
-      // write parameters
-      bool write_parameters(std::ostream& /*stream*/) const {
-          return true;
-      }
+        // A string that represents the structure from the input layer to this layer
+        static std::string get_structure_string() {
+            return get_name();
+        }
 
-      // Forward propagation
-      const OutputType* propagate(
-          const TransformedFeatureType* transformed_features,
-          char* /*buffer*/) const {
+        static std::string get_layers_info() {
+            std::string info = std::to_string(kLayerIndex);
+            info += ": ";
+            info += get_name();
+            return info;
+        }
 
-          return transformed_features + Offset;
-      }
+        // Read network parameters
+        bool read_parameters(std::istream& /*stream*/) {
+            return true;
+        }
 
-  private:
-  };
+        // write parameters
+        bool write_parameters(std::ostream& /*stream*/) const {
+            return true;
+        }
+
+        // Forward propagation
+        const OutputType* propagate(
+            const TransformedFeatureType* transformed_features,
+            char* /*buffer*/) const {
+
+            return transformed_features + Offset;
+        }
+
+    private:
+    };
 
 }  // namespace Layers
 
diff --git a/src/nnue/layers/sum.h b/src/nnue/layers/sum.h
index 64ef30f9..0f71bd61 100644
--- a/src/nnue/layers/sum.h
+++ b/src/nnue/layers/sum.h
@@ -36,6 +36,8 @@ namespace Eval::NNUE::Layers {
         static constexpr std::size_t kBufferSize =
             std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize);
 
+        static constexpr int kLayerIndex = Tail::kLayerIndex + 1;
+
         // Hash value embedded in the evaluation function file
         static constexpr std::uint32_t get_hash_value() {
             std::uint32_t hash_value = 0xBCE400B4u;
@@ -46,10 +48,23 @@ namespace Eval::NNUE::Layers {
             return hash_value;
         }
 
+        static std::string get_name() {
+             return "Sum[" +
+                std::to_string(kOutputDimensions) + "]";
+        }
+
         // A string that represents the structure from the input layer to this layer
         static std::string get_structure_string() {
-            return "Sum[" +
-                std::to_string(kOutputDimensions) + "](" + get_summands_string() + ")";
+            return get_name() + "(" + get_summands_string() + ")";
+        }
+
+        static std::string get_layers_info() {
+            std::string info = Tail::get_layers_info();
+            info += '\n';
+            info += std::to_string(kLayerIndex);
+            info += ": ";
+            info += get_name();
+            return info;
         }
 
         // read parameters
@@ -117,6 +132,8 @@ namespace Eval::NNUE::Layers {
         // Size of the forward propagation buffer used from the input layer to this layer
         static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize;
 
+        static constexpr int kLayerIndex = PreviousLayer::kLayerIndex + 1;
+
         // Hash value embedded in the evaluation function file
         static constexpr std::uint32_t get_hash_value() {
             std::uint32_t hash_value = 0xBCE400B4u;
@@ -125,10 +142,23 @@ namespace Eval::NNUE::Layers {
             return hash_value;
         }
 
+        static std::string get_name() {
+             return "Sum[" +
+                std::to_string(kOutputDimensions) + "]";
+        }
+
         // A string that represents the structure from the input layer to this layer
         static std::string get_structure_string() {
-            return "Sum[" +
-                std::to_string(kOutputDimensions) + "](" + get_summands_string() + ")";
+            return get_name() + "(" + get_summands_string() + ")";
+        }
+
+        static std::string get_layers_info() {
+            std::string info = PreviousLayer::get_layers_info();
+            info += '\n';
+            info += std::to_string(kLayerIndex);
+            info += ": ";
+            info += get_name();
+            return info;
         }
 
         // read parameters
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 87b8ee58..3e18e68a 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -110,19 +110,32 @@ namespace Eval::NNUE {
         static constexpr std::size_t kBufferSize =
             kOutputDimensions * sizeof(OutputType);
 
+        static constexpr int kLayerIndex = 0;
+
         // Hash value embedded in the evaluation file
         static constexpr std::uint32_t get_hash_value() {
 
             return RawFeatures::kHashValue ^ kOutputDimensions;
         }
 
-        // a string representing the structure
-        static std::string get_structure_string() {
+        static std::string get_name() {
             return RawFeatures::get_name() + "[" +
                 std::to_string(kInputDimensions) + "->" +
                 std::to_string(kHalfDimensions) + "x2]";
         }
 
+        // a string representing the structure
+        static std::string get_structure_string() {
+            return get_name();
+        }
+
+        static std::string get_layers_info() {
+            std::string info = std::to_string(kLayerIndex);
+            info += ": ";
+            info += get_name();
+            return info;
+        }
+
         // Read network parameters
         bool read_parameters(std::istream& stream) {
 

From ec436d3dfd212b90e568864318f9cd42b55faece Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 11:15:59 +0200
Subject: [PATCH 251/398] Print some weight update stats

---
 src/learn/learn.cpp                |  9 +++----
 src/nnue/evaluate_nnue_learner.cpp | 42 ++++++++++++++++++++++++++++--
 src/nnue/evaluate_nnue_learner.h   |  1 +
 src/nnue/trainer/trainer.h         |  1 +
 4 files changed, 45 insertions(+), 8 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 7de359ef..e3d2fecf 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -599,19 +599,16 @@ namespace Learner
                 // Evaluation value of deep search
                 const auto deep_value = (Value)ps.score;
 
-                const Value shallow_value =
-                    (rootColor == pos.side_to_move())
-                    ? Eval::evaluate(pos)
-                    : -Eval::evaluate(pos);
+                const Value shallow_value = Eval::evaluate(pos);
 
                 const auto loss = calc_cross_entropy(
                     deep_value,
-                    shallow_value,
+                    (rootColor == pos.side_to_move()) ? shallow_value : -shallow_value,
                     ps);
 
                 local_loss_sum += loss;
 
-                Eval::NNUE::add_example(pos, rootColor, ps, 1.0);
+                Eval::NNUE::add_example(pos, rootColor, shallow_value, ps, 1.0);
             };
 
             if (!pos.pseudo_legal((Move)ps.move) || !pos.legal((Move)ps.move))
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 54525fe4..581e7928 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -118,8 +118,12 @@ namespace Eval::NNUE {
     }
 
     // Add 1 sample of learning data
-    void add_example(Position& pos, Color rootColor,
-                    const Learner::PackedSfenValue& psv, double weight) {
+    void add_example(
+        Position& pos,
+        Color rootColor,
+        Value discrete_nn_eval,
+        const Learner::PackedSfenValue& psv,
+        double weight) {
 
         Example example;
         if (rootColor == pos.side_to_move()) {
@@ -128,6 +132,7 @@ namespace Eval::NNUE {
             example.sign = -1;
         }
 
+        example.discrete_nn_eval = discrete_nn_eval;
         example.psv = psv;
         example.weight = weight;
 
@@ -176,6 +181,13 @@ namespace Eval::NNUE {
 
         std::lock_guard<std::mutex> lock(examples_mutex);
         std::shuffle(examples.begin(), examples.end(), rng);
+
+        double abs_eval_diff_sum = 0.0;
+        double abs_discrete_eval_sum = 0.0;
+        double gradient_norm = 0.0;
+
+        bool is_first_batch = true;
+
         while (examples.size() >= batch_size) {
             std::vector<Example> batch(examples.end() - batch_size, examples.end());
             examples.resize(examples.size() - batch_size);
@@ -186,13 +198,39 @@ namespace Eval::NNUE {
             for (std::size_t b = 0; b < batch.size(); ++b) {
                 const auto shallow = static_cast<Value>(round<std::int32_t>(
                     batch[b].sign * network_output[b] * kPonanzaConstant));
+                const auto discrete = batch[b].sign * batch[b].discrete_nn_eval;
                 const auto& psv = batch[b].psv;
                 const double gradient = batch[b].sign * Learner::calc_grad(shallow, psv);
                 gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
+
+
+                // The discrete eval will only be valid before first backpropagation,
+                // that is only for the first batch.
+                // Similarily we want only gradients from one batch.
+                if (is_first_batch)
+                {
+                    abs_eval_diff_sum += std::abs(discrete - shallow);
+                    abs_discrete_eval_sum += std::abs(discrete);
+                    gradient_norm += std::abs(gradient);
+                }
             }
 
             trainer->backpropagate(gradients.data(), learning_rate);
+
+            is_first_batch = false;
         }
+
+        const double avg_abs_eval_diff = abs_eval_diff_sum / batch_size;
+        const double avg_abs_discrete_eval = abs_discrete_eval_sum / batch_size;
+
+        std::cout << "INFO (update_weights):"
+            << " avg_abs(trainer_eval-nnue_eval) = " << avg_abs_eval_diff
+            << " , avg_abs(nnue_eval) = " << avg_abs_discrete_eval
+            << " , avg_relative_error = " << avg_abs_eval_diff / avg_abs_discrete_eval
+            << " , batch_size = " << batch_size
+            << " , grad_norm = " << gradient_norm
+            << std::endl;
+
         send_messages({{"quantize_parameters"}});
     }
 
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 431fb02e..48ab31b9 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -22,6 +22,7 @@ namespace Eval::NNUE {
     void add_example(
         Position& pos,
         Color rootColor,
+        Value discrete_nn_eval,
     	const Learner::PackedSfenValue& psv,
         double weight);
 
diff --git a/src/nnue/trainer/trainer.h b/src/nnue/trainer/trainer.h
index 763bd5c8..973bc898 100644
--- a/src/nnue/trainer/trainer.h
+++ b/src/nnue/trainer/trainer.h
@@ -68,6 +68,7 @@ namespace Eval::NNUE {
     struct Example {
         std::vector<TrainingFeature> training_features[2];
         Learner::PackedSfenValue psv;
+        Value discrete_nn_eval;
         int sign;
         double weight;
     };

From a351c1d65e23b04bb56bac4cd74ac5dd2041b658 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 11:39:16 +0200
Subject: [PATCH 252/398] Add verbose flag to learn. Only print update
 parameters info when vebose=true

---
 src/learn/learn.cpp                |  9 ++++++++-
 src/nnue/evaluate_nnue_learner.cpp | 29 ++++++++++++++++-------------
 src/nnue/evaluate_nnue_learner.h   |  2 +-
 3 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index e3d2fecf..a56ac15f 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -432,6 +432,8 @@ namespace Learner
         // If true, do not dig the folder.
         bool save_only_once;
 
+        bool verbose;
+
         double newbob_decay;
         int newbob_num_trials;
         uint64_t auto_lr_drop;
@@ -644,7 +646,7 @@ namespace Learner
         // should be no real issues happening since
         // the read/write phases are isolated.
         atomic_thread_fence(memory_order_seq_cst);
-        Eval::NNUE::update_parameters();
+        Eval::NNUE::update_parameters(epoch, verbose);
         atomic_thread_fence(memory_order_seq_cst);
 
         if (++save_count * mini_batch_size >= eval_save_interval)
@@ -943,6 +945,8 @@ namespace Learner
         // Turn on if you want to pass a pre-shuffled file.
         bool no_shuffle = false;
 
+        bool verbose = false;
+
         global_learning_rate = 1.0;
 
         // elmo lambda
@@ -1070,6 +1074,7 @@ namespace Learner
                 UCI::setoption("PruneAtShallowDepth", "false");
                 UCI::setoption("EnableTranspositionTable", "false");
             }
+            else if (option == "verbose") verbose = true;
             else
             {
                 cout << "Unknown option: " << option << ". Ignoring.\n";
@@ -1191,6 +1196,8 @@ namespace Learner
         learn_think.mini_batch_size = mini_batch_size;
         learn_think.validation_set_file_name = validation_set_file_name;
 
+        learn_think.verbose = verbose;
+
         cout << "init done." << endl;
 
         // Start learning.
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 581e7928..e0d2351d 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -173,7 +173,7 @@ namespace Eval::NNUE {
     }
 
     // update the evaluation function parameters
-    void update_parameters() {
+    void update_parameters(uint64_t epoch, bool verbose) {
         assert(batch_size > 0);
 
         const auto learning_rate = static_cast<LearnFloatType>(
@@ -186,7 +186,7 @@ namespace Eval::NNUE {
         double abs_discrete_eval_sum = 0.0;
         double gradient_norm = 0.0;
 
-        bool is_first_batch = true;
+        bool collect_stats = verbose;
 
         while (examples.size() >= batch_size) {
             std::vector<Example> batch(examples.end() - batch_size, examples.end());
@@ -207,7 +207,7 @@ namespace Eval::NNUE {
                 // The discrete eval will only be valid before first backpropagation,
                 // that is only for the first batch.
                 // Similarily we want only gradients from one batch.
-                if (is_first_batch)
+                if (collect_stats)
                 {
                     abs_eval_diff_sum += std::abs(discrete - shallow);
                     abs_discrete_eval_sum += std::abs(discrete);
@@ -217,19 +217,22 @@ namespace Eval::NNUE {
 
             trainer->backpropagate(gradients.data(), learning_rate);
 
-            is_first_batch = false;
+            collect_stats = false;
         }
 
-        const double avg_abs_eval_diff = abs_eval_diff_sum / batch_size;
-        const double avg_abs_discrete_eval = abs_discrete_eval_sum / batch_size;
+        if (verbose) {
+            const double avg_abs_eval_diff = abs_eval_diff_sum / batch_size;
+            const double avg_abs_discrete_eval = abs_discrete_eval_sum / batch_size;
 
-        std::cout << "INFO (update_weights):"
-            << " avg_abs(trainer_eval-nnue_eval) = " << avg_abs_eval_diff
-            << " , avg_abs(nnue_eval) = " << avg_abs_discrete_eval
-            << " , avg_relative_error = " << avg_abs_eval_diff / avg_abs_discrete_eval
-            << " , batch_size = " << batch_size
-            << " , grad_norm = " << gradient_norm
-            << std::endl;
+            std::cout << "INFO (update_parameters):"
+                << " epoch = " << epoch
+                << " , avg_abs(trainer_eval-nnue_eval) = " << avg_abs_eval_diff
+                << " , avg_abs(nnue_eval) = " << avg_abs_discrete_eval
+                << " , avg_relative_error = " << avg_abs_eval_diff / avg_abs_discrete_eval
+                << " , batch_size = " << batch_size
+                << " , grad_norm = " << gradient_norm
+                << std::endl;
+        }
 
         send_messages({{"quantize_parameters"}});
     }
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 48ab31b9..03a23c83 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -27,7 +27,7 @@ namespace Eval::NNUE {
         double weight);
 
     // update the evaluation function parameters
-    void update_parameters();
+    void update_parameters(uint64_t epoch, bool verbose);
 
     // Check if there are any problems with learning
     void check_health();

From d70408f20431a0f18283aaff7ddb09d3fc42cb51 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 11:40:26 +0200
Subject: [PATCH 253/398] Add docs entry for the verbose flag.

---
 docs/learn.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/learn.md b/docs/learn.md
index f815284c..7051a173 100644
--- a/docs/learn.md
+++ b/docs/learn.md
@@ -80,6 +80,8 @@ Currently the following options are available:
 
 `seed` - seed for the PRNG. Can be either a number or a string. If it's a string then its hash will be used. If not specified then the current time will be used.
 
+`verbose` - this is a modifier, not a parameter. When used there will be more detailed output during training.
+
 ## Legacy subcommands and parameters
 
 ### Convert

From 8ddef320e6117e6d9174b2445dec28212c4cb92f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 11:42:30 +0200
Subject: [PATCH 254/398] Print an additional new line before calc_loss
 progress instead of after check_health in the feature transformer layer.

---
 src/learn/learn.cpp                            | 3 ++-
 src/nnue/trainer/trainer_feature_transformer.h | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index a56ac15f..6257d920 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -677,7 +677,8 @@ namespace Learner
         TT.new_search();
         TimePoint elapsed = now() - Search::Limits.startTime + 1;
 
-        cout << "PROGRESS: " << now_string() << ", ";
+        cout << "\n";
+        cout << "PROGRESS (calc_loss): " << now_string() << ", ";
         cout << total_done << " sfens, ";
         cout << total_done * 1000 / elapsed  << " sfens/second";
         cout << ", iteration " << epoch;
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index a3d6c16a..2311fc0c 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -349,7 +349,7 @@ namespace Eval::NNUE {
 
             std::cout << "INFO: largest min activation = " << largest_min_activation
                       << ", smallest max activation = " << smallest_max_activation
-                      << std::endl << std::endl;
+                      << std::endl;
 
             std::fill(std::begin(min_activations_), std::end(min_activations_),
                       std::numeric_limits<LearnFloatType>::max());

From c49ae541c42b4111767d13424bc29d65532aa1a9 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 11:58:53 +0200
Subject: [PATCH 255/398] Add layer info for check_health. Print subsequent
 infos from the same scope with "-->" instead of "INFO:" for clarity.

---
 src/learn/learn.cpp                           |  4 ++--
 src/nnue/trainer/trainer_clipped_relu.h       |  9 +++++++--
 .../trainer/trainer_feature_transformer.h     | 19 +++++++++++++------
 3 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 6257d920..c9313575 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -141,7 +141,7 @@ namespace Learner
             void print(const std::string& prefix, ostream& s) const
             {
                 s
-                    << "INFO: "
+                    << "--> "
                     << prefix << "_cross_entropy_eval = " << cross_entropy_eval / count
                     << " , " << prefix << "_cross_entropy_win = " << cross_entropy_win / count
                     << " , " << prefix << "_entropy_eval = " << entropy_eval / count
@@ -722,7 +722,7 @@ namespace Learner
 
         if (psv.size() && test_loss_sum.count > 0.0)
         {
-            cout << "INFO: norm = " << sum_norm
+            cout << "--> norm = " << sum_norm
                 << " , move accuracy = " << (move_accord_count * 100.0 / psv.size()) << "%"
                 << endl;
 
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index 35503493..d1dd738b 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -99,8 +99,13 @@ namespace Eval::NNUE {
             const auto smallest_max_activation = *std::min_element(
                 std::begin(max_activations_), std::end(max_activations_));
 
-            std::cout << "INFO: largest min activation = " << largest_min_activation
-                      << ", smallest max activation = " << smallest_max_activation
+            std::cout << "INFO (check_health):"
+                      << " layer = " << LayerType::kLayerIndex
+                      << " , name = " << LayerType::get_name()
+                      << std::endl;
+
+            std::cout << "--> largest min activation = " << largest_min_activation
+                      << " , smallest max activation = " << smallest_max_activation
                       << std::endl;
 
             std::fill(std::begin(min_activations_), std::end(min_activations_),
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 2311fc0c..dbfe18a2 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -330,25 +330,32 @@ namespace Eval::NNUE {
 
         // Check if there are any problems with learning
         void check_health() {
-            std::cout << "INFO: observed " << observed_features.count()
-                      << " (out of " << kInputDimensions << ") features" << std::endl;
+            std::cout << "INFO (check_health):"
+                      << " layer = " << LayerType::kLayerIndex
+                      << " , name = " << LayerType::get_name()
+                      << std::endl;
+
+            std::cout << "--> observed " << observed_features.count()
+                      << " (out of " << kInputDimensions << ") features"
+                      << std::endl;
 
             constexpr LearnFloatType kPreActivationLimit =
                 std::numeric_limits<typename LayerType::WeightType>::max() /
                 kWeightScale;
 
-            std::cout << "INFO: (min, max) of pre-activations = "
+            std::cout << "--> (min, max) of pre-activations = "
                       << min_pre_activation_ << ", "
                       << max_pre_activation_ << " (limit = "
-                      << kPreActivationLimit << ")" << std::endl;
+                      << kPreActivationLimit << ")"
+                      << std::endl;
 
             const auto largest_min_activation = *std::max_element(
                 std::begin(min_activations_), std::end(min_activations_));
             const auto smallest_max_activation = *std::min_element(
                 std::begin(max_activations_), std::end(max_activations_));
 
-            std::cout << "INFO: largest min activation = " << largest_min_activation
-                      << ", smallest max activation = " << smallest_max_activation
+            std::cout << "--> largest min activation = " << largest_min_activation
+                      << " , smallest max activation = " << smallest_max_activation
                       << std::endl;
 
             std::fill(std::begin(min_activations_), std::end(min_activations_),

From cf3edfed8203ad249bca9eab16336afb961d1e9e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 13:12:32 +0200
Subject: [PATCH 256/398] Improve info messages.

---
 src/learn/learn.cpp                           | 43 +++++++++----------
 src/learn/sfen_reader.h                       | 12 +++---
 src/nnue/evaluate_nnue_learner.cpp            |  4 +-
 src/nnue/trainer/trainer_clipped_relu.h       |  6 +--
 .../trainer/trainer_feature_transformer.h     | 10 ++---
 5 files changed, 36 insertions(+), 39 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index c9313575..cf26e05e 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -140,15 +140,12 @@ namespace Learner
 
             void print(const std::string& prefix, ostream& s) const
             {
-                s
-                    << "--> "
-                    << prefix << "_cross_entropy_eval = " << cross_entropy_eval / count
-                    << " , " << prefix << "_cross_entropy_win = " << cross_entropy_win / count
-                    << " , " << prefix << "_entropy_eval = " << entropy_eval / count
-                    << " , " << prefix << "_entropy_win = " << entropy_win / count
-                    << " , " << prefix << "_cross_entropy = " << cross_entropy / count
-                    << " , " << prefix << "_entropy = " << entropy / count
-                    << endl;
+                s << "==> " << prefix << "_cross_entropy_eval = " << cross_entropy_eval / count << endl;
+                s << "==> " << prefix << "_cross_entropy_win  = " << cross_entropy_win / count << endl;
+                s << "==> " << prefix << "_entropy_eval       = " << entropy_eval / count << endl;
+                s << "==> " << prefix << "_entropy_win        = " << entropy_win / count << endl;
+                s << "==> " << prefix << "_cross_entropy      = " << cross_entropy / count << endl;
+                s << "==> " << prefix << "_entropy            = " << entropy / count << endl;
             }
         };
     }
@@ -678,11 +675,13 @@ namespace Learner
         TimePoint elapsed = now() - Search::Limits.startTime + 1;
 
         cout << "\n";
-        cout << "PROGRESS (calc_loss): " << now_string() << ", ";
-        cout << total_done << " sfens, ";
-        cout << total_done * 1000 / elapsed  << " sfens/second";
-        cout << ", iteration " << epoch;
-        cout << ", learning rate = " << global_learning_rate << ", ";
+        cout << "PROGRESS (calc_loss): " << now_string()
+             << ", " << total_done << " sfens"
+             << ", " << total_done * 1000 / elapsed  << " sfens/second"
+             << ", epoch " << epoch
+             << endl;
+
+        cout << "==> learning rate = " << global_learning_rate << endl;
 
         // For calculation of verification data loss
         AtomicLoss test_loss_sum{};
@@ -699,7 +698,7 @@ namespace Learner
             auto& pos = th.rootPos;
             StateInfo si;
             pos.set(StartFEN, false, &si, &th);
-            cout << "startpos eval = " << Eval::evaluate(pos) << endl;
+            cout << "==> startpos eval = " << Eval::evaluate(pos) << endl;
         });
         mainThread->wait_for_worker_finished();
 
@@ -722,16 +721,15 @@ namespace Learner
 
         if (psv.size() && test_loss_sum.count > 0.0)
         {
-            cout << "--> norm = " << sum_norm
-                << " , move accuracy = " << (move_accord_count * 100.0 / psv.size()) << "%"
-                << endl;
-
             test_loss_sum.print("test", cout);
 
             if (learn_loss_sum.count > 0.0)
             {
                 learn_loss_sum.print("learn", cout);
             }
+
+            cout << "==> norm = " << sum_norm << endl;
+            cout << "==> move accuracy = " << (move_accord_count * 100.0 / psv.size()) << "%" << endl;
         }
         else
         {
@@ -847,7 +845,8 @@ namespace Learner
                 const double latest_loss = latest_loss_sum / latest_loss_count;
                 latest_loss_sum = 0.0;
                 latest_loss_count = 0;
-                cout << "loss: " << latest_loss;
+                cout << "INFO (learning_rate):" << endl;
+                cout << "==> loss = " << latest_loss;
                 auto tot = total_done;
                 if (auto_lr_drop)
                 {
@@ -877,7 +876,7 @@ namespace Learner
                     if (--trials > 0 && !is_final)
                     {
                         cout
-                            << "reducing learning rate from " << global_learning_rate
+                            << "==> reducing learning rate from " << global_learning_rate
                             << " to " << (global_learning_rate * newbob_decay)
                             << " (" << trials << " more trials)" << endl;
 
@@ -887,7 +886,7 @@ namespace Learner
 
                 if (trials == 0)
                 {
-                    cout << "converged" << endl;
+                    cout << "==> converged" << endl;
                     return true;
                 }
             }
diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
index 71767bc6..4d5a6d1a 100644
--- a/src/learn/sfen_reader.h
+++ b/src/learn/sfen_reader.h
@@ -83,7 +83,7 @@ namespace Learner{
                 PackedSfenValue ps;
                 if (!read_to_thread_buffer(0, ps))
                 {
-                    std::cout << "Error! read packed sfen , failed." << std::endl;
+                    std::cout << "ERROR (sfen_reader): Reading failed." << std::endl;
                     return sfen_for_mse;
                 }
 
@@ -211,16 +211,16 @@ namespace Learner{
 
                     if (sfen_input_stream == nullptr)
                     {
-                        std::cout << "File does not exist: " << currentFilename << '\n';
+                        std::cout << "INFO (sfen_reader): File does not exist: " << currentFilename << '\n';
                     }
                     else
                     {
-                        std::cout << "Opened file for reading: " << currentFilename << '\n';
+                        std::cout << "INFO (sfen_reader): Opened file for reading: " << currentFilename << '\n';
 
                         // in case the file is empty or was deleted.
                         if (sfen_input_stream->eof())
                         {
-                            std::cout << "File empty, nothing to read.\n";
+                            std::cout << "INFO (sfen_reader): File empty, nothing to read.\n";
                         }
                         else
                         {
@@ -232,7 +232,7 @@ namespace Learner{
 
             if (sfen_input_stream == nullptr && !open_next_file())
             {
-                std::cout << "..end of files." << std::endl;
+                std::cout << "INFO (sfen_reader): End of files." << std::endl;
                 end_of_files = true;
                 return;
             }
@@ -271,7 +271,7 @@ namespace Learner{
                         if(!open_next_file())
                         {
                             // There was no next file. Abort.
-                            std::cout << "..end of files." << std::endl;
+                            std::cout << "INFO (sfen_reader): End of files." << std::endl;
                             end_of_files = true;
                             return;
                         }
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index e0d2351d..64b558bd 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -245,7 +245,7 @@ namespace Eval::NNUE {
     // save merit function parameters to a file
     void save_eval(std::string dir_name) {
         auto eval_dir = Path::combine(Options["EvalSaveDir"], dir_name);
-        std::cout << "save_eval() start. folder = " << eval_dir << std::endl;
+        std::cout << "INFO (save_eval): Saving current evaluation file in " << eval_dir << std::endl;
 
         // mkdir() will fail if this folder already exists, but
         // Apart from that. If not, I just want you to make it.
@@ -261,7 +261,5 @@ namespace Eval::NNUE {
 #ifndef NDEBUG
         assert(result);
 #endif
-
-        std::cout << "save_eval() finished. folder = " << eval_dir << std::endl;
     }
 }  // namespace Eval::NNUE
\ No newline at end of file
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index d1dd738b..284b7e73 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -100,11 +100,11 @@ namespace Eval::NNUE {
                 std::begin(max_activations_), std::end(max_activations_));
 
             std::cout << "INFO (check_health):"
-                      << " layer = " << LayerType::kLayerIndex
-                      << " , name = " << LayerType::get_name()
+                      << " layer " << LayerType::kLayerIndex
+                      << " - " << LayerType::get_name()
                       << std::endl;
 
-            std::cout << "--> largest min activation = " << largest_min_activation
+            std::cout << "==> largest min activation = " << largest_min_activation
                       << " , smallest max activation = " << smallest_max_activation
                       << std::endl;
 
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index dbfe18a2..fea419c9 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -331,11 +331,11 @@ namespace Eval::NNUE {
         // Check if there are any problems with learning
         void check_health() {
             std::cout << "INFO (check_health):"
-                      << " layer = " << LayerType::kLayerIndex
-                      << " , name = " << LayerType::get_name()
+                      << " layer " << LayerType::kLayerIndex
+                      << " - " << LayerType::get_name()
                       << std::endl;
 
-            std::cout << "--> observed " << observed_features.count()
+            std::cout << "==> observed " << observed_features.count()
                       << " (out of " << kInputDimensions << ") features"
                       << std::endl;
 
@@ -343,7 +343,7 @@ namespace Eval::NNUE {
                 std::numeric_limits<typename LayerType::WeightType>::max() /
                 kWeightScale;
 
-            std::cout << "--> (min, max) of pre-activations = "
+            std::cout << "==> (min, max) of pre-activations = "
                       << min_pre_activation_ << ", "
                       << max_pre_activation_ << " (limit = "
                       << kPreActivationLimit << ")"
@@ -354,7 +354,7 @@ namespace Eval::NNUE {
             const auto smallest_max_activation = *std::min_element(
                 std::begin(max_activations_), std::end(max_activations_));
 
-            std::cout << "--> largest min activation = " << largest_min_activation
+            std::cout << "==> largest min activation = " << largest_min_activation
                       << " , smallest max activation = " << smallest_max_activation
                       << std::endl;
 

From 54dd6a240705e83c44ff0b4201d113b0868630b1 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 13:13:10 +0200
Subject: [PATCH 257/398] Add logger with synchronized regions.

---
 src/misc.cpp |   2 +
 src/misc.h   | 178 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 180 insertions(+)

diff --git a/src/misc.cpp b/src/misc.cpp
index e09b8eed..879f4462 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -61,6 +61,8 @@ typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
 
 using namespace std;
 
+SynchronizedRegionLogger sync_region_cout(std::cout);
+
 namespace {
 
 /// Version number. If Version is left empty, then compile date in the format
diff --git a/src/misc.h b/src/misc.h
index dca959cd..af40ab16 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -31,6 +31,7 @@
 #include <cmath>
 #include <cctype>
 #include <sstream>
+#include <deque>
 
 #include "types.h"
 
@@ -70,6 +71,183 @@ std::ostream& operator<<(std::ostream&, SyncCout);
 #define sync_cout std::cout << IO_LOCK
 #define sync_endl std::endl << IO_UNLOCK
 
+// This logger allows printing many parts in a region atomically
+// but doesn't block the threads trying to append to other regions.
+// Instead if some region tries to pring while other region holds
+// the lock the messages are queued to be printed as soon as the
+// current region releases the lock.
+struct SynchronizedRegionLogger
+{
+private:
+  using RegionId = std::uint64_t;
+
+  struct RegionLock
+  {
+    RegionLock(SynchronizedRegionLogger& log, RegionId id) :
+      logger(&log), region_id(id), is_held(true)
+    {
+    }
+
+    RegionLock(const RegionLock&) = delete;
+    RegionLock& operator=(const RegionLock&) = delete;
+
+    RegionLock(RegionLock&& other) :
+      logger(other.logger), region_id(other.region_id), is_held(other.is_held)
+    {
+      other.logger = nullptr;
+      other.is_held = false;
+    }
+
+    RegionLock& operator=(RegionLock&& other) {
+      if (is_held && logger != nullptr)
+      {
+        logger->release_region(region_id);
+      }
+
+      logger = other.logger;
+      region_id = other.region_id;
+      is_held = other.is_held;
+
+      other.is_held = false;
+
+      return *this;
+    }
+
+    ~RegionLock() { unlock(); }
+
+    void unlock() {
+      if (is_held) {
+        is_held = false;
+
+        if (logger != nullptr)
+          logger->release_region(region_id);
+      }
+    }
+
+    template <typename T>
+    RegionLock& operator << (const T& value) {
+      if (logger != nullptr)
+        logger->write(region_id, value);
+
+      return *this;
+    }
+
+  private:
+    SynchronizedRegionLogger* logger;
+    RegionId region_id;
+    bool is_held;
+  };
+
+  struct Region
+  {
+    Region(RegionId rid) : id(rid), is_held(true) {}
+
+    std::vector<std::string> pending_parts;
+    RegionId id;
+    bool is_held;
+  };
+
+  RegionId init_next_region()
+  {
+    static RegionId next_id = 0;
+
+    std::lock_guard lock(mutex);
+
+    const auto id = next_id++;
+    regions.emplace_back(id);
+
+    return id;
+  }
+
+  template <typename T>
+  void write(RegionId id, const T& value) {
+    std::lock_guard lock(mutex);
+
+    if (regions.empty())
+      return;
+
+    if (id == regions.front().id) {
+      // We can just directly print to the output because
+      // we are at the front of the region queue.
+      out << value;
+    } else {
+      // We have to schedule the print until previous regions are
+      // processed
+      auto* region = find_region_nolock(id);
+      if (region == nullptr)
+        return;
+
+      std::stringstream ss;
+      ss << value;
+      region->pending_parts.emplace_back(std::move(ss).str());
+    }
+  }
+
+  std::ostream& out;
+
+  std::deque<Region> regions;
+
+  std::mutex mutex;
+
+  Region* find_region_nolock(RegionId id) {
+    // Linear search because the amount of concurrent regions should be small.
+    auto it = std::find_if(
+      regions.begin(),
+      regions.end(),
+      [id](const Region& r) { return r.id == id; });
+
+    if (it == regions.end())
+      return nullptr;
+    else
+      return &*it;
+  }
+
+  void release_region(RegionId id) {
+    std::lock_guard lock(mutex);
+
+    auto* region = find_region_nolock(id);
+    if (region == nullptr)
+      return;
+
+    region->is_held = false;
+
+    process_backlog_nolock();
+  }
+
+  void process_backlog_nolock()
+  {
+    while(!regions.empty()) {
+      auto& region = regions.front();
+
+      for(auto& part : region.pending_parts) {
+        out << part;
+      }
+
+      // If the region is still held then we don't
+      // want to start printing stuff from the next region.
+      if (region.is_held)
+        break;
+
+      regions.pop_front();
+    }
+  }
+
+public:
+
+  SynchronizedRegionLogger(std::ostream& s) :
+    out(s)
+  {
+  }
+
+  [[nodiscard]] RegionLock new_region() {
+    const auto id = init_next_region();
+    return RegionLock(*this, id);
+  }
+
+};
+
+extern SynchronizedRegionLogger sync_region_cout;
+
 
 /// xorshift64star Pseudo-Random Number Generator
 /// This class is based on original code written and dedicated

From d824bd8ec5057d1b63b3d7721fe31f4a828b7516 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 13:31:40 +0200
Subject: [PATCH 258/398] Add an overload for io manip in the logger.

---
 src/misc.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/src/misc.h b/src/misc.h
index af40ab16..4c99cc2b 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -124,6 +124,13 @@ private:
       }
     }
 
+    RegionLock& operator << (std::ostream&(*pManip)(std::ostream&)) {
+      if (logger != nullptr)
+        logger->write(region_id, pManip);
+
+      return *this;
+    }
+
     template <typename T>
     RegionLock& operator << (const T& value) {
       if (logger != nullptr)
@@ -159,6 +166,29 @@ private:
     return id;
   }
 
+  void write(RegionId id, std::ostream&(*pManip)(std::ostream&)) {
+    std::lock_guard lock(mutex);
+
+    if (regions.empty())
+      return;
+
+    if (id == regions.front().id) {
+      // We can just directly print to the output because
+      // we are at the front of the region queue.
+      out << *pManip;
+    } else {
+      // We have to schedule the print until previous regions are
+      // processed
+      auto* region = find_region_nolock(id);
+      if (region == nullptr)
+        return;
+
+      std::stringstream ss;
+      ss << *pManip;
+      region->pending_parts.emplace_back(std::move(ss).str());
+    }
+  }
+
   template <typename T>
   void write(RegionId id, const T& value) {
     std::lock_guard lock(mutex);

From 4b72658409379d0f0c7de0530b49509a41631408 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 13:34:10 +0200
Subject: [PATCH 259/398] Synchronize printed info regions in the learner and
 sfen reader.

---
 src/learn/learn.cpp                           | 36 ++++++++++-------
 src/learn/sfen_reader.h                       | 13 ++++---
 src/nnue/evaluate_nnue_learner.cpp            |  4 +-
 src/nnue/trainer/trainer_clipped_relu.h       | 19 +++++----
 .../trainer/trainer_feature_transformer.h     | 39 +++++++++++--------
 5 files changed, 66 insertions(+), 45 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index cf26e05e..b0ae62f6 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -138,7 +138,8 @@ namespace Learner
                 count = 0.0;
             }
 
-            void print(const std::string& prefix, ostream& s) const
+            template <typename StreamT>
+            void print(const std::string& prefix, StreamT& s) const
             {
                 s << "==> " << prefix << "_cross_entropy_eval = " << cross_entropy_eval / count << endl;
                 s << "==> " << prefix << "_cross_entropy_win  = " << cross_entropy_win / count << endl;
@@ -499,8 +500,9 @@ namespace Learner
         if (validation_set_file_name.empty()
             && sfen_for_mse.size() != sfen_for_mse_size)
         {
-            cout
-                << "Error reading sfen_for_mse. Read " << sfen_for_mse.size()
+            auto out = sync_region_cout.new_region();
+            out
+                << "INFO (learn): Error reading sfen_for_mse. Read " << sfen_for_mse.size()
                 << " out of " << sfen_for_mse_size << '\n';
 
             return;
@@ -514,7 +516,8 @@ namespace Learner
             latest_loss_sum = 0.0;
             latest_loss_count = 0;
 
-            cout << "initial loss: " << best_loss << endl;
+            auto out = sync_region_cout.new_region();
+            out << "INFO (learn): initial loss = " << best_loss << endl;
         }
 
         stop_flag = false;
@@ -585,7 +588,8 @@ namespace Learner
             if (pos.set_from_packed_sfen(ps.sfen, &si, &th) != 0)
             {
                 // Malformed sfen
-                cout << "Error! : illigal packed sfen = " << pos.fen() << endl;
+                auto out = sync_region_cout.new_region();
+                out << "ERROR: illigal packed sfen = " << pos.fen() << endl;
                 goto RETRY_READ;
             }
 
@@ -674,14 +678,16 @@ namespace Learner
         TT.new_search();
         TimePoint elapsed = now() - Search::Limits.startTime + 1;
 
-        cout << "\n";
-        cout << "PROGRESS (calc_loss): " << now_string()
+        auto out = sync_region_cout.new_region();
+
+        out << "\n";
+        out << "PROGRESS (calc_loss): " << now_string()
              << ", " << total_done << " sfens"
              << ", " << total_done * 1000 / elapsed  << " sfens/second"
              << ", epoch " << epoch
              << endl;
 
-        cout << "==> learning rate = " << global_learning_rate << endl;
+        out << "==> learning rate = " << global_learning_rate << endl;
 
         // For calculation of verification data loss
         AtomicLoss test_loss_sum{};
@@ -694,11 +700,11 @@ namespace Learner
         atomic<int> move_accord_count{0};
 
         auto mainThread = Threads.main();
-        mainThread->execute_with_worker([](auto& th){
+        mainThread->execute_with_worker([&out](auto& th){
             auto& pos = th.rootPos;
             StateInfo si;
             pos.set(StartFEN, false, &si, &th);
-            cout << "==> startpos eval = " << Eval::evaluate(pos) << endl;
+            out << "==> startpos eval = " << Eval::evaluate(pos) << endl;
         });
         mainThread->wait_for_worker_finished();
 
@@ -721,19 +727,19 @@ namespace Learner
 
         if (psv.size() && test_loss_sum.count > 0.0)
         {
-            test_loss_sum.print("test", cout);
+            test_loss_sum.print("test", out);
 
             if (learn_loss_sum.count > 0.0)
             {
-                learn_loss_sum.print("learn", cout);
+                learn_loss_sum.print("learn", out);
             }
 
-            cout << "==> norm = " << sum_norm << endl;
-            cout << "==> move accuracy = " << (move_accord_count * 100.0 / psv.size()) << "%" << endl;
+            out << "==> norm = " << sum_norm << endl;
+            out << "==> move accuracy = " << (move_accord_count * 100.0 / psv.size()) << "%" << endl;
         }
         else
         {
-            cout << "Error! : psv.size() = " << psv.size() << " ,  done = " << test_loss_sum.count << endl;
+            out << "ERROR: psv.size() = " << psv.size() << " ,  done = " << test_loss_sum.count << endl;
         }
 
         learn_loss_sum.reset();
diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
index 4d5a6d1a..3547b6bb 100644
--- a/src/learn/sfen_reader.h
+++ b/src/learn/sfen_reader.h
@@ -209,18 +209,19 @@ namespace Learner{
 
                     sfen_input_stream = open_sfen_input_file(currentFilename);
 
+                    auto out = sync_region_cout.new_region();
                     if (sfen_input_stream == nullptr)
                     {
-                        std::cout << "INFO (sfen_reader): File does not exist: " << currentFilename << '\n';
+                        out << "INFO (sfen_reader): File does not exist: " << currentFilename << '\n';
                     }
                     else
                     {
-                        std::cout << "INFO (sfen_reader): Opened file for reading: " << currentFilename << '\n';
+                        out << "INFO (sfen_reader): Opened file for reading: " << currentFilename << '\n';
 
                         // in case the file is empty or was deleted.
                         if (sfen_input_stream->eof())
                         {
-                            std::cout << "INFO (sfen_reader): File empty, nothing to read.\n";
+                            out << "==> File empty, nothing to read.\n";
                         }
                         else
                         {
@@ -232,7 +233,8 @@ namespace Learner{
 
             if (sfen_input_stream == nullptr && !open_next_file())
             {
-                std::cout << "INFO (sfen_reader): End of files." << std::endl;
+                auto out = sync_region_cout.new_region();
+                out << "INFO (sfen_reader): End of files." << std::endl;
                 end_of_files = true;
                 return;
             }
@@ -271,7 +273,8 @@ namespace Learner{
                         if(!open_next_file())
                         {
                             // There was no next file. Abort.
-                            std::cout << "INFO (sfen_reader): End of files." << std::endl;
+                            auto out = sync_region_cout.new_region();
+                            out << "INFO (sfen_reader): End of files." << std::endl;
                             end_of_files = true;
                             return;
                         }
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 64b558bd..9e960da4 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -224,7 +224,9 @@ namespace Eval::NNUE {
             const double avg_abs_eval_diff = abs_eval_diff_sum / batch_size;
             const double avg_abs_discrete_eval = abs_discrete_eval_sum / batch_size;
 
-            std::cout << "INFO (update_parameters):"
+            auto out = sync_region_cout.new_region();
+
+            out << "INFO (update_parameters):"
                 << " epoch = " << epoch
                 << " , avg_abs(trainer_eval-nnue_eval) = " << avg_abs_eval_diff
                 << " , avg_abs(nnue_eval) = " << avg_abs_discrete_eval
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index 284b7e73..49b715db 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -94,19 +94,24 @@ namespace Eval::NNUE {
 
         // Check if there are any problems with learning
         void check_health() {
+
             const auto largest_min_activation = *std::max_element(
                 std::begin(min_activations_), std::end(min_activations_));
             const auto smallest_max_activation = *std::min_element(
                 std::begin(max_activations_), std::end(max_activations_));
 
-            std::cout << "INFO (check_health):"
-                      << " layer " << LayerType::kLayerIndex
-                      << " - " << LayerType::get_name()
-                      << std::endl;
+            auto out = sync_region_cout.new_region();
 
-            std::cout << "==> largest min activation = " << largest_min_activation
-                      << " , smallest max activation = " << smallest_max_activation
-                      << std::endl;
+            out << "INFO (check_health):"
+                << " layer " << LayerType::kLayerIndex
+                << " - " << LayerType::get_name()
+                << std::endl;
+
+            out << "==> largest min activation = " << largest_min_activation
+                << " , smallest max activation = " << smallest_max_activation
+                << std::endl;
+
+            out.unlock();
 
             std::fill(std::begin(min_activations_), std::end(min_activations_),
                       std::numeric_limits<LearnFloatType>::max());
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index fea419c9..34c423b4 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -330,33 +330,38 @@ namespace Eval::NNUE {
 
         // Check if there are any problems with learning
         void check_health() {
-            std::cout << "INFO (check_health):"
-                      << " layer " << LayerType::kLayerIndex
-                      << " - " << LayerType::get_name()
-                      << std::endl;
-
-            std::cout << "==> observed " << observed_features.count()
-                      << " (out of " << kInputDimensions << ") features"
-                      << std::endl;
 
             constexpr LearnFloatType kPreActivationLimit =
                 std::numeric_limits<typename LayerType::WeightType>::max() /
                 kWeightScale;
 
-            std::cout << "==> (min, max) of pre-activations = "
-                      << min_pre_activation_ << ", "
-                      << max_pre_activation_ << " (limit = "
-                      << kPreActivationLimit << ")"
-                      << std::endl;
-
             const auto largest_min_activation = *std::max_element(
                 std::begin(min_activations_), std::end(min_activations_));
             const auto smallest_max_activation = *std::min_element(
                 std::begin(max_activations_), std::end(max_activations_));
 
-            std::cout << "==> largest min activation = " << largest_min_activation
-                      << " , smallest max activation = " << smallest_max_activation
-                      << std::endl;
+            auto out = sync_region_cout.new_region();
+
+            out << "INFO (check_health):"
+                << " layer " << LayerType::kLayerIndex
+                << " - " << LayerType::get_name()
+                << std::endl;
+
+            out << "==> observed " << observed_features.count()
+                << " (out of " << kInputDimensions << ") features"
+                << std::endl;
+
+            out << "==> (min, max) of pre-activations = "
+                << min_pre_activation_ << ", "
+                << max_pre_activation_ << " (limit = "
+                << kPreActivationLimit << ")"
+                << std::endl;
+
+            out << "==> largest min activation = " << largest_min_activation
+                << " , smallest max activation = " << smallest_max_activation
+                << std::endl;
+
+            out.unlock();
 
             std::fill(std::begin(min_activations_), std::end(min_activations_),
                       std::numeric_limits<LearnFloatType>::max());

From b882423005f62978cec4eb7f903b76df59149f48 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 13:46:34 +0200
Subject: [PATCH 260/398] Bring back info for finished evalsave. Update tests
 with the new message.

---
 src/nnue/evaluate_nnue_learner.cpp | 6 +++++-
 tests/instrumented_learn.sh        | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 9e960da4..0151b3f8 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -247,7 +247,10 @@ namespace Eval::NNUE {
     // save merit function parameters to a file
     void save_eval(std::string dir_name) {
         auto eval_dir = Path::combine(Options["EvalSaveDir"], dir_name);
-        std::cout << "INFO (save_eval): Saving current evaluation file in " << eval_dir << std::endl;
+
+        auto out = sync_region_cout.new_region();
+
+        out << "INFO (save_eval): Saving current evaluation file in " << eval_dir << std::endl;
 
         // mkdir() will fail if this folder already exists, but
         // Apart from that. If not, I just want you to make it.
@@ -263,5 +266,6 @@ namespace Eval::NNUE {
 #ifndef NDEBUG
         assert(result);
 #endif
+        out << "INFO (save_eval): Saving current evaluation file in " << eval_dir << std::endl;
     }
 }  // namespace Eval::NNUE
\ No newline at end of file
diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
index 4ce3dc1c..50b6e4ae 100755
--- a/tests/instrumented_learn.sh
+++ b/tests/instrumented_learn.sh
@@ -129,7 +129,7 @@ cat << EOF > learn01.exp
  send "isready\n"
  send "learn targetdir training_data epochs 1 sfen_read_size 100 thread_buffer_size 10 batchsize 100 use_draw_in_training 1 use_draw_in_validation 1 lr 1 eval_limit 32000 nn_batch_size 30 newbob_decay 0.5 eval_save_interval 30 loss_output_interval 10 validation_set_file_name validation_data/validation_data.bin\n"
 
- expect "save_eval() finished."
+ expect "INFO (save_eval): Saving current evaluation file in"
 
  send "quit\n"
  expect eof

From 2c477d76ec8cf8915dc520cb35bd150d78c7ddd6 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 14:21:59 +0200
Subject: [PATCH 261/398] Cleaner and more outputs during training
 initialization.

---
 src/learn/learn.cpp                           | 124 +++++++++++-------
 src/learn/sfen_reader.h                       |   2 +-
 src/misc.h                                    |  42 +++---
 src/nnue/evaluate_nnue_learner.cpp            |  19 ++-
 src/nnue/evaluate_nnue_learner.h              |   6 +-
 src/nnue/layers/affine_transform.h            |   4 +-
 src/nnue/layers/clipped_relu.h                |   4 +-
 src/nnue/layers/input_slice.h                 |   5 +-
 src/nnue/layers/sum.h                         |   4 +-
 src/nnue/nnue_feature_transformer.h           |   5 +-
 src/nnue/trainer/trainer_clipped_relu.h       |   2 +-
 .../trainer/trainer_feature_transformer.h     |   6 +-
 12 files changed, 129 insertions(+), 94 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index b0ae62f6..3faab0ea 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -141,12 +141,12 @@ namespace Learner
             template <typename StreamT>
             void print(const std::string& prefix, StreamT& s) const
             {
-                s << "==> " << prefix << "_cross_entropy_eval = " << cross_entropy_eval / count << endl;
-                s << "==> " << prefix << "_cross_entropy_win  = " << cross_entropy_win / count << endl;
-                s << "==> " << prefix << "_entropy_eval       = " << entropy_eval / count << endl;
-                s << "==> " << prefix << "_entropy_win        = " << entropy_win / count << endl;
-                s << "==> " << prefix << "_cross_entropy      = " << cross_entropy / count << endl;
-                s << "==> " << prefix << "_entropy            = " << entropy / count << endl;
+                s << "  - " << prefix << "_cross_entropy_eval = " << cross_entropy_eval / count << endl;
+                s << "  - " << prefix << "_cross_entropy_win  = " << cross_entropy_win / count << endl;
+                s << "  - " << prefix << "_entropy_eval       = " << entropy_eval / count << endl;
+                s << "  - " << prefix << "_entropy_win        = " << entropy_win / count << endl;
+                s << "  - " << prefix << "_cross_entropy      = " << cross_entropy / count << endl;
+                s << "  - " << prefix << "_entropy            = " << entropy / count << endl;
             }
         };
     }
@@ -687,7 +687,7 @@ namespace Learner
              << ", epoch " << epoch
              << endl;
 
-        out << "==> learning rate = " << global_learning_rate << endl;
+        out << "  - learning rate = " << global_learning_rate << endl;
 
         // For calculation of verification data loss
         AtomicLoss test_loss_sum{};
@@ -704,7 +704,7 @@ namespace Learner
             auto& pos = th.rootPos;
             StateInfo si;
             pos.set(StartFEN, false, &si, &th);
-            out << "==> startpos eval = " << Eval::evaluate(pos) << endl;
+            out << "  - startpos eval = " << Eval::evaluate(pos) << endl;
         });
         mainThread->wait_for_worker_finished();
 
@@ -734,8 +734,8 @@ namespace Learner
                 learn_loss_sum.print("learn", out);
             }
 
-            out << "==> norm = " << sum_norm << endl;
-            out << "==> move accuracy = " << (move_accord_count * 100.0 / psv.size()) << "%" << endl;
+            out << "  - norm = " << sum_norm << endl;
+            out << "  - move accuracy = " << (move_accord_count * 100.0 / psv.size()) << "%" << endl;
         }
         else
         {
@@ -852,7 +852,7 @@ namespace Learner
                 latest_loss_sum = 0.0;
                 latest_loss_count = 0;
                 cout << "INFO (learning_rate):" << endl;
-                cout << "==> loss = " << latest_loss;
+                cout << "  - loss = " << latest_loss;
                 auto tot = total_done;
                 if (auto_lr_drop)
                 {
@@ -882,7 +882,7 @@ namespace Learner
                     if (--trials > 0 && !is_final)
                     {
                         cout
-                            << "==> reducing learning rate from " << global_learning_rate
+                            << "  - reducing learning rate from " << global_learning_rate
                             << " to " << (global_learning_rate * newbob_decay)
                             << " (" << trials << " more trials)" << endl;
 
@@ -892,7 +892,7 @@ namespace Learner
 
                 if (trials == 0)
                 {
-                    cout << "==> converged" << endl;
+                    cout << "  - converged" << endl;
                     return true;
                 }
             }
@@ -980,6 +980,8 @@ namespace Learner
         string validation_set_file_name;
         string seed;
 
+        auto out = sync_region_cout.new_region();
+
         // Assume the filenames are staggered.
         while (true)
         {
@@ -1083,7 +1085,7 @@ namespace Learner
             else if (option == "verbose") verbose = true;
             else
             {
-                cout << "Unknown option: " << option << ". Ignoring.\n";
+                out << "INFO: Unknown option: " << option << ". Ignoring.\n";
             }
         }
 
@@ -1092,11 +1094,14 @@ namespace Learner
             loss_output_interval = LEARN_RMSE_OUTPUT_INTERVAL * mini_batch_size;
         }
 
-        cout << "learn command , ";
+        // If reduction_gameply is set to 0, rand(0) will be divided by 0, so correct it to 1.
+        reduction_gameply = max(reduction_gameply, 1);
+
+        out << "INFO: Executing learn command\n";
 
         // Issue a warning if OpenMP is disabled.
 #if !defined(_OPENMP)
-        cout << "Warning! OpenMP disabled." << endl;
+        out << "WARNING: OpenMP disabled." << endl;
 #endif
 
         // Right now we only have the individual files.
@@ -1107,65 +1112,80 @@ namespace Learner
         }
         rebase_files(filenames, base_dir);
 
-        cout << "learn from ";
+        out << "INFO: Input files:\n";
         for (auto s : filenames)
-            cout << s << " , ";
+            out << "  - " << s << '\n';
 
-        cout << endl;
+        out << "INFO: Parameters:\n";
         if (!validation_set_file_name.empty())
         {
-            cout << "validation set  : " << validation_set_file_name << endl;
+            out << "  - validation set           : " << validation_set_file_name << endl;
         }
 
-        cout << "base dir        : " << base_dir << endl;
-        cout << "target dir      : " << target_dir << endl;
+        out << "  - epochs                   : " << epochs << endl;
+        out << "  - epochs * minibatch size  : " << epochs * mini_batch_size << endl;
+        out << "  - eval_limit               : " << eval_limit << endl;
+        out << "  - save_only_once           : " << (save_only_once ? "true" : "false") << endl;
+        out << "  - shuffle on read          : " << (no_shuffle ? "false" : "true") << endl;
 
-        cout << "epochs            : " << epochs << endl;
-        cout << "eval_limit        : " << eval_limit << endl;
-        cout << "save_only_once    : " << (save_only_once ? "true" : "false") << endl;
-        cout << "no_shuffle        : " << (no_shuffle ? "true" : "false") << endl;
+        out << "  - Loss Function            : " << LOSS_FUNCTION << endl;
+        out << "  - minibatch size           : " << mini_batch_size << endl;
 
-        cout << "Loss Function     : " << LOSS_FUNCTION << endl;
-        cout << "mini-batch size   : " << mini_batch_size << endl;
+        out << "  - nn_batch_size            : " << nn_batch_size << endl;
+        out << "  - nn_options               : " << nn_options << endl;
 
-        cout << "nn_batch_size     : " << nn_batch_size << endl;
-        cout << "nn_options        : " << nn_options << endl;
+        out << "  - learning rate            : " << global_learning_rate << endl;
+        out << "  - use draws in training    : " << use_draw_games_in_training << endl;
+        out << "  - use draws in validation  : " << use_draw_games_in_validation << endl;
+        out << "  - skip repeated positions  : " << skip_duplicated_positions_in_training << endl;
 
-        cout << "learning rate     : " << global_learning_rate << endl;
-        cout << "use_draw_games_in_training : " << use_draw_games_in_training << endl;
-        cout << "use_draw_games_in_validation : " << use_draw_games_in_validation << endl;
-        cout << "skip_duplicated_positions_in_training : " << skip_duplicated_positions_in_training << endl;
+        out << "  - winning prob coeff       : " << winning_probability_coefficient << endl;
+        out << "  - use_wdl                  : " << use_wdl << endl;
 
-        if (newbob_decay != 1.0) {
-            cout << "scheduling        : newbob with decay = " << newbob_decay
-                << ", " << newbob_num_trials << " trials" << endl;
+        out << "  - src_score_min_value      : " << src_score_min_value << endl;
+        out << "  - src_score_max_value      : " << src_score_max_value << endl;
+        out << "  - dest_score_min_value     : " << dest_score_min_value << endl;
+        out << "  - dest_score_max_value     : " << dest_score_max_value << endl;
+
+        out << "  - reduction_gameply        : " << reduction_gameply << endl;
+
+        out << "  - LAMBDA                   : " << ELMO_LAMBDA << endl;
+        out << "  - LAMBDA2                  : " << ELMO_LAMBDA2 << endl;
+        out << "  - LAMBDA_LIMIT             : " << ELMO_LAMBDA_LIMIT << endl;
+        out << "  - eval_save_interval       : " << eval_save_interval << " sfens" << endl;
+        out << "  - loss_output_interval     : " << loss_output_interval << " sfens" << endl;
+
+        out << "  - sfen_read_size           : " << sfen_read_size << endl;
+        out << "  - thread_buffer_size       : " << thread_buffer_size << endl;
+
+        out << "  - seed                     : " << seed << endl;
+        out << "  - verbose                  : " << (verbose ? "true" : "false") << endl;
+
+        if (auto_lr_drop) {
+            out << "  - learning rate scheduling : every " << auto_lr_drop << " sfens" << endl;
+        }
+        else if (newbob_decay != 1.0) {
+            out << "  - learning rate scheduling : newbob with decay" << endl;
+            out << "  - newbob_decay             : " << newbob_decay << endl;
+            out << "  - newbob_num_trials        : " << newbob_num_trials << endl;
         }
         else {
-            cout << "scheduling        : default" << endl;
+            out << "  - learning rate scheduling : fixed learning rate" << endl;
         }
 
-        // If reduction_gameply is set to 0, rand(0) will be divided by 0, so correct it to 1.
-        reduction_gameply = max(reduction_gameply, 1);
-        cout << "reduction_gameply : " << reduction_gameply << endl;
-
-        cout << "LAMBDA            : " << ELMO_LAMBDA << endl;
-        cout << "LAMBDA2           : " << ELMO_LAMBDA2 << endl;
-        cout << "LAMBDA_LIMIT      : " << ELMO_LAMBDA_LIMIT << endl;
-        cout << "eval_save_interval  : " << eval_save_interval << " sfens" << endl;
-        cout << "loss_output_interval: " << loss_output_interval << " sfens" << endl;
+        out << endl;
 
         // -----------------------------------
         // various initialization
         // -----------------------------------
 
-        cout << "init.." << endl;
+        out << "INFO: Started initialization." << endl;
 
         Threads.main()->ponder = false;
 
         set_learning_search_limits();
 
-        cout << "init_training.." << endl;
-        Eval::NNUE::initialize_training(seed);
+        Eval::NNUE::initialize_training(seed, out);
         Eval::NNUE::set_batch_size(nn_batch_size);
         Eval::NNUE::set_options(nn_options);
 
@@ -1204,7 +1224,9 @@ namespace Learner
 
         learn_think.verbose = verbose;
 
-        cout << "init done." << endl;
+        out << "Finished initialization." << endl;
+
+        out.unlock();
 
         // Start learning.
         learn_think.learn(epochs);
diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
index 3547b6bb..512f1165 100644
--- a/src/learn/sfen_reader.h
+++ b/src/learn/sfen_reader.h
@@ -221,7 +221,7 @@ namespace Learner{
                         // in case the file is empty or was deleted.
                         if (sfen_input_stream->eof())
                         {
-                            out << "==> File empty, nothing to read.\n";
+                            out << "  - File empty, nothing to read.\n";
                         }
                         else
                         {
diff --git a/src/misc.h b/src/misc.h
index 4c99cc2b..3e6dc5b0 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -78,27 +78,23 @@ std::ostream& operator<<(std::ostream&, SyncCout);
 // current region releases the lock.
 struct SynchronizedRegionLogger
 {
-private:
   using RegionId = std::uint64_t;
 
-  struct RegionLock
+  struct Region
   {
-    RegionLock(SynchronizedRegionLogger& log, RegionId id) :
-      logger(&log), region_id(id), is_held(true)
-    {
-    }
+    friend struct SynchronizedRegionLogger;
 
-    RegionLock(const RegionLock&) = delete;
-    RegionLock& operator=(const RegionLock&) = delete;
+    Region(const Region&) = delete;
+    Region& operator=(const Region&) = delete;
 
-    RegionLock(RegionLock&& other) :
+    Region(Region&& other) :
       logger(other.logger), region_id(other.region_id), is_held(other.is_held)
     {
       other.logger = nullptr;
       other.is_held = false;
     }
 
-    RegionLock& operator=(RegionLock&& other) {
+    Region& operator=(Region&& other) {
       if (is_held && logger != nullptr)
       {
         logger->release_region(region_id);
@@ -113,7 +109,7 @@ private:
       return *this;
     }
 
-    ~RegionLock() { unlock(); }
+    ~Region() { unlock(); }
 
     void unlock() {
       if (is_held) {
@@ -124,7 +120,7 @@ private:
       }
     }
 
-    RegionLock& operator << (std::ostream&(*pManip)(std::ostream&)) {
+    Region& operator << (std::ostream&(*pManip)(std::ostream&)) {
       if (logger != nullptr)
         logger->write(region_id, pManip);
 
@@ -132,7 +128,7 @@ private:
     }
 
     template <typename T>
-    RegionLock& operator << (const T& value) {
+    Region& operator << (const T& value) {
       if (logger != nullptr)
         logger->write(region_id, value);
 
@@ -143,11 +139,17 @@ private:
     SynchronizedRegionLogger* logger;
     RegionId region_id;
     bool is_held;
+
+    Region(SynchronizedRegionLogger& log, RegionId id) :
+      logger(&log), region_id(id), is_held(true)
+    {
+    }
   };
 
-  struct Region
+private:
+  struct RegionBookkeeping
   {
-    Region(RegionId rid) : id(rid), is_held(true) {}
+    RegionBookkeeping(RegionId rid) : id(rid), is_held(true) {}
 
     std::vector<std::string> pending_parts;
     RegionId id;
@@ -215,16 +217,16 @@ private:
 
   std::ostream& out;
 
-  std::deque<Region> regions;
+  std::deque<RegionBookkeeping> regions;
 
   std::mutex mutex;
 
-  Region* find_region_nolock(RegionId id) {
+  RegionBookkeeping* find_region_nolock(RegionId id) {
     // Linear search because the amount of concurrent regions should be small.
     auto it = std::find_if(
       regions.begin(),
       regions.end(),
-      [id](const Region& r) { return r.id == id; });
+      [id](const RegionBookkeeping& r) { return r.id == id; });
 
     if (it == regions.end())
       return nullptr;
@@ -269,9 +271,9 @@ public:
   {
   }
 
-  [[nodiscard]] RegionLock new_region() {
+  [[nodiscard]] Region new_region() {
     const auto id = init_next_region();
-    return RegionLock(*this, id);
+    return Region(*this, id);
   }
 
 };
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 0151b3f8..7a72ea19 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -54,23 +54,28 @@ namespace Eval::NNUE {
     }  // namespace
 
     // Initialize learning
-    void initialize_training(const std::string& seed) {
-        std::cout << "Initializing NN training for "
-                  << get_architecture_string() << std::endl;
+    void initialize_training(
+        const std::string& seed,
+        SynchronizedRegionLogger::Region& out) {
 
-        std::cout << std::endl;
+        out << "INFO (initialize_training): Initializing NN training for "
+            << get_architecture_string() << std::endl;
 
-        std::cout << "Layers:\n"
-                  << get_layers_info() << std::endl;
+        out << std::endl;
 
-        std::cout << std::endl;
+        out << "Layers:\n"
+            << get_layers_info() << std::endl;
+
+        out << std::endl;
 
         assert(feature_transformer);
         assert(network);
+
         trainer = Trainer<Network>::create(network.get(), feature_transformer.get());
         rng.seed(PRNG(seed).rand<uint64_t>());
 
         if (Options["SkipLoadingEval"]) {
+            out << "INFO (initialize_training): Performing random net initialization.\n";
             trainer->initialize(rng);
         }
     }
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 03a23c83..91d2aa99 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -3,11 +3,15 @@
 
 #include "learn/learn.h"
 
+#include "misc.h"
+
 // Interface used for learning NNUE evaluation function
 namespace Eval::NNUE {
 
     // Initialize learning
-    void initialize_training(const std::string& seed);
+    void initialize_training(
+        const std::string& seed,
+        SynchronizedRegionLogger::Region& out);
 
     // set the number of samples in the mini-batch
     void set_batch_size(uint64_t size);
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index e734580e..1227efff 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -82,9 +82,9 @@ namespace Eval::NNUE::Layers {
 
         static std::string get_layers_info() {
             std::string info = PreviousLayer::get_layers_info();
-            info += '\n';
+            info += "\n  - ";
             info += std::to_string(kLayerIndex);
-            info += ": ";
+            info += " - ";
             info += get_name();
             return info;
         }
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index 5fbd66cc..40185b13 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -76,9 +76,9 @@ namespace Eval::NNUE::Layers {
 
         static std::string get_layers_info() {
             std::string info = PreviousLayer::get_layers_info();
-            info += '\n';
+            info += "\n  - ";
             info += std::to_string(kLayerIndex);
-            info += ": ";
+            info += " - ";
             info += get_name();
             return info;
         }
diff --git a/src/nnue/layers/input_slice.h b/src/nnue/layers/input_slice.h
index 56c738af..3dc613b9 100644
--- a/src/nnue/layers/input_slice.h
+++ b/src/nnue/layers/input_slice.h
@@ -65,8 +65,9 @@ namespace Eval::NNUE::Layers {
         }
 
         static std::string get_layers_info() {
-            std::string info = std::to_string(kLayerIndex);
-            info += ": ";
+            std::string info = "  - ";
+            info += std::to_string(kLayerIndex);
+            info += " - ";
             info += get_name();
             return info;
         }
diff --git a/src/nnue/layers/sum.h b/src/nnue/layers/sum.h
index 0f71bd61..261dbee1 100644
--- a/src/nnue/layers/sum.h
+++ b/src/nnue/layers/sum.h
@@ -60,9 +60,9 @@ namespace Eval::NNUE::Layers {
 
         static std::string get_layers_info() {
             std::string info = Tail::get_layers_info();
-            info += '\n';
+            info += "\n  - ";
             info += std::to_string(kLayerIndex);
-            info += ": ";
+            info += " - ";
             info += get_name();
             return info;
         }
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 3e18e68a..2089ab1c 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -130,8 +130,9 @@ namespace Eval::NNUE {
         }
 
         static std::string get_layers_info() {
-            std::string info = std::to_string(kLayerIndex);
-            info += ": ";
+            std::string info = "  - ";
+            info += std::to_string(kLayerIndex);
+            info += " - ";
             info += get_name();
             return info;
         }
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index 49b715db..f9bbd833 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -107,7 +107,7 @@ namespace Eval::NNUE {
                 << " - " << LayerType::get_name()
                 << std::endl;
 
-            out << "==> largest min activation = " << largest_min_activation
+            out << "  - largest min activation = " << largest_min_activation
                 << " , smallest max activation = " << smallest_max_activation
                 << std::endl;
 
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 34c423b4..ffde6eba 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -347,17 +347,17 @@ namespace Eval::NNUE {
                 << " - " << LayerType::get_name()
                 << std::endl;
 
-            out << "==> observed " << observed_features.count()
+            out << "  - observed " << observed_features.count()
                 << " (out of " << kInputDimensions << ") features"
                 << std::endl;
 
-            out << "==> (min, max) of pre-activations = "
+            out << "  - (min, max) of pre-activations = "
                 << min_pre_activation_ << ", "
                 << max_pre_activation_ << " (limit = "
                 << kPreActivationLimit << ")"
                 << std::endl;
 
-            out << "==> largest min activation = " << largest_min_activation
+            out << "  - largest min activation = " << largest_min_activation
                 << " , smallest max activation = " << smallest_max_activation
                 << std::endl;
 

From fe766f4f4298ac8cfa62be340f3c94f45f1cd365 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 16:51:31 +0200
Subject: [PATCH 262/398] Additional output from layers during training.

---
 src/nnue/trainer/trainer_affine_transform.h   | 52 ++++++++++++++++---
 src/nnue/trainer/trainer_clipped_relu.h       | 24 ++++++---
 .../trainer/trainer_feature_transformer.h     | 40 +++++++++-----
 3 files changed, 88 insertions(+), 28 deletions(-)

diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index f6d374ef..21e54f18 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -48,6 +48,10 @@ namespace Eval::NNUE {
             if (receive_message("quantize_parameters", message)) {
                 quantize_parameters();
             }
+
+            if (receive_message("check_health", message)) {
+                check_health();
+            }
         }
 
         // Initialize the parameters with random numbers
@@ -145,16 +149,11 @@ namespace Eval::NNUE {
                           &gradients[batch_offset], 1, biases_diff_, 1);
             }
 
-            cblas_saxpy(kOutputDimensions, -local_learning_rate,
-                        biases_diff_, 1, biases_, 1);
-
             cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                         kOutputDimensions, kInputDimensions, batch_size_, 1.0,
                         gradients, kOutputDimensions,
                         batch_input_, kInputDimensions,
                         momentum_, weights_diff_, kInputDimensions);
-            cblas_saxpy(kOutputDimensions * kInputDimensions, -local_learning_rate,
-                        weights_diff_, 1, weights_, 1);
 
 #else
             // backpropagate
@@ -196,16 +195,22 @@ namespace Eval::NNUE {
                     }
                 }
             }
+#endif
 
             for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                biases_[i] -= local_learning_rate * biases_diff_[i];
+                const double d = local_learning_rate * biases_diff_[i];
+                biases_[i] -= d;
+                abs_biases_diff_sum_ += std::abs(d);
             }
+            num_biases_diffs_ += kOutputDimensions;
 
             for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-                weights_[i] -= local_learning_rate * weights_diff_[i];
+                const double d = local_learning_rate * weights_diff_[i];
+                weights_[i] -= d;
+                abs_weights_diff_sum_ += std::abs(d);
             }
+            num_weights_diffs_ += kOutputDimensions * kInputDimensions;
 
-#endif
             previous_layer_trainer_->backpropagate(gradients_.data(), learning_rate);
         }
 
@@ -227,6 +232,30 @@ namespace Eval::NNUE {
             dequantize_parameters();
         }
 
+        void reset_stats() {
+            abs_biases_diff_sum_ = 0.0;
+            abs_weights_diff_sum_ = 0.0;
+            num_biases_diffs_ = 0;
+            num_weights_diffs_ = 0;
+        }
+
+        void check_health() {
+
+            auto out = sync_region_cout.new_region();
+
+            out << "INFO (check_health):"
+                << " layer " << LayerType::kLayerIndex
+                << " - " << LayerType::get_name()
+                << std::endl;
+
+            out << "  - avg_abs_bias_diff   = " << abs_biases_diff_sum_ / num_biases_diffs_ << std::endl;
+            out << "  - avg_abs_weight_diff = " << abs_weights_diff_sum_ / num_weights_diffs_ << std::endl;
+
+            out.unlock();
+
+            reset_stats();
+        }
+
         // Weight saturation and parameterization
         void quantize_parameters() {
             for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
@@ -270,6 +299,8 @@ namespace Eval::NNUE {
                       static_cast<LearnFloatType>(0.0));
             std::fill(std::begin(weights_diff_), std::end(weights_diff_),
                       static_cast<LearnFloatType>(0.0));
+
+            reset_stats();
         }
 
         // number of input/output dimensions
@@ -296,6 +327,11 @@ namespace Eval::NNUE {
         // number of samples in mini-batch
         IndexType batch_size_;
 
+        double abs_biases_diff_sum_;
+        double abs_weights_diff_sum_;
+        uint64_t num_biases_diffs_;
+        uint64_t num_weights_diffs_;
+
         // Input mini batch
         const LearnFloatType* batch_input_;
 
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index f9bbd833..57e9bac4 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -70,10 +70,12 @@ namespace Eval::NNUE {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType i = 0; i < kOutputDimensions; ++i) {
                     const IndexType index = batch_offset + i;
-                    gradients_[index] = gradients[index] *
-                        (output_[index] > kZero) * (output_[index] < kOne);
+                    const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne);
+                    gradients_[index] = gradients[index] * !clipped;
+                    num_clipped_ += clipped;
                 }
             }
+            num_total_ += batch_size_ * kOutputDimensions;
 
             previous_layer_trainer_->backpropagate(gradients_.data(), learning_rate);
         }
@@ -86,10 +88,17 @@ namespace Eval::NNUE {
                 &target_layer->previous_layer_, ft)),
             target_layer_(target_layer) {
 
+            reset_stats();
+        }
+
+        void reset_stats() {
             std::fill(std::begin(min_activations_), std::end(min_activations_),
                       std::numeric_limits<LearnFloatType>::max());
             std::fill(std::begin(max_activations_), std::end(max_activations_),
                       std::numeric_limits<LearnFloatType>::lowest());
+
+            num_clipped_ = 0;
+            num_total_ = 0;
         }
 
         // Check if there are any problems with learning
@@ -111,12 +120,12 @@ namespace Eval::NNUE {
                 << " , smallest max activation = " << smallest_max_activation
                 << std::endl;
 
+            out << "  - clipped " << static_cast<double>(num_clipped_) / num_total_ * 100.0 << "% of outputs"
+                << std::endl;
+
             out.unlock();
 
-            std::fill(std::begin(min_activations_), std::end(min_activations_),
-                      std::numeric_limits<LearnFloatType>::max());
-            std::fill(std::begin(max_activations_), std::end(max_activations_),
-                      std::numeric_limits<LearnFloatType>::lowest());
+            reset_stats();
         }
 
         // number of input/output dimensions
@@ -130,6 +139,9 @@ namespace Eval::NNUE {
         // number of samples in mini-batch
         IndexType batch_size_;
 
+        IndexType num_clipped_;
+        IndexType num_total_;
+
         // Trainer of the previous layer
         const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
 
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index ffde6eba..869ceb85 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -153,10 +153,12 @@ namespace Eval::NNUE {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType i = 0; i < kOutputDimensions; ++i) {
                     const IndexType index = batch_offset + i;
-                    gradients_[index] = gradients[index] *
-                        ((output_[index] > kZero) * (output_[index] < kOne));
+                    const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne);
+                    gradients_[index] = gradients[index] * !clipped;
+                    num_clipped_ += clipped;
                 }
             }
+            num_total_ += batch_->size() * kOutputDimensions;
 
             // Since the weight matrix updates only the columns corresponding to the features that appeared in the input,
             // Correct the learning rate and adjust the scale without using momentum
@@ -261,14 +263,6 @@ namespace Eval::NNUE {
             momentum_(0.2),
             learning_rate_scale_(1.0) {
 
-            min_pre_activation_ = std::numeric_limits<LearnFloatType>::max();
-            max_pre_activation_ = std::numeric_limits<LearnFloatType>::lowest();
-
-            std::fill(std::begin(min_activations_), std::end(min_activations_),
-                      std::numeric_limits<LearnFloatType>::max());
-            std::fill(std::begin(max_activations_), std::end(max_activations_),
-                      std::numeric_limits<LearnFloatType>::lowest());
-
             dequantize_parameters();
         }
 
@@ -299,6 +293,19 @@ namespace Eval::NNUE {
             }
         }
 
+        void reset_stats() {
+            min_pre_activation_ = std::numeric_limits<LearnFloatType>::max();
+            max_pre_activation_ = std::numeric_limits<LearnFloatType>::lowest();
+
+            std::fill(std::begin(min_activations_), std::end(min_activations_),
+                      std::numeric_limits<LearnFloatType>::max());
+            std::fill(std::begin(max_activations_), std::end(max_activations_),
+                      std::numeric_limits<LearnFloatType>::lowest());
+
+            num_clipped_ = 0;
+            num_total_ = 0;
+        }
+
         // read parameterized integer
         void dequantize_parameters() {
             for (IndexType i = 0; i < kHalfDimensions; ++i) {
@@ -314,6 +321,8 @@ namespace Eval::NNUE {
             }
 
             std::fill(std::begin(biases_diff_), std::end(biases_diff_), +kZero);
+
+            reset_stats();
         }
 
         // Set the weight corresponding to the feature that does not appear in the learning data to 0
@@ -361,12 +370,12 @@ namespace Eval::NNUE {
                 << " , smallest max activation = " << smallest_max_activation
                 << std::endl;
 
+            out << "  - clipped " << static_cast<double>(num_clipped_) / num_total_ * 100.0 << "% of outputs"
+                << std::endl;
+
             out.unlock();
 
-            std::fill(std::begin(min_activations_), std::end(min_activations_),
-                      std::numeric_limits<LearnFloatType>::max());
-            std::fill(std::begin(max_activations_), std::end(max_activations_),
-                      std::numeric_limits<LearnFloatType>::lowest());
+            reset_stats();
         }
 
         // number of input/output dimensions
@@ -391,6 +400,9 @@ namespace Eval::NNUE {
         // layer to learn
         LayerType* const target_layer_;
 
+        IndexType num_clipped_;
+        IndexType num_total_;
+
         // parameter
         alignas(kCacheLineSize) LearnFloatType biases_[kHalfDimensions];
         alignas(kCacheLineSize)

From 0e528995c279c773f5e6e5903bc4631586d8d27c Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 20:03:40 +0200
Subject: [PATCH 263/398] Print avg bias/weight for affine trasform and feature
 transformer during training.

---
 src/nnue/trainer/trainer_affine_transform.h    | 11 +++++++++++
 src/nnue/trainer/trainer_feature_transformer.h | 12 ++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index 21e54f18..3179aeb0 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -241,6 +241,15 @@ namespace Eval::NNUE {
 
         void check_health() {
 
+            double abs_bias_sum = 0.0;
+            double abs_weight_sum = 0.0;
+
+            for(auto b : biases_)
+                abs_bias_sum += std::abs(b);
+
+            for(auto w : weights_)
+                abs_weight_sum += std::abs(w);
+
             auto out = sync_region_cout.new_region();
 
             out << "INFO (check_health):"
@@ -248,7 +257,9 @@ namespace Eval::NNUE {
                 << " - " << LayerType::get_name()
                 << std::endl;
 
+            out << "  - avg_abs_bias        = " << abs_bias_sum / std::size(biases_) << std::endl;
             out << "  - avg_abs_bias_diff   = " << abs_biases_diff_sum_ / num_biases_diffs_ << std::endl;
+            out << "  - avg_abs_weight      = " << abs_weight_sum / std::size(weights_) << std::endl;
             out << "  - avg_abs_weight_diff = " << abs_weights_diff_sum_ / num_weights_diffs_ << std::endl;
 
             out.unlock();
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 869ceb85..97b19c46 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -349,6 +349,15 @@ namespace Eval::NNUE {
             const auto smallest_max_activation = *std::min_element(
                 std::begin(max_activations_), std::end(max_activations_));
 
+            double abs_bias_sum = 0.0;
+            double abs_weight_sum = 0.0;
+
+            for(auto b : biases_)
+                abs_bias_sum += std::abs(b);
+
+            for(auto w : weights_)
+                abs_weight_sum += std::abs(w);
+
             auto out = sync_region_cout.new_region();
 
             out << "INFO (check_health):"
@@ -370,6 +379,9 @@ namespace Eval::NNUE {
                 << " , smallest max activation = " << smallest_max_activation
                 << std::endl;
 
+            out << "  - avg_abs_bias   = " << abs_bias_sum / std::size(biases_) << std::endl;
+            out << "  - avg_abs_weight = " << abs_weight_sum / std::size(weights_) << std::endl;
+
             out << "  - clipped " << static_cast<double>(num_clipped_) / num_total_ * 100.0 << "% of outputs"
                 << std::endl;
 

From af238fe132778621125f8406052f1b55f7a6e13b Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 21:18:06 +0200
Subject: [PATCH 264/398] Rewrite gensfen to use stockfish's thread pool.

---
 src/learn/gensfen.cpp | 180 ++++++++++++++++++++----------------------
 1 file changed, 84 insertions(+), 96 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 22fddafb..b2325e40 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -1,7 +1,6 @@
 ﻿#include "gensfen.h"
 
 #include "packed_sfen.h"
-#include "multi_think.h"
 #include "sfen_stream.h"
 
 #include "misc.h"
@@ -261,7 +260,7 @@ namespace Learner
     // -----------------------------------
 
     // Class to generate sfen with multiple threads
-    struct MultiThinkGenSfen : public MultiThink
+    struct MultiThinkGenSfen
     {
         // Hash to limit the export of identical sfens
         static constexpr uint64_t GENSFEN_HASH_SIZE = 64 * 1024 * 1024;
@@ -269,7 +268,7 @@ namespace Learner
         static_assert((GENSFEN_HASH_SIZE& (GENSFEN_HASH_SIZE - 1)) == 0);
 
         MultiThinkGenSfen(int search_depth_min_, int search_depth_max_, SfenWriter& sw_, const std::string& seed) :
-            MultiThink(seed),
+            prng(seed),
             search_depth_min(search_depth_min_),
             search_depth_max(search_depth_max_),
             sfen_writer(sw_)
@@ -285,7 +284,9 @@ namespace Learner
             sfen_writer.start_file_write_worker();
         }
 
-        void thread_worker(size_t thread_id) override;
+        void gensfen(uint64_t limit);
+
+        void thread_worker(Thread& th, std::atomic<uint64_t>& counter, uint64_t limit);
 
         optional<int8_t> get_current_game_result(
             Position& pos,
@@ -293,7 +294,14 @@ namespace Learner
 
         vector<uint8_t> generate_random_move_flags();
 
-        bool commit_psv(PSVector& a_psv, size_t thread_id, int8_t lastTurnIsWin);
+        bool was_seen_before(const Position& pos);
+
+        bool commit_psv(
+            Thread& th,
+            PSVector& sfens,
+            int8_t lastTurnIsWin,
+            std::atomic<uint64_t>& counter,
+            uint64_t limit);
 
         optional<Move> choose_random_move(
             Position& pos,
@@ -301,6 +309,8 @@ namespace Learner
             int ply,
             int& random_move_c);
 
+        PRNG prng;
+
         // Min and max depths for search during gensfen
         int search_depth_min;
         int search_depth_max;
@@ -347,6 +357,15 @@ namespace Learner
         vector<Key> hash; // 64MB*sizeof(HASH_KEY) = 512MB
     };
 
+    void MultiThinkGenSfen::gensfen(uint64_t limit)
+    {
+        std::atomic<uint64_t> counter{0};
+        Threads.execute_with_workers([&counter, limit, this](Thread& th) {
+            thread_worker(th, counter, limit);
+        });
+        Threads.wait_for_workers_finished();
+    }
+
     optional<int8_t> MultiThinkGenSfen::get_current_game_result(
         Position& pos,
         const vector<int>& move_hist_scores) const
@@ -470,7 +489,12 @@ namespace Learner
     // 1 when winning. -1 when losing. Pass 0 for a draw.
     // Return value: true if the specified number of
     // sfens has already been reached and the process ends.
-    bool MultiThinkGenSfen::commit_psv(PSVector& sfens, size_t thread_id, int8_t lastTurnIsWin)
+    bool MultiThinkGenSfen::commit_psv(
+        Thread& th,
+        PSVector& sfens,
+        int8_t lastTurnIsWin,
+        std::atomic<uint64_t>& counter,
+        uint64_t limit)
     {
         if (!write_out_draw_game_in_training_data_generation && lastTurnIsWin == 0)
         {
@@ -482,34 +506,26 @@ namespace Learner
 
         // From the final stage (one step before) to the first stage, give information on the outcome of the game for each stage.
         // The phases stored in sfens are assumed to be continuous (in order).
-        bool quit = false;
-        int num_sfens_to_commit = 0;
         for (auto it = sfens.rbegin(); it != sfens.rend(); ++it)
         {
             // If is_win == 0 (draw), multiply by -1 and it will remain 0 (draw)
             is_win = -is_win;
             it->game_result = is_win;
-
-            // See how many sfens were already written and get the next id.
-            // Exit if requested number of sfens reached.
-            auto now_loop_count = get_next_loop_count();
-            if (now_loop_count == LOOP_COUNT_FINISHED)
-            {
-                quit = true;
-                break;
-            }
-
-            ++num_sfens_to_commit;
         }
 
         // Write sfens in move order to make potential compression easier
-        for (auto it = sfens.end() - num_sfens_to_commit; it != sfens.end(); ++it)
+        for (auto& sfen : sfens)
         {
+            // Return true if there is already enough data generated.
+            const auto iter = counter.fetch_add(1);
+            if (iter >= limit)
+                return true;
+
             // Write out one sfen.
-            sfen_writer.write(thread_id, *it);
+            sfen_writer.write(th.thread_idx(), sfen);
         }
 
-        return quit;
+        return false;
     }
 
     optional<Move> MultiThinkGenSfen::choose_random_move(
@@ -640,8 +656,29 @@ namespace Learner
         return random_move_flag;
     }
 
+    bool MultiThinkGenSfen::was_seen_before(const Position& pos)
+    {
+        // Look into the position hashtable to see if the same
+        // position was seen before.
+        // This is a good heuristic to exlude already seen
+        // positions without many false positives.
+        auto key = pos.key();
+        auto hash_index = (size_t)(key & (GENSFEN_HASH_SIZE - 1));
+        auto old_key = hash[hash_index];
+        if (key == old_key)
+        {
+            return true;
+        }
+        else
+        {
+            // Replace with the current key.
+            hash[hash_index] = key;
+            return false;
+        }
+    }
+
     // thread_id = 0..Threads.size()-1
-    void MultiThinkGenSfen::thread_worker(size_t thread_id)
+    void MultiThinkGenSfen::thread_worker(Thread& th, std::atomic<uint64_t>& counter, uint64_t limit)
     {
         // For the time being, it will be treated as a draw
         // at the maximum number of steps to write.
@@ -660,10 +697,8 @@ namespace Learner
             // It is necessary to set a dependent thread for Position.
             // When parallelizing, Threads (since this is a vector<Thread*>,
             // Do the same for up to Threads[0]...Threads[thread_num-1].
-            auto th = Threads[thread_id];
-
-            auto& pos = th->rootPos;
-            pos.set(StartFEN, false, &si, th);
+            auto& pos = th.rootPos;
+            pos.set(StartFEN, false, &si, &th);
 
             int resign_counter = 0;
             bool should_resign = prng.rand(10) > 1;
@@ -684,13 +719,11 @@ namespace Learner
             vector<int> move_hist_scores;
 
             auto flush_psv = [&](int8_t result) {
-                quit = commit_psv(a_psv, thread_id, result);
+                quit = commit_psv(th, a_psv, result, counter, limit);
             };
 
             for (int ply = 0; ; ++ply)
             {
-                Move next_move = MOVE_NONE;
-
                 // Current search depth
                 const int depth = search_depth_min + (int)prng.rand(search_depth_max - search_depth_min + 1);
 
@@ -715,18 +748,17 @@ namespace Learner
                         flush_psv((search_value >= eval_limit) ? 1 : -1);
                         break;
                     }
-                } else {
+                }
+                else
+                {
                     resign_counter = 0;
                 }
-                // Verification of a strange move
-                if (search_pv.size() > 0
-                    && (search_pv[0] == MOVE_NONE || search_pv[0] == MOVE_NULL))
+
+                // In case there is no PV and the game was not ended here
+                // there is nothing we can do, we can't continue the game,
+                // we don't know the result, so discard this game.
+                if (search_pv.empty())
                 {
-                    // (???)
-                    // MOVE_WIN is checking if it is the declaration victory stage before this
-                    // The declarative winning move should never come back here.
-                    // Also, when MOVE_RESIGN, search_value is a one-stop score, which should be the minimum value of eval_limit (-31998)...
-                    cout << "Error! : " << pos.fen() << next_move << search_value << endl;
                     break;
                 }
 
@@ -736,34 +768,10 @@ namespace Learner
                 // Discard stuff before write_minply is reached
                 // because it can harm training due to overfitting.
                 // Initial positions would be too common.
-                if (ply < write_minply - 1)
-                {
-                    a_psv.clear();
-                    goto SKIP_SAVE;
-                }
-
-                // Look into the position hashtable to see if the same
-                // position was seen before.
-                // This is a good heuristic to exlude already seen
-                // positions without many false positives.
-                {
-                    auto key = pos.key();
-                    auto hash_index = (size_t)(key & (GENSFEN_HASH_SIZE - 1));
-                    auto old_key = hash[hash_index];
-                    if (key == old_key)
-                    {
-                        goto SKIP_SAVE;
-                    }
-                    else
-                    {
-                        // Replace with the current key.
-                        hash[hash_index] = key;
-                    }
-                }
-
-                // Pack the current position into a packed sfen and save it into the buffer.
+                if (ply >= write_minply && !was_seen_before(pos))
                 {
                     a_psv.emplace_back(PackedSfenValue());
+
                     auto& psv = a_psv.back();
 
                     // Here we only write the position data.
@@ -771,48 +779,29 @@ namespace Learner
                     pos.sfen_pack(psv.sfen);
 
                     psv.score = search_value;
-
                     psv.gamePly = ply;
-
-                    // Take out the first PV move. This should be present unless depth 0.
-                    assert(search_pv.size() >= 1);
                     psv.move = search_pv[0];
                 }
 
-            SKIP_SAVE:;
+                // Update the next move according to best search result or random move.
+                auto random_move = choose_random_move(pos, random_move_flag, ply, actual_random_move_count);
+                const Move next_move = random_move.has_value() ? *random_move : search_pv[0];
 
-                // For some reason, We could not get PV (hit the substitution table etc. and got stuck?)
-                // so go to the next game. It's a rare case, so you can ignore it.
-                if (search_pv.size() == 0)
+                // We don't have the whole game yet, but it ended,
+                // so the writing process ends and the next game starts.
+                // This shouldn't really happen.
+                if (!is_ok(next_move))
                 {
                     break;
                 }
 
-                // Update the next move according to best search result.
-                next_move = search_pv[0];
-
-                // Random move.
-                auto random_move = choose_random_move(pos, random_move_flag, ply, actual_random_move_count);
-                if (random_move.has_value())
-                {
-                    next_move = random_move.value();
-
-                    // We don't have the whole game yet, but it ended,
-                    // so the writing process ends and the next game starts.
-                    if (!is_ok(next_move))
-                    {
-                        break;
-                    }
-                }
-
                 // Do move.
                 pos.do_move(next_move, states[ply]);
 
-            } // for (int ply = 0; ; ++ply)
+            }
+        }
 
-        } // while(!quit)
-
-        sfen_writer.finalize(thread_id);
+        sfen_writer.finalize(th.thread_idx());
     }
 
     // -----------------------------------
@@ -1029,7 +1018,6 @@ namespace Learner
 
             MultiThinkGenSfen multi_think(search_depth_min, search_depth_max, sfen_writer, seed);
             multi_think.nodes = nodes;
-            multi_think.set_loop_max(loop_max);
             multi_think.eval_limit = eval_limit;
             multi_think.random_move_minply = random_move_minply;
             multi_think.random_move_maxply = random_move_maxply;
@@ -1041,7 +1029,7 @@ namespace Learner
             multi_think.write_minply = write_minply;
             multi_think.write_maxply = write_maxply;
             multi_think.start_file_write_worker();
-            multi_think.go_think();
+            multi_think.gensfen(loop_max);
 
             // Since we are joining with the destructor of SfenWriter, please give a message that it has finished after the join
             // Enclose this in a block because it should be displayed.

From 821b655bc63515effbdebbba277f0a45cc463be3 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 21:51:22 +0200
Subject: [PATCH 265/398] Move gensfen progress reporting from sfen writer to
 gensfen

---
 src/learn/gensfen.cpp | 93 ++++++++++++++++++++++++++++---------------
 1 file changed, 60 insertions(+), 33 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index b2325e40..c661200e 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -134,20 +134,6 @@ namespace Learner
         // Dedicated thread to write to file
         void file_write_worker()
         {
-            auto startTime = now();
-
-            auto output_status = [&]()
-            {
-                // Also output the current time to console.
-                const auto nowTime = now();
-                const TimePoint elapsed = nowTime - startTime + 1;
-
-                sync_cout << endl
-                    << sfen_write_count << " sfens, "
-                    << sfen_write_count * 1000 / elapsed << " sfens/second, "
-                    << "at " << now_string() << sync_endl;
-            };
-
             while (!finished || sfen_buffers_pool.size())
             {
                 vector<std::unique_ptr<PSVector>> buffers;
@@ -190,28 +176,9 @@ namespace Learner
                             output_file_stream = create_new_sfen_output(new_filename, sfen_output_type);
                             cout << endl << "output sfen file = " << new_filename << endl;
                         }
-
-                        // Output '.' every time when writing a game record.
-                        std::cout << ".";
-
-                        // Output the number of phases processed
-                        // every STATUS_OUTPUT_PERIOD times
-                        // Finally, the remainder of the teacher phase
-                        // of each thread is written out,
-                        // so halfway numbers are displayed, but is it okay?
-                        // If you overuse the threads to the maximum number
-                        // of logical cores, the console will be clogged,
-                        // so it may be beneficial to increase that value.
-                        if ((++batch_counter % STATUS_OUTPUT_PERIOD) == 0)
-                        {
-                            output_status();
-                        }
                     }
                 }
             }
-
-            // Output the status again after whole processing is done.
-            output_status();
         }
 
         void set_save_interval(uint64_t v)
@@ -267,6 +234,10 @@ namespace Learner
         // It must be 2**N because it will be used as the mask to calculate hash_index.
         static_assert((GENSFEN_HASH_SIZE& (GENSFEN_HASH_SIZE - 1)) == 0);
 
+        static constexpr uint64_t REPORT_DOT_EVERY = 5000;
+        static constexpr uint64_t REPORT_STATS_EVERY = 200000;
+        static_assert(REPORT_STATS_EVERY % REPORT_DOT_EVERY == 0);
+
         MultiThinkGenSfen(int search_depth_min_, int search_depth_max_, SfenWriter& sw_, const std::string& seed) :
             prng(seed),
             search_depth_min(search_depth_min_),
@@ -296,6 +267,10 @@ namespace Learner
 
         bool was_seen_before(const Position& pos);
 
+        void report(uint64_t done, uint64_t new_done);
+
+        void maybe_report(uint64_t done);
+
         bool commit_psv(
             Thread& th,
             PSVector& sfens,
@@ -351,6 +326,9 @@ namespace Learner
         int write_minply;
         int write_maxply;
 
+        std::mutex stats_mutex;
+        TimePoint last_stats_report_time;
+
         // sfen exporter
         SfenWriter& sfen_writer;
 
@@ -359,11 +337,20 @@ namespace Learner
 
     void MultiThinkGenSfen::gensfen(uint64_t limit)
     {
+        last_stats_report_time = 0;
+
         std::atomic<uint64_t> counter{0};
         Threads.execute_with_workers([&counter, limit, this](Thread& th) {
             thread_worker(th, counter, limit);
         });
         Threads.wait_for_workers_finished();
+
+        if (limit % REPORT_STATS_EVERY != 0)
+        {
+            report(limit, limit % REPORT_STATS_EVERY);
+        }
+
+        std::cout << std::endl;
     }
 
     optional<int8_t> MultiThinkGenSfen::get_current_game_result(
@@ -484,6 +471,43 @@ namespace Learner
         return nullopt;
     }
 
+    void MultiThinkGenSfen::report(uint64_t done, uint64_t new_done)
+    {
+        const auto now_time = now();
+        const TimePoint elapsed = now_time - last_stats_report_time + 1;
+
+        sync_cout
+            << endl
+            << done << " sfens, "
+            << new_done * 1000 / elapsed << " sfens/second, "
+            << "at " << now_string() << sync_endl;
+
+        last_stats_report_time = now_time;
+    }
+
+    void MultiThinkGenSfen::maybe_report(uint64_t done)
+    {
+        if (done % REPORT_DOT_EVERY == 0)
+        {
+            std::lock_guard lock(stats_mutex);
+
+            if (last_stats_report_time == 0)
+            {
+                last_stats_report_time = now();
+            }
+
+            if (done != 0)
+            {
+                std::cout << '.';
+
+                if (done % REPORT_STATS_EVERY == 0)
+                {
+                    report(done, REPORT_STATS_EVERY);
+                }
+            }
+        }
+    }
+
     // Write out the phases loaded in sfens to a file.
     // lastTurnIsWin: win/loss in the next phase after the final phase in sfens
     // 1 when winning. -1 when losing. Pass 0 for a draw.
@@ -521,6 +545,9 @@ namespace Learner
             if (iter >= limit)
                 return true;
 
+            // because `iter` was done, now we do one more
+            maybe_report(iter + 1);
+
             // Write out one sfen.
             sfen_writer.write(th.thread_idx(), sfen);
         }

From 3f289546da73b96f48c913de539088cde9d64a65 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 21:53:16 +0200
Subject: [PATCH 266/398] Make some gensfen members private.

---
 src/learn/gensfen.cpp | 67 ++++++++++++++++++++++++-------------------
 1 file changed, 37 insertions(+), 30 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index c661200e..08b9c3d9 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -257,35 +257,6 @@ namespace Learner
 
         void gensfen(uint64_t limit);
 
-        void thread_worker(Thread& th, std::atomic<uint64_t>& counter, uint64_t limit);
-
-        optional<int8_t> get_current_game_result(
-            Position& pos,
-            const vector<int>& move_hist_scores) const;
-
-        vector<uint8_t> generate_random_move_flags();
-
-        bool was_seen_before(const Position& pos);
-
-        void report(uint64_t done, uint64_t new_done);
-
-        void maybe_report(uint64_t done);
-
-        bool commit_psv(
-            Thread& th,
-            PSVector& sfens,
-            int8_t lastTurnIsWin,
-            std::atomic<uint64_t>& counter,
-            uint64_t limit);
-
-        optional<Move> choose_random_move(
-            Position& pos,
-            std::vector<uint8_t>& random_move_flag,
-            int ply,
-            int& random_move_c);
-
-        PRNG prng;
-
         // Min and max depths for search during gensfen
         int search_depth_min;
         int search_depth_max;
@@ -326,6 +297,9 @@ namespace Learner
         int write_minply;
         int write_maxply;
 
+    private:
+        PRNG prng;
+
         std::mutex stats_mutex;
         TimePoint last_stats_report_time;
 
@@ -333,6 +307,36 @@ namespace Learner
         SfenWriter& sfen_writer;
 
         vector<Key> hash; // 64MB*sizeof(HASH_KEY) = 512MB
+
+        void gensfen_worker(
+            Thread& th,
+            std::atomic<uint64_t>& counter,
+            uint64_t limit);
+
+        optional<int8_t> get_current_game_result(
+            Position& pos,
+            const vector<int>& move_hist_scores) const;
+
+        vector<uint8_t> generate_random_move_flags();
+
+        bool was_seen_before(const Position& pos);
+
+        void report(uint64_t done, uint64_t new_done);
+
+        void maybe_report(uint64_t done);
+
+        bool commit_psv(
+            Thread& th,
+            PSVector& sfens,
+            int8_t lastTurnIsWin,
+            std::atomic<uint64_t>& counter,
+            uint64_t limit);
+
+        optional<Move> choose_random_move(
+            Position& pos,
+            std::vector<uint8_t>& random_move_flag,
+            int ply,
+            int& random_move_c);
     };
 
     void MultiThinkGenSfen::gensfen(uint64_t limit)
@@ -705,7 +709,10 @@ namespace Learner
     }
 
     // thread_id = 0..Threads.size()-1
-    void MultiThinkGenSfen::thread_worker(Thread& th, std::atomic<uint64_t>& counter, uint64_t limit)
+    void MultiThinkGenSfen::gensfen_worker(
+        Thread& th,
+        std::atomic<uint64_t>& counter,
+        uint64_t limit)
     {
         // For the time being, it will be treated as a draw
         // at the maximum number of steps to write.

From cb61dc9c9b92c4bfab5f5d3f82d021f4d94b69a2 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 22:00:52 +0200
Subject: [PATCH 267/398] Make sfen writer a part of gensfen.

---
 src/learn/gensfen.cpp | 51 +++++++++++++++----------------------------
 1 file changed, 17 insertions(+), 34 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 08b9c3d9..6f759db3 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -49,20 +49,19 @@ namespace Learner
         // Amount of sfens required to flush the buffer.
         static constexpr size_t SFEN_WRITE_SIZE = 5000;
 
-        // Current status is output after
-        // each (SFEN_WRITE_SIZE * STATUS_OUTPUT_PERIOD) sfens
-        static constexpr uint64_t STATUS_OUTPUT_PERIOD = 40;
-
         // File name to write and number of threads to create
-        SfenWriter(string filename_, int thread_num)
+        SfenWriter(string filename_, int thread_num, uint64_t save_count)
         {
             sfen_buffers_pool.reserve((size_t)thread_num * 10);
             sfen_buffers.resize(thread_num);
 
             output_file_stream = create_new_sfen_output(filename_, sfen_output_type);
             filename = filename_;
+            save_every = save_count;
 
             finished = false;
+
+            file_worker_thread = std::thread([&] { this->file_write_worker(); });
         }
 
         ~SfenWriter()
@@ -125,12 +124,6 @@ namespace Learner
             }
         }
 
-        // Start the write_worker thread.
-        void start_file_write_worker()
-        {
-            file_worker_thread = std::thread([&] { this->file_write_worker(); });
-        }
-
         // Dedicated thread to write to file
         void file_write_worker()
         {
@@ -181,11 +174,6 @@ namespace Learner
             }
         }
 
-        void set_save_interval(uint64_t v)
-        {
-            save_every = v;
-        }
-
     private:
 
         std::unique_ptr<BasicSfenOutputStream> output_file_stream;
@@ -202,9 +190,6 @@ namespace Learner
         // Flag that all threads have finished
         atomic<bool> finished;
 
-        // Counter for time stamp output
-        uint64_t batch_counter = 0;
-
         // buffer before writing to file
         // sfen_buffers is the buffer for each thread
         // sfen_buffers_pool is a buffer for writing.
@@ -238,11 +223,18 @@ namespace Learner
         static constexpr uint64_t REPORT_STATS_EVERY = 200000;
         static_assert(REPORT_STATS_EVERY % REPORT_DOT_EVERY == 0);
 
-        MultiThinkGenSfen(int search_depth_min_, int search_depth_max_, SfenWriter& sw_, const std::string& seed) :
-            prng(seed),
+        MultiThinkGenSfen(
+            int search_depth_min_,
+            int search_depth_max_,
+            std::string output_file_name,
+            int thread_num,
+            uint64_t save_every,
+            const std::string& seed
+        ) :
             search_depth_min(search_depth_min_),
             search_depth_max(search_depth_max_),
-            sfen_writer(sw_)
+            prng(seed),
+            sfen_writer(output_file_name, thread_num, save_every)
         {
             hash.resize(GENSFEN_HASH_SIZE);
 
@@ -250,11 +242,6 @@ namespace Learner
             std::cout << prng << std::endl;
         }
 
-        void start_file_write_worker()
-        {
-            sfen_writer.start_file_write_worker();
-        }
-
         void gensfen(uint64_t limit);
 
         // Min and max depths for search during gensfen
@@ -304,7 +291,7 @@ namespace Learner
         TimePoint last_stats_report_time;
 
         // sfen exporter
-        SfenWriter& sfen_writer;
+        SfenWriter sfen_writer;
 
         vector<Key> hash; // 64MB*sizeof(HASH_KEY) = 512MB
 
@@ -345,7 +332,7 @@ namespace Learner
 
         std::atomic<uint64_t> counter{0};
         Threads.execute_with_workers([&counter, limit, this](Thread& th) {
-            thread_worker(th, counter, limit);
+            gensfen_worker(th, counter, limit);
         });
         Threads.wait_for_workers_finished();
 
@@ -1047,10 +1034,7 @@ namespace Learner
 
         // Create and execute threads as many as Options["Threads"].
         {
-            SfenWriter sfen_writer(output_file_name, thread_num);
-            sfen_writer.set_save_interval(save_every);
-
-            MultiThinkGenSfen multi_think(search_depth_min, search_depth_max, sfen_writer, seed);
+            MultiThinkGenSfen multi_think(search_depth_min, search_depth_max, output_file_name, thread_num, save_every, seed);
             multi_think.nodes = nodes;
             multi_think.eval_limit = eval_limit;
             multi_think.random_move_minply = random_move_minply;
@@ -1062,7 +1046,6 @@ namespace Learner
             multi_think.random_multi_pv_depth = random_multi_pv_depth;
             multi_think.write_minply = write_minply;
             multi_think.write_maxply = write_maxply;
-            multi_think.start_file_write_worker();
             multi_think.gensfen(loop_max);
 
             // Since we are joining with the destructor of SfenWriter, please give a message that it has finished after the join

From 21fac7c53cca6b48ba8e3e1cb913120e5fffdb44 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 22:27:01 +0200
Subject: [PATCH 268/398] A collective struct for gensfen parameters.

---
 src/learn/gensfen.cpp | 381 +++++++++++++++++++-----------------------
 1 file changed, 171 insertions(+), 210 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 6f759db3..d69fcf53 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -37,12 +37,6 @@ using namespace std;
 
 namespace Learner
 {
-    static bool write_out_draw_game_in_training_data_generation = true;
-    static bool detect_draw_by_consecutive_low_score = true;
-    static bool detect_draw_by_insufficient_mating_material = true;
-
-    static SfenOutputType sfen_output_type = SfenOutputType::Bin;
-
     // Helper class for exporting Sfen
     struct SfenWriter
     {
@@ -50,12 +44,13 @@ namespace Learner
         static constexpr size_t SFEN_WRITE_SIZE = 5000;
 
         // File name to write and number of threads to create
-        SfenWriter(string filename_, int thread_num, uint64_t save_count)
+        SfenWriter(string filename_, int thread_num, uint64_t save_count, SfenOutputType sfen_output_type)
         {
             sfen_buffers_pool.reserve((size_t)thread_num * 10);
             sfen_buffers.resize(thread_num);
 
-            output_file_stream = create_new_sfen_output(filename_, sfen_output_type);
+            sfen_format = sfen_output_type;
+            output_file_stream = create_new_sfen_output(filename_, sfen_format);
             filename = filename_;
             save_every = save_count;
 
@@ -166,7 +161,7 @@ namespace Learner
                             // Add ios::app in consideration of overwriting.
                             // (Depending on the operation, it may not be necessary.)
                             string new_filename = filename + "_" + std::to_string(n);
-                            output_file_stream = create_new_sfen_output(new_filename, sfen_output_type);
+                            output_file_stream = create_new_sfen_output(new_filename, sfen_format);
                             cout << endl << "output sfen file = " << new_filename << endl;
                         }
                     }
@@ -190,6 +185,8 @@ namespace Learner
         // Flag that all threads have finished
         atomic<bool> finished;
 
+        SfenOutputType sfen_format;
+
         // buffer before writing to file
         // sfen_buffers is the buffer for each thread
         // sfen_buffers_pool is a buffer for writing.
@@ -214,6 +211,74 @@ namespace Learner
     // Class to generate sfen with multiple threads
     struct MultiThinkGenSfen
     {
+        struct Params
+        {
+            // Min and max depths for search during gensfen
+            int search_depth_min = 3;
+            int search_depth_max = -1;
+
+            // Number of the nodes to be searched.
+            // 0 represents no limits.
+            uint64_t nodes = 0;
+
+            // Upper limit of evaluation value of generated situation
+            int eval_limit = 3000;
+
+            // minimum ply with random move
+            // maximum ply with random move
+            // Number of random moves in one station
+            int random_move_minply = 1;
+            int random_move_maxply = 24;
+            int random_move_count = 5;
+
+            // Move kings with a probability of 1/N when randomly moving like Apery software.
+            // When you move the king again, there is a 1/N chance that it will randomly moved
+            // once in the opponent's turn.
+            // Apery has N=2. Specifying 0 here disables this function.
+            int random_move_like_apery = 0;
+
+            // For when using multi pv instead of random move.
+            // random_multi_pv is the number of candidates for MultiPV.
+            // When adopting the move of the candidate move, the difference
+            // between the evaluation value of the move of the 1st place
+            // and the evaluation value of the move of the Nth place is.
+            // Must be in the range random_multi_pv_diff.
+            // random_multi_pv_depth is the search depth for MultiPV.
+            int random_multi_pv = 0;
+            int random_multi_pv_diff = 32000;
+            int random_multi_pv_depth = -1;
+
+            // The minimum and maximum ply (number of steps from
+            // the initial phase) of the sfens to write out.
+            int write_minply = 16;
+            int write_maxply = 400;
+
+            uint64_t save_every = std::numeric_limits<uint64_t>::max();
+
+            std::string output_file_name = "generated_kifu";
+
+            SfenOutputType sfen_format = SfenOutputType::Binpack;
+
+            std::string seed;
+
+            bool write_out_draw_game_in_training_data_generation = true;
+            bool detect_draw_by_consecutive_low_score = true;
+            bool detect_draw_by_insufficient_mating_material = true;
+
+            uint64_t num_threads;
+
+            void enforce_constraints()
+            {
+                search_depth_max = std::max(search_depth_min, search_depth_max);
+                random_multi_pv_depth = std::max(search_depth_min, random_multi_pv_depth);
+
+                // Limit the maximum to a one-stop score. (Otherwise you might not end the loop)
+                eval_limit = std::min(eval_limit, (int)mate_in(2));
+
+                num_threads = Options["Threads"];
+            }
+        };
+
         // Hash to limit the export of identical sfens
         static constexpr uint64_t GENSFEN_HASH_SIZE = 64 * 1024 * 1024;
         // It must be 2**N because it will be used as the mask to calculate hash_index.
@@ -224,17 +289,11 @@ namespace Learner
         static_assert(REPORT_STATS_EVERY % REPORT_DOT_EVERY == 0);
 
         MultiThinkGenSfen(
-            int search_depth_min_,
-            int search_depth_max_,
-            std::string output_file_name,
-            int thread_num,
-            uint64_t save_every,
-            const std::string& seed
+            const Params& prm
         ) :
-            search_depth_min(search_depth_min_),
-            search_depth_max(search_depth_max_),
-            prng(seed),
-            sfen_writer(output_file_name, thread_num, save_every)
+            params(prm),
+            prng(prm.seed),
+            sfen_writer(prm.output_file_name, prm.num_threads, prm.save_every, prm.sfen_format)
         {
             hash.resize(GENSFEN_HASH_SIZE);
 
@@ -244,47 +303,9 @@ namespace Learner
 
         void gensfen(uint64_t limit);
 
-        // Min and max depths for search during gensfen
-        int search_depth_min;
-        int search_depth_max;
-
-        // Number of the nodes to be searched.
-        // 0 represents no limits.
-        uint64_t nodes;
-
-        // Upper limit of evaluation value of generated situation
-        int eval_limit;
-
-        // minimum ply with random move
-        // maximum ply with random move
-        // Number of random moves in one station
-        int random_move_minply;
-        int random_move_maxply;
-        int random_move_count;
-
-        // Move kings with a probability of 1/N when randomly moving like Apery software.
-        // When you move the king again, there is a 1/N chance that it will randomly moved
-        // once in the opponent's turn.
-        // Apery has N=2. Specifying 0 here disables this function.
-        int random_move_like_apery;
-
-        // For when using multi pv instead of random move.
-        // random_multi_pv is the number of candidates for MultiPV.
-        // When adopting the move of the candidate move, the difference
-        // between the evaluation value of the move of the 1st place
-        // and the evaluation value of the move of the Nth place is.
-        // Must be in the range random_multi_pv_diff.
-        // random_multi_pv_depth is the search depth for MultiPV.
-        int random_multi_pv;
-        int random_multi_pv_diff;
-        int random_multi_pv_depth;
-
-        // The minimum and maximum ply (number of steps from
-        // the initial phase) of the sfens to write out.
-        int write_minply;
-        int write_maxply;
-
     private:
+        Params params;
+
         PRNG prng;
 
         std::mutex stats_mutex;
@@ -365,7 +386,7 @@ namespace Learner
         const int ply = move_hist_scores.size();
 
         // has it reached the max length or is a draw
-        if (ply >= write_maxply || pos.is_draw(ply))
+        if (ply >= params.write_maxply || pos.is_draw(ply))
         {
             return 0;
         }
@@ -379,7 +400,7 @@ namespace Learner
         }
 
         // Adjudicate game to a draw if the last 4 scores of each engine is 0.
-        if (detect_draw_by_consecutive_low_score)
+        if (params.detect_draw_by_consecutive_low_score)
         {
             if (ply >= adj_draw_ply)
             {
@@ -414,7 +435,7 @@ namespace Learner
         }
 
         // Draw by insufficient mating material
-        if (detect_draw_by_insufficient_mating_material)
+        if (params.detect_draw_by_insufficient_mating_material)
         {
             if (pos.count<ALL_PIECES>() <= 4)
             {
@@ -511,7 +532,7 @@ namespace Learner
         std::atomic<uint64_t>& counter,
         uint64_t limit)
     {
-        if (!write_out_draw_game_in_training_data_generation && lastTurnIsWin == 0)
+        if (!params.write_out_draw_game_in_training_data_generation && lastTurnIsWin == 0)
         {
             // We didn't write anything so why quit.
             return false;
@@ -557,21 +578,21 @@ namespace Learner
         // Randomly choose one from legal move
         if (
             // 1. Random move of random_move_count times from random_move_minply to random_move_maxply
-            (random_move_minply != -1 && ply < (int)random_move_flag.size() && random_move_flag[ply]) ||
+            (params.random_move_minply != -1 && ply < (int)random_move_flag.size() && random_move_flag[ply]) ||
             // 2. A mode to perform random move of random_move_count times after leaving the startpos
-            (random_move_minply == -1 && random_move_c < random_move_count))
+            (params.random_move_minply == -1 && random_move_c < params.random_move_count))
         {
             ++random_move_c;
 
             // It's not a mate, so there should be one legal move...
-            if (random_multi_pv == 0)
+            if (params.random_multi_pv == 0)
             {
                 // Normal random move
                 MoveList<LEGAL> list(pos);
 
                 // I don't really know the goodness and badness of making this the Apery method.
-                if (random_move_like_apery == 0
-                    || prng.rand(random_move_like_apery) != 0)
+                if (params.random_move_like_apery == 0
+                    || prng.rand(params.random_move_like_apery) != 0)
                 {
                     // Normally one move from legal move
                     random_move = list.at((size_t)prng.rand((uint64_t)list.size()));
@@ -612,18 +633,18 @@ namespace Learner
             }
             else
             {
-                Search::search(pos, random_multi_pv_depth, random_multi_pv);
+                Search::search(pos, params.random_multi_pv_depth, params.random_multi_pv);
 
                 // Select one from the top N hands of root Moves
                 auto& rm = pos.this_thread()->rootMoves;
 
-                uint64_t s = min((uint64_t)rm.size(), (uint64_t)random_multi_pv);
+                uint64_t s = min((uint64_t)rm.size(), (uint64_t)params.random_multi_pv);
                 for (uint64_t i = 1; i < s; ++i)
                 {
                     // The difference from the evaluation value of rm[0] must
                     // be within the range of random_multi_pv_diff.
                     // It can be assumed that rm[x].score is arranged in descending order.
-                    if (rm[0].score > rm[i].score + random_multi_pv_diff)
+                    if (rm[0].score > rm[i].score + params.random_multi_pv_diff)
                     {
                         s = i;
                         break;
@@ -651,21 +672,21 @@ namespace Learner
         // to shuffle the first N pieces with Fisher-Yates.
 
         vector<int> a;
-        a.reserve((size_t)random_move_maxply);
+        a.reserve((size_t)params.random_move_maxply);
 
         // random_move_minply ,random_move_maxply is specified by 1 origin,
         // Note that we are handling 0 origin here.
-        for (int i = std::max(random_move_minply - 1, 0); i < random_move_maxply; ++i)
+        for (int i = std::max(params.random_move_minply - 1, 0); i < params.random_move_maxply; ++i)
         {
             a.push_back(i);
         }
 
         // In case of Apery random move, insert() may be called random_move_count times.
         // Reserve only the size considering it.
-        random_move_flag.resize((size_t)random_move_maxply + random_move_count);
+        random_move_flag.resize((size_t)params.random_move_maxply + params.random_move_count);
 
         // A random move that exceeds the size() of a[] cannot be applied, so limit it.
-        for (int i = 0; i < std::min(random_move_count, (int)a.size()); ++i)
+        for (int i = 0; i < std::min(params.random_move_count, (int)a.size()); ++i)
         {
             swap(a[i], a[prng.rand((uint64_t)a.size() - i) + i]);
             random_move_flag[a[i]] = true;
@@ -705,7 +726,7 @@ namespace Learner
         // at the maximum number of steps to write.
         // Maximum StateInfo + Search PV to advance to leaf buffer
         std::vector<StateInfo, AlignedAllocator<StateInfo>> states(
-            write_maxply + MAX_PLY /* == search_depth_min + α */);
+            params.write_maxply + MAX_PLY /* == search_depth_min + α */);
 
         StateInfo si;
 
@@ -725,7 +746,7 @@ namespace Learner
             bool should_resign = prng.rand(10) > 1;
             // Vector for holding the sfens in the current simulated game.
             PSVector a_psv;
-            a_psv.reserve(write_maxply + MAX_PLY);
+            a_psv.reserve(params.write_maxply + MAX_PLY);
 
             // Precomputed flags. Used internally by choose_random_move.
             vector<uint8_t> random_move_flag = generate_random_move_flags();
@@ -746,10 +767,10 @@ namespace Learner
             for (int ply = 0; ; ++ply)
             {
                 // Current search depth
-                const int depth = search_depth_min + (int)prng.rand(search_depth_max - search_depth_min + 1);
+                const int depth = params.search_depth_min + (int)prng.rand(params.search_depth_max - params.search_depth_min + 1);
 
                 // Starting search calls init_for_search
-                auto [search_value, search_pv] = Search::search(pos, depth, 1, nodes);
+                auto [search_value, search_pv] = Search::search(pos, depth, 1, params.nodes);
 
                 // This has to be performed after search because it needs to know
                 // rootMoves which are filled in init_for_search.
@@ -762,11 +783,11 @@ namespace Learner
 
                 // Always adjudivate by eval limit.
                 // Also because of this we don't have to check for TB/MATE scores
-                if (abs(search_value) >= eval_limit)
+                if (abs(search_value) >= params.eval_limit)
                 {
                     resign_counter++;
                     if ((should_resign && resign_counter >= 4) || abs(search_value) >= VALUE_KNOWN_WIN) {
-                        flush_psv((search_value >= eval_limit) ? 1 : -1);
+                        flush_psv((search_value >= params.eval_limit) ? 1 : -1);
                         break;
                     }
                 }
@@ -789,7 +810,7 @@ namespace Learner
                 // Discard stuff before write_minply is reached
                 // because it can harm training due to overfitting.
                 // Initial positions would be too common.
-                if (ply >= write_minply && !was_seen_before(pos))
+                if (ply >= params.write_minply && !was_seen_before(pos))
                 {
                     a_psv.emplace_back(PackedSfenValue());
 
@@ -825,6 +846,25 @@ namespace Learner
         sfen_writer.finalize(th.thread_idx());
     }
 
+    void set_gensfen_search_limits()
+    {
+        // About Search::Limits
+        // Be careful because this member variable is global and affects other threads.
+        auto& limits = Search::Limits;
+
+        // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
+        limits.infinite = true;
+
+        // Since PV is an obstacle when displayed, erase it.
+        limits.silent = true;
+
+        // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
+        limits.nodes = 0;
+
+        // depth is also processed by the one passed as an argument of Learner::search().
+        limits.depth = 0;
+    }
+
     // -----------------------------------
     // Command to generate a game record (master thread)
     // -----------------------------------
@@ -832,55 +872,16 @@ namespace Learner
     // Command to generate a game record
     void gen_sfen(Position&, istringstream& is)
     {
-        // number of threads (given by USI setoption)
-        uint32_t thread_num = (uint32_t)Options["Threads"];
-
         // Number of generated game records default = 8 billion phases (Ponanza specification)
         uint64_t loop_max = 8000000000UL;
 
-        // Stop the generation when the evaluation value reaches this value.
-        int eval_limit = 3000;
-
-        // search depth
-        int search_depth_min = 3;
-        int search_depth_max = INT_MIN;
-
-        // Number of nodes to be searched.
-        uint64_t nodes = 0;
-
-        // minimum ply, maximum ply and number of random moves
-        int random_move_minply = 1;
-        int random_move_maxply = 24;
-        int random_move_count = 5;
-
-        // A function to move the random move mainly like Apery
-        // If this is set to 3, the ball will move with a probability of 1/3.
-        int random_move_like_apery = 0;
-
-        // If you search with multipv instead of random move and choose from among them randomly, set random_multi_pv = 1 or more.
-        int random_multi_pv = 0;
-        int random_multi_pv_diff = 32000;
-        int random_multi_pv_depth = INT_MIN;
-
-        // The minimum and maximum ply (number of steps from the initial phase) of the phase to write out.
-        int write_minply = 16;
-        int write_maxply = 400;
-
-        // File name to write
-        string output_file_name = "generated_kifu";
-
-        string token;
-
-        // Save to file in this unit.
-        // File names are serialized like file_1.bin, file_2.bin.
-        uint64_t save_every = UINT64_MAX;
+        MultiThinkGenSfen::Params params;
 
         // Add a random number to the end of the file name.
         bool random_file_name = false;
-
         std::string sfen_format = "binpack";
-        std::string seed;
 
+        string token;
         while (true)
         {
             token = "";
@@ -889,55 +890,51 @@ namespace Learner
                 break;
 
             if (token == "depth")
-                is >> search_depth_min;
+                is >> params.search_depth_min;
             else if (token == "depth2")
-                is >> search_depth_max;
+                is >> params.search_depth_max;
             else if (token == "nodes")
-                is >> nodes;
+                is >> params.nodes;
             else if (token == "loop")
                 is >> loop_max;
             else if (token == "output_file_name")
-                is >> output_file_name;
+                is >> params.output_file_name;
             else if (token == "eval_limit")
-            {
-                is >> eval_limit;
-                // Limit the maximum to a one-stop score. (Otherwise you might not end the loop)
-                eval_limit = std::min(eval_limit, (int)mate_in(2));
-            }
+                is >> params.eval_limit;
             else if (token == "random_move_minply")
-                is >> random_move_minply;
+                is >> params.random_move_minply;
             else if (token == "random_move_maxply")
-                is >> random_move_maxply;
+                is >> params.random_move_maxply;
             else if (token == "random_move_count")
-                is >> random_move_count;
+                is >> params.random_move_count;
             else if (token == "random_move_like_apery")
-                is >> random_move_like_apery;
+                is >> params.random_move_like_apery;
             else if (token == "random_multi_pv")
-                is >> random_multi_pv;
+                is >> params.random_multi_pv;
             else if (token == "random_multi_pv_diff")
-                is >> random_multi_pv_diff;
+                is >> params.random_multi_pv_diff;
             else if (token == "random_multi_pv_depth")
-                is >> random_multi_pv_depth;
+                is >> params.random_multi_pv_depth;
             else if (token == "write_minply")
-                is >> write_minply;
+                is >> params.write_minply;
             else if (token == "write_maxply")
-                is >> write_maxply;
+                is >> params.write_maxply;
             else if (token == "save_every")
-                is >> save_every;
+                is >> params.save_every;
             else if (token == "random_file_name")
                 is >> random_file_name;
             // Accept also the old option name.
             else if (token == "use_draw_in_training_data_generation" || token == "write_out_draw_game_in_training_data_generation")
-                is >> write_out_draw_game_in_training_data_generation;
+                is >> params.write_out_draw_game_in_training_data_generation;
             // Accept also the old option name.
             else if (token == "use_game_draw_adjudication" || token == "detect_draw_by_consecutive_low_score")
-                is >> detect_draw_by_consecutive_low_score;
+                is >> params.detect_draw_by_consecutive_low_score;
             else if (token == "detect_draw_by_insufficient_mating_material")
-                is >> detect_draw_by_insufficient_mating_material;
+                is >> params.detect_draw_by_insufficient_mating_material;
             else if (token == "sfen_format")
                 is >> sfen_format;
             else if (token == "seed")
-                is >> seed;
+                is >> params.seed;
             else if (token == "set_recommended_uci_options")
             {
                 UCI::setoption("Contempt", "0");
@@ -955,26 +952,20 @@ namespace Learner
         if (!sfen_format.empty())
         {
             if (sfen_format == "bin")
-                sfen_output_type = SfenOutputType::Bin;
+                params.sfen_format = SfenOutputType::Bin;
             else if (sfen_format == "binpack")
-                sfen_output_type = SfenOutputType::Binpack;
+                params.sfen_format = SfenOutputType::Binpack;
             else
             {
                 cout << "Unknown sfen format `" << sfen_format << "`. Using bin\n";
             }
         }
 
-        // If search depth2 is not set, leave it the same as search depth.
-        if (search_depth_max == INT_MIN)
-            search_depth_max = search_depth_min;
-        if (random_multi_pv_depth == INT_MIN)
-            random_multi_pv_depth = search_depth_min;
-
         if (random_file_name)
         {
             // Give a random number to output_file_name at this point.
             // Do not use std::random_device().  Because it always the same integers on MinGW.
-            PRNG r(seed);
+            PRNG r(params.seed);
             // Just in case, reassign the random numbers.
             for (int i = 0; i < 10; ++i)
                 r.rand(1);
@@ -983,74 +974,44 @@ namespace Learner
                 ss << std::hex << u;
                 return ss.str();
             };
+
             // I don't want to wear 64bit numbers by accident, so I'next_move going to make a 64bit number 2 just in case.
-            output_file_name = output_file_name + "_" + to_hex(r.rand<uint64_t>()) + to_hex(r.rand<uint64_t>());
+            params.output_file_name += "_" + to_hex(r.rand<uint64_t>()) + to_hex(r.rand<uint64_t>());
         }
 
+        params.enforce_constraints();
+
         std::cout << "gensfen : " << endl
-            << "  search_depth_min = " << search_depth_min << " to " << search_depth_max << endl
-            << "  nodes = " << nodes << endl
+            << "  search_depth_min = " << params.search_depth_min << " to " << params.search_depth_max << endl
+            << "  nodes = " << params.nodes << endl
             << "  loop_max = " << loop_max << endl
-            << "  eval_limit = " << eval_limit << endl
-            << "  thread_num (set by USI setoption) = " << thread_num << endl
-            << "  random_move_minply     = " << random_move_minply << endl
-            << "  random_move_maxply     = " << random_move_maxply << endl
-            << "  random_move_count      = " << random_move_count << endl
-            << "  random_move_like_apery = " << random_move_like_apery << endl
-            << "  random_multi_pv        = " << random_multi_pv << endl
-            << "  random_multi_pv_diff   = " << random_multi_pv_diff << endl
-            << "  random_multi_pv_depth  = " << random_multi_pv_depth << endl
-            << "  write_minply           = " << write_minply << endl
-            << "  write_maxply           = " << write_maxply << endl
-            << "  output_file_name       = " << output_file_name << endl
-            << "  save_every             = " << save_every << endl
+            << "  eval_limit = " << params.eval_limit << endl
+            << "  thread_num (set by USI setoption) = " << params.num_threads << endl
+            << "  random_move_minply     = " << params.random_move_minply << endl
+            << "  random_move_maxply     = " << params.random_move_maxply << endl
+            << "  random_move_count      = " << params.random_move_count << endl
+            << "  random_move_like_apery = " << params.random_move_like_apery << endl
+            << "  random_multi_pv        = " << params.random_multi_pv << endl
+            << "  random_multi_pv_diff   = " << params.random_multi_pv_diff << endl
+            << "  random_multi_pv_depth  = " << params.random_multi_pv_depth << endl
+            << "  write_minply           = " << params.write_minply << endl
+            << "  write_maxply           = " << params.write_maxply << endl
+            << "  output_file_name       = " << params.output_file_name << endl
+            << "  save_every             = " << params.save_every << endl
             << "  random_file_name       = " << random_file_name << endl
-            << "  write_out_draw_game_in_training_data_generation = " << write_out_draw_game_in_training_data_generation << endl
-            << "  detect_draw_by_consecutive_low_score = " << detect_draw_by_consecutive_low_score << endl
-            << "  detect_draw_by_insufficient_mating_material = " << detect_draw_by_insufficient_mating_material << endl;
+            << "  write_out_draw_game_in_training_data_generation = " << params.write_out_draw_game_in_training_data_generation << endl
+            << "  detect_draw_by_consecutive_low_score = " << params.detect_draw_by_consecutive_low_score << endl
+            << "  detect_draw_by_insufficient_mating_material = " << params.detect_draw_by_insufficient_mating_material << endl;
 
         // Show if the training data generator uses NNUE.
         Eval::NNUE::verify_eval_file_loaded();
 
         Threads.main()->ponder = false;
 
-        // About Search::Limits
-        // Be careful because this member variable is global and affects other threads.
-        {
-          auto& limits = Search::Limits;
+        set_gensfen_search_limits();
 
-          // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
-          limits.infinite = true;
-
-          // Since PV is an obstacle when displayed, erase it.
-          limits.silent = true;
-
-          // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
-          limits.nodes = 0;
-
-          // depth is also processed by the one passed as an argument of Learner::search().
-          limits.depth = 0;
-        }
-
-        // Create and execute threads as many as Options["Threads"].
-        {
-            MultiThinkGenSfen multi_think(search_depth_min, search_depth_max, output_file_name, thread_num, save_every, seed);
-            multi_think.nodes = nodes;
-            multi_think.eval_limit = eval_limit;
-            multi_think.random_move_minply = random_move_minply;
-            multi_think.random_move_maxply = random_move_maxply;
-            multi_think.random_move_count = random_move_count;
-            multi_think.random_move_like_apery = random_move_like_apery;
-            multi_think.random_multi_pv = random_multi_pv;
-            multi_think.random_multi_pv_diff = random_multi_pv_diff;
-            multi_think.random_multi_pv_depth = random_multi_pv_depth;
-            multi_think.write_minply = write_minply;
-            multi_think.write_maxply = write_maxply;
-            multi_think.gensfen(loop_max);
-
-            // Since we are joining with the destructor of SfenWriter, please give a message that it has finished after the join
-            // Enclose this in a block because it should be displayed.
-        }
+        MultiThinkGenSfen multi_think(params);
+        multi_think.gensfen(loop_max);
 
         std::cout << "gensfen finished." << endl;
     }

From d77b3d176e6736d4729b00d1a4465943d30ea64c Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 22:30:08 +0200
Subject: [PATCH 269/398] Always flush sfen writer at the end of gensfen and
 when it is destroyed.

---
 src/learn/gensfen.cpp | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index d69fcf53..a4ce5728 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -61,6 +61,8 @@ namespace Learner
 
         ~SfenWriter()
         {
+            flush();
+
             finished = true;
             file_worker_thread.join();
             output_file_stream.reset();
@@ -105,8 +107,16 @@ namespace Learner
             }
         }
 
+        void flush()
+        {
+            for (size_t i = 0; i < sfen_buffers.size(); ++i)
+            {
+                flush(i);
+            }
+        }
+
         // Move what remains in the buffer for your thread to a buffer for writing to a file.
-        void finalize(size_t thread_id)
+        void flush(size_t thread_id)
         {
             std::unique_lock<std::mutex> lk(mutex);
 
@@ -357,6 +367,8 @@ namespace Learner
         });
         Threads.wait_for_workers_finished();
 
+        sfen_writer.flush();
+
         if (limit % REPORT_STATS_EVERY != 0)
         {
             report(limit, limit % REPORT_STATS_EVERY);
@@ -842,8 +854,6 @@ namespace Learner
 
             }
         }
-
-        sfen_writer.finalize(th.thread_idx());
     }
 
     void set_gensfen_search_limits()

From 6d4d20c4be5ce46760de0f51d19148d75aedcfd5 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 22:44:02 +0200
Subject: [PATCH 270/398] Cleaner printing and some renaming.

---
 src/learn/gensfen.cpp | 72 ++++++++++++++++++++++++++-----------------
 src/learn/gensfen.h   |  2 +-
 src/misc.h            |  5 +++
 src/uci.cpp           |  2 +-
 4 files changed, 51 insertions(+), 30 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index a4ce5728..971afd1b 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -49,6 +49,9 @@ namespace Learner
             sfen_buffers_pool.reserve((size_t)thread_num * 10);
             sfen_buffers.resize(thread_num);
 
+            auto out = sync_region_cout.new_region();
+            out << "INFO (sfen_writer): Creating new data file at " << filename_ << endl;
+
             sfen_format = sfen_output_type;
             output_file_stream = create_new_sfen_output(filename_, sfen_format);
             filename = filename_;
@@ -172,7 +175,9 @@ namespace Learner
                             // (Depending on the operation, it may not be necessary.)
                             string new_filename = filename + "_" + std::to_string(n);
                             output_file_stream = create_new_sfen_output(new_filename, sfen_format);
-                            cout << endl << "output sfen file = " << new_filename << endl;
+
+                            auto out = sync_region_cout.new_region();
+                            out << "INFO (sfen_writer): Creating new data file at " << new_filename << endl;
                         }
                     }
                 }
@@ -285,6 +290,8 @@ namespace Learner
                 // Limit the maximum to a one-stop score. (Otherwise you might not end the loop)
                 eval_limit = std::min(eval_limit, (int)mate_in(2));
 
+                save_every = std::max(save_every, REPORT_STATS_EVERY);
+
                 num_threads = Options["Threads"];
             }
         };
@@ -324,6 +331,8 @@ namespace Learner
         // sfen exporter
         SfenWriter sfen_writer;
 
+        SynchronizedRegionLogger::Region out;
+
         vector<Key> hash; // 64MB*sizeof(HASH_KEY) = 512MB
 
         void gensfen_worker(
@@ -500,13 +509,15 @@ namespace Learner
         const auto now_time = now();
         const TimePoint elapsed = now_time - last_stats_report_time + 1;
 
-        sync_cout
+        out
             << endl
             << done << " sfens, "
             << new_done * 1000 / elapsed << " sfens/second, "
             << "at " << now_string() << sync_endl;
 
         last_stats_report_time = now_time;
+
+        out = sync_region_cout.new_region();
     }
 
     void MultiThinkGenSfen::maybe_report(uint64_t done)
@@ -518,11 +529,12 @@ namespace Learner
             if (last_stats_report_time == 0)
             {
                 last_stats_report_time = now();
+                out = sync_region_cout.new_region();
             }
 
             if (done != 0)
             {
-                std::cout << '.';
+                out << '.';
 
                 if (done % REPORT_STATS_EVERY == 0)
                 {
@@ -880,7 +892,7 @@ namespace Learner
     // -----------------------------------
 
     // Command to generate a game record
-    void gen_sfen(Position&, istringstream& is)
+    void gensfen(istringstream& is)
     {
         // Number of generated game records default = 8 billion phases (Ponanza specification)
         uint64_t loop_max = 8000000000UL;
@@ -956,7 +968,7 @@ namespace Learner
                 UCI::setoption("EnableTranspositionTable", "true");
             }
             else
-                cout << "Error! : Illegal token " << token << endl;
+                cout << "ERROR: Ignoring unknown option " << token << endl;
         }
 
         if (!sfen_format.empty())
@@ -967,7 +979,7 @@ namespace Learner
                 params.sfen_format = SfenOutputType::Binpack;
             else
             {
-                cout << "Unknown sfen format `" << sfen_format << "`. Using bin\n";
+                cout << "WARNING: Unknown sfen format `" << sfen_format << "`. Using bin\n";
             }
         }
 
@@ -991,27 +1003,31 @@ namespace Learner
 
         params.enforce_constraints();
 
-        std::cout << "gensfen : " << endl
-            << "  search_depth_min = " << params.search_depth_min << " to " << params.search_depth_max << endl
-            << "  nodes = " << params.nodes << endl
-            << "  loop_max = " << loop_max << endl
-            << "  eval_limit = " << params.eval_limit << endl
-            << "  thread_num (set by USI setoption) = " << params.num_threads << endl
-            << "  random_move_minply     = " << params.random_move_minply << endl
-            << "  random_move_maxply     = " << params.random_move_maxply << endl
-            << "  random_move_count      = " << params.random_move_count << endl
-            << "  random_move_like_apery = " << params.random_move_like_apery << endl
-            << "  random_multi_pv        = " << params.random_multi_pv << endl
-            << "  random_multi_pv_diff   = " << params.random_multi_pv_diff << endl
-            << "  random_multi_pv_depth  = " << params.random_multi_pv_depth << endl
-            << "  write_minply           = " << params.write_minply << endl
-            << "  write_maxply           = " << params.write_maxply << endl
-            << "  output_file_name       = " << params.output_file_name << endl
-            << "  save_every             = " << params.save_every << endl
-            << "  random_file_name       = " << random_file_name << endl
-            << "  write_out_draw_game_in_training_data_generation = " << params.write_out_draw_game_in_training_data_generation << endl
-            << "  detect_draw_by_consecutive_low_score = " << params.detect_draw_by_consecutive_low_score << endl
-            << "  detect_draw_by_insufficient_mating_material = " << params.detect_draw_by_insufficient_mating_material << endl;
+        std::cout << "INFO: Executing gensfen command\n";
+
+        std::cout << "INFO: Parameters:\n";
+        std::cout
+            << "  - search_depth_min       = " << params.search_depth_min << endl
+            << "  - search_depth_max       = " << params.search_depth_max << endl
+            << "  - nodes                  = " << params.nodes << endl
+            << "  - num sfens to generate  = " << loop_max << endl
+            << "  - eval_limit             = " << params.eval_limit << endl
+            << "  - num threads (UCI)      = " << params.num_threads << endl
+            << "  - random_move_minply     = " << params.random_move_minply << endl
+            << "  - random_move_maxply     = " << params.random_move_maxply << endl
+            << "  - random_move_count      = " << params.random_move_count << endl
+            << "  - random_move_like_apery = " << params.random_move_like_apery << endl
+            << "  - random_multi_pv        = " << params.random_multi_pv << endl
+            << "  - random_multi_pv_diff   = " << params.random_multi_pv_diff << endl
+            << "  - random_multi_pv_depth  = " << params.random_multi_pv_depth << endl
+            << "  - write_minply           = " << params.write_minply << endl
+            << "  - write_maxply           = " << params.write_maxply << endl
+            << "  - output_file_name       = " << params.output_file_name << endl
+            << "  - save_every             = " << params.save_every << endl
+            << "  - random_file_name       = " << random_file_name << endl
+            << "  - write_drawn_games      = " << params.write_out_draw_game_in_training_data_generation << endl
+            << "  - draw by low score      = " << params.detect_draw_by_consecutive_low_score << endl
+            << "  - draw by insuff. mat.   = " << params.detect_draw_by_insufficient_mating_material << endl;
 
         // Show if the training data generator uses NNUE.
         Eval::NNUE::verify_eval_file_loaded();
@@ -1023,6 +1039,6 @@ namespace Learner
         MultiThinkGenSfen multi_think(params);
         multi_think.gensfen(loop_max);
 
-        std::cout << "gensfen finished." << endl;
+        std::cout << "INFO: Gensfen finished." << endl;
     }
 }
diff --git a/src/learn/gensfen.h b/src/learn/gensfen.h
index d39e44c9..c0a7c978 100644
--- a/src/learn/gensfen.h
+++ b/src/learn/gensfen.h
@@ -8,7 +8,7 @@
 namespace Learner {
 
     // Automatic generation of teacher position
-    void gen_sfen(Position& pos, std::istringstream& is);
+    void gensfen(std::istringstream& is);
 }
 
 #endif
\ No newline at end of file
diff --git a/src/misc.h b/src/misc.h
index 3e6dc5b0..9f250b6e 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -84,6 +84,11 @@ struct SynchronizedRegionLogger
   {
     friend struct SynchronizedRegionLogger;
 
+    Region() :
+      logger(nullptr), region_id(0), is_held(false)
+    {
+    }
+
     Region(const Region&) = delete;
     Region& operator=(const Region&) = delete;
 
diff --git a/src/uci.cpp b/src/uci.cpp
index 398fd01a..dbef05bf 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -338,7 +338,7 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "eval")     trace_eval(pos);
       else if (token == "compiler") sync_cout << compiler_info() << sync_endl;
 
-      else if (token == "gensfen") Learner::gen_sfen(pos, is);
+      else if (token == "gensfen") Learner::gensfen(is);
       else if (token == "learn") Learner::learn(pos, is);
       else if (token == "convert") Learner::convert(is);
       else if (token == "convert_bin") Learner::convert_bin(is);

From 03abfae41f912f99d0d6c86d7a237971b8266d03 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 22:53:14 +0200
Subject: [PATCH 271/398] Reorder members, renaming.

---
 src/learn/gensfen.cpp | 588 +++++++++++++++++++++---------------------
 1 file changed, 289 insertions(+), 299 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 971afd1b..1dddac5a 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -219,12 +219,8 @@ namespace Learner
         uint64_t sfen_write_count_current_file = 0;
     };
 
-    // -----------------------------------
-    // worker that creates the game record (for each thread)
-    // -----------------------------------
-
     // Class to generate sfen with multiple threads
-    struct MultiThinkGenSfen
+    struct Gensfen
     {
         struct Params
         {
@@ -305,7 +301,7 @@ namespace Learner
         static constexpr uint64_t REPORT_STATS_EVERY = 200000;
         static_assert(REPORT_STATS_EVERY % REPORT_DOT_EVERY == 0);
 
-        MultiThinkGenSfen(
+        Gensfen(
             const Params& prm
         ) :
             params(prm),
@@ -318,7 +314,7 @@ namespace Learner
             std::cout << prng << std::endl;
         }
 
-        void gensfen(uint64_t limit);
+        void generate(uint64_t limit);
 
     private:
         Params params;
@@ -335,22 +331,26 @@ namespace Learner
 
         vector<Key> hash; // 64MB*sizeof(HASH_KEY) = 512MB
 
-        void gensfen_worker(
+        static void set_gensfen_search_limits();
+
+        void generate_worker(
             Thread& th,
             std::atomic<uint64_t>& counter,
             uint64_t limit);
 
+        bool was_seen_before(const Position& pos);
+
         optional<int8_t> get_current_game_result(
             Position& pos,
             const vector<int>& move_hist_scores) const;
 
         vector<uint8_t> generate_random_move_flags();
 
-        bool was_seen_before(const Position& pos);
-
-        void report(uint64_t done, uint64_t new_done);
-
-        void maybe_report(uint64_t done);
+        optional<Move> choose_random_move(
+            Position& pos,
+            std::vector<uint8_t>& random_move_flag,
+            int ply,
+            int& random_move_c);
 
         bool commit_psv(
             Thread& th,
@@ -359,20 +359,39 @@ namespace Learner
             std::atomic<uint64_t>& counter,
             uint64_t limit);
 
-        optional<Move> choose_random_move(
-            Position& pos,
-            std::vector<uint8_t>& random_move_flag,
-            int ply,
-            int& random_move_c);
+        void report(uint64_t done, uint64_t new_done);
+
+        void maybe_report(uint64_t done);
     };
 
-    void MultiThinkGenSfen::gensfen(uint64_t limit)
+    void Gensfen::set_gensfen_search_limits()
+    {
+        // About Search::Limits
+        // Be careful because this member variable is global and affects other threads.
+        auto& limits = Search::Limits;
+
+        // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
+        limits.infinite = true;
+
+        // Since PV is an obstacle when displayed, erase it.
+        limits.silent = true;
+
+        // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
+        limits.nodes = 0;
+
+        // depth is also processed by the one passed as an argument of Learner::search().
+        limits.depth = 0;
+    }
+
+    void Gensfen::generate(uint64_t limit)
     {
         last_stats_report_time = 0;
 
+        set_gensfen_search_limits();
+
         std::atomic<uint64_t> counter{0};
         Threads.execute_with_workers([&counter, limit, this](Thread& th) {
-            gensfen_worker(th, counter, limit);
+            generate_worker(th, counter, limit);
         });
         Threads.wait_for_workers_finished();
 
@@ -386,7 +405,154 @@ namespace Learner
         std::cout << std::endl;
     }
 
-    optional<int8_t> MultiThinkGenSfen::get_current_game_result(
+    void Gensfen::generate_worker(
+        Thread& th,
+        std::atomic<uint64_t>& counter,
+        uint64_t limit)
+    {
+        // For the time being, it will be treated as a draw
+        // at the maximum number of steps to write.
+        // Maximum StateInfo + Search PV to advance to leaf buffer
+        std::vector<StateInfo, AlignedAllocator<StateInfo>> states(
+            params.write_maxply + MAX_PLY /* == search_depth_min + α */);
+
+        StateInfo si;
+
+        // end flag
+        bool quit = false;
+
+        // repeat until the specified number of times
+        while (!quit)
+        {
+            // It is necessary to set a dependent thread for Position.
+            // When parallelizing, Threads (since this is a vector<Thread*>,
+            // Do the same for up to Threads[0]...Threads[thread_num-1].
+            auto& pos = th.rootPos;
+            pos.set(StartFEN, false, &si, &th);
+
+            int resign_counter = 0;
+            bool should_resign = prng.rand(10) > 1;
+            // Vector for holding the sfens in the current simulated game.
+            PSVector packed_sfens;
+            packed_sfens.reserve(params.write_maxply + MAX_PLY);
+
+            // Precomputed flags. Used internally by choose_random_move.
+            vector<uint8_t> random_move_flag = generate_random_move_flags();
+
+            // A counter that keeps track of the number of random moves
+            // When random_move_minply == -1, random moves are
+            // performed continuously, so use it at this time.
+            // Used internally by choose_random_move.
+            int actual_random_move_count = 0;
+
+            // Save history of move scores for adjudication
+            vector<int> move_hist_scores;
+
+            auto flush_psv = [&](int8_t result) {
+                quit = commit_psv(th, packed_sfens, result, counter, limit);
+            };
+
+            for (int ply = 0; ; ++ply)
+            {
+                // Current search depth
+                const int depth = params.search_depth_min + (int)prng.rand(params.search_depth_max - params.search_depth_min + 1);
+
+                // Starting search calls init_for_search
+                auto [search_value, search_pv] = Search::search(pos, depth, 1, params.nodes);
+
+                // This has to be performed after search because it needs to know
+                // rootMoves which are filled in init_for_search.
+                const auto result = get_current_game_result(pos, move_hist_scores);
+                if (result.has_value())
+                {
+                    flush_psv(result.value());
+                    break;
+                }
+
+                // Always adjudivate by eval limit.
+                // Also because of this we don't have to check for TB/MATE scores
+                if (abs(search_value) >= params.eval_limit)
+                {
+                    resign_counter++;
+                    if ((should_resign && resign_counter >= 4) || abs(search_value) >= VALUE_KNOWN_WIN) {
+                        flush_psv((search_value >= params.eval_limit) ? 1 : -1);
+                        break;
+                    }
+                }
+                else
+                {
+                    resign_counter = 0;
+                }
+
+                // In case there is no PV and the game was not ended here
+                // there is nothing we can do, we can't continue the game,
+                // we don't know the result, so discard this game.
+                if (search_pv.empty())
+                {
+                    break;
+                }
+
+                // Save the move score for adjudication.
+                move_hist_scores.push_back(search_value);
+
+                // Discard stuff before write_minply is reached
+                // because it can harm training due to overfitting.
+                // Initial positions would be too common.
+                if (ply >= params.write_minply && !was_seen_before(pos))
+                {
+                    packed_sfens.emplace_back(PackedSfenValue());
+
+                    auto& psv = packed_sfens.back();
+
+                    // Here we only write the position data.
+                    // Result is added after the whole game is done.
+                    pos.sfen_pack(psv.sfen);
+
+                    psv.score = search_value;
+                    psv.gamePly = ply;
+                    psv.move = search_pv[0];
+                }
+
+                // Update the next move according to best search result or random move.
+                auto random_move = choose_random_move(pos, random_move_flag, ply, actual_random_move_count);
+                const Move next_move = random_move.has_value() ? *random_move : search_pv[0];
+
+                // We don't have the whole game yet, but it ended,
+                // so the writing process ends and the next game starts.
+                // This shouldn't really happen.
+                if (!is_ok(next_move))
+                {
+                    break;
+                }
+
+                // Do move.
+                pos.do_move(next_move, states[ply]);
+            }
+        }
+    }
+
+    bool Gensfen::was_seen_before(const Position& pos)
+    {
+        // Look into the position hashtable to see if the same
+        // position was seen before.
+        // This is a good heuristic to exlude already seen
+        // positions without many false positives.
+        auto key = pos.key();
+        auto hash_index = (size_t)(key & (GENSFEN_HASH_SIZE - 1));
+        auto old_key = hash[hash_index];
+        if (key == old_key)
+        {
+            return true;
+        }
+        else
+        {
+            // Replace with the current key.
+            hash[hash_index] = key;
+            return false;
+        }
+    }
+
+    optional<int8_t> Gensfen::get_current_game_result(
         Position& pos,
         const vector<int>& move_hist_scores) const
     {
@@ -504,94 +670,44 @@ namespace Learner
         return nullopt;
     }
 
-    void MultiThinkGenSfen::report(uint64_t done, uint64_t new_done)
+    vector<uint8_t> Gensfen::generate_random_move_flags()
     {
-        const auto now_time = now();
-        const TimePoint elapsed = now_time - last_stats_report_time + 1;
+        vector<uint8_t> random_move_flag;
 
-        out
-            << endl
-            << done << " sfens, "
-            << new_done * 1000 / elapsed << " sfens/second, "
-            << "at " << now_string() << sync_endl;
+        // Depending on random move selection parameters setup
+        // the array of flags that indicates whether a random move
+        // be taken at a given ply.
 
-        last_stats_report_time = now_time;
+        // Make an array like a[0] = 0 ,a[1] = 1, ...
+        // Fisher-Yates shuffle and take out the first N items.
+        // Actually, I only want N pieces, so I only need
+        // to shuffle the first N pieces with Fisher-Yates.
 
-        out = sync_region_cout.new_region();
+        vector<int> a;
+        a.reserve((size_t)params.random_move_maxply);
+
+        // random_move_minply ,random_move_maxply is specified by 1 origin,
+        // Note that we are handling 0 origin here.
+        for (int i = std::max(params.random_move_minply - 1, 0); i < params.random_move_maxply; ++i)
+        {
+            a.push_back(i);
+        }
+
+        // In case of Apery random move, insert() may be called random_move_count times.
+        // Reserve only the size considering it.
+        random_move_flag.resize((size_t)params.random_move_maxply + params.random_move_count);
+
+        // A random move that exceeds the size() of a[] cannot be applied, so limit it.
+        for (int i = 0; i < std::min(params.random_move_count, (int)a.size()); ++i)
+        {
+            swap(a[i], a[prng.rand((uint64_t)a.size() - i) + i]);
+            random_move_flag[a[i]] = true;
+        }
+
+        return random_move_flag;
     }
 
-    void MultiThinkGenSfen::maybe_report(uint64_t done)
-    {
-        if (done % REPORT_DOT_EVERY == 0)
-        {
-            std::lock_guard lock(stats_mutex);
-
-            if (last_stats_report_time == 0)
-            {
-                last_stats_report_time = now();
-                out = sync_region_cout.new_region();
-            }
-
-            if (done != 0)
-            {
-                out << '.';
-
-                if (done % REPORT_STATS_EVERY == 0)
-                {
-                    report(done, REPORT_STATS_EVERY);
-                }
-            }
-        }
-    }
-
-    // Write out the phases loaded in sfens to a file.
-    // lastTurnIsWin: win/loss in the next phase after the final phase in sfens
-    // 1 when winning. -1 when losing. Pass 0 for a draw.
-    // Return value: true if the specified number of
-    // sfens has already been reached and the process ends.
-    bool MultiThinkGenSfen::commit_psv(
-        Thread& th,
-        PSVector& sfens,
-        int8_t lastTurnIsWin,
-        std::atomic<uint64_t>& counter,
-        uint64_t limit)
-    {
-        if (!params.write_out_draw_game_in_training_data_generation && lastTurnIsWin == 0)
-        {
-            // We didn't write anything so why quit.
-            return false;
-        }
-
-        int8_t is_win = lastTurnIsWin;
-
-        // From the final stage (one step before) to the first stage, give information on the outcome of the game for each stage.
-        // The phases stored in sfens are assumed to be continuous (in order).
-        for (auto it = sfens.rbegin(); it != sfens.rend(); ++it)
-        {
-            // If is_win == 0 (draw), multiply by -1 and it will remain 0 (draw)
-            is_win = -is_win;
-            it->game_result = is_win;
-        }
-
-        // Write sfens in move order to make potential compression easier
-        for (auto& sfen : sfens)
-        {
-            // Return true if there is already enough data generated.
-            const auto iter = counter.fetch_add(1);
-            if (iter >= limit)
-                return true;
-
-            // because `iter` was done, now we do one more
-            maybe_report(iter + 1);
-
-            // Write out one sfen.
-            sfen_writer.write(th.thread_idx(), sfen);
-        }
-
-        return false;
-    }
-
-    optional<Move> MultiThinkGenSfen::choose_random_move(
+    optional<Move> Gensfen::choose_random_move(
         Position& pos,
         std::vector<uint8_t>& random_move_flag,
         int ply,
@@ -682,222 +798,98 @@ namespace Learner
         return random_move;
     }
 
-    vector<uint8_t> MultiThinkGenSfen::generate_random_move_flags()
-    {
-        vector<uint8_t> random_move_flag;
-
-        // Depending on random move selection parameters setup
-        // the array of flags that indicates whether a random move
-        // be taken at a given ply.
-
-        // Make an array like a[0] = 0 ,a[1] = 1, ...
-        // Fisher-Yates shuffle and take out the first N items.
-        // Actually, I only want N pieces, so I only need
-        // to shuffle the first N pieces with Fisher-Yates.
-
-        vector<int> a;
-        a.reserve((size_t)params.random_move_maxply);
-
-        // random_move_minply ,random_move_maxply is specified by 1 origin,
-        // Note that we are handling 0 origin here.
-        for (int i = std::max(params.random_move_minply - 1, 0); i < params.random_move_maxply; ++i)
-        {
-            a.push_back(i);
-        }
-
-        // In case of Apery random move, insert() may be called random_move_count times.
-        // Reserve only the size considering it.
-        random_move_flag.resize((size_t)params.random_move_maxply + params.random_move_count);
-
-        // A random move that exceeds the size() of a[] cannot be applied, so limit it.
-        for (int i = 0; i < std::min(params.random_move_count, (int)a.size()); ++i)
-        {
-            swap(a[i], a[prng.rand((uint64_t)a.size() - i) + i]);
-            random_move_flag[a[i]] = true;
-        }
-
-        return random_move_flag;
-    }
-
-    bool MultiThinkGenSfen::was_seen_before(const Position& pos)
-    {
-        // Look into the position hashtable to see if the same
-        // position was seen before.
-        // This is a good heuristic to exlude already seen
-        // positions without many false positives.
-        auto key = pos.key();
-        auto hash_index = (size_t)(key & (GENSFEN_HASH_SIZE - 1));
-        auto old_key = hash[hash_index];
-        if (key == old_key)
-        {
-            return true;
-        }
-        else
-        {
-            // Replace with the current key.
-            hash[hash_index] = key;
-            return false;
-        }
-    }
-
-    // thread_id = 0..Threads.size()-1
-    void MultiThinkGenSfen::gensfen_worker(
+    // Write out the phases loaded in sfens to a file.
+    // result: win/loss in the next phase after the final phase in sfens
+    // 1 when winning. -1 when losing. Pass 0 for a draw.
+    // Return value: true if the specified number of
+    // sfens has already been reached and the process ends.
+    bool Gensfen::commit_psv(
         Thread& th,
+        PSVector& sfens,
+        int8_t result,
         std::atomic<uint64_t>& counter,
         uint64_t limit)
     {
-        // For the time being, it will be treated as a draw
-        // at the maximum number of steps to write.
-        // Maximum StateInfo + Search PV to advance to leaf buffer
-        std::vector<StateInfo, AlignedAllocator<StateInfo>> states(
-            params.write_maxply + MAX_PLY /* == search_depth_min + α */);
-
-        StateInfo si;
-
-        // end flag
-        bool quit = false;
-
-        // repeat until the specified number of times
-        while (!quit)
+        if (!params.write_out_draw_game_in_training_data_generation && result == 0)
         {
-            // It is necessary to set a dependent thread for Position.
-            // When parallelizing, Threads (since this is a vector<Thread*>,
-            // Do the same for up to Threads[0]...Threads[thread_num-1].
-            auto& pos = th.rootPos;
-            pos.set(StartFEN, false, &si, &th);
+            // We didn't write anything so why quit.
+            return false;
+        }
 
-            int resign_counter = 0;
-            bool should_resign = prng.rand(10) > 1;
-            // Vector for holding the sfens in the current simulated game.
-            PSVector a_psv;
-            a_psv.reserve(params.write_maxply + MAX_PLY);
+        // From the final stage (one step before) to the first stage, give information on the outcome of the game for each stage.
+        // The phases stored in sfens are assumed to be continuous (in order).
+        for (auto it = sfens.rbegin(); it != sfens.rend(); ++it)
+        {
+            // If is_win == 0 (draw), multiply by -1 and it will remain 0 (draw)
+            result = -result;
+            it->game_result = result;
+        }
 
-            // Precomputed flags. Used internally by choose_random_move.
-            vector<uint8_t> random_move_flag = generate_random_move_flags();
+        // Write sfens in move order to make potential compression easier
+        for (auto& sfen : sfens)
+        {
+            // Return true if there is already enough data generated.
+            const auto iter = counter.fetch_add(1);
+            if (iter >= limit)
+                return true;
 
-            // A counter that keeps track of the number of random moves
-            // When random_move_minply == -1, random moves are
-            // performed continuously, so use it at this time.
-            // Used internally by choose_random_move.
-            int actual_random_move_count = 0;
+            // because `iter` was done, now we do one more
+            maybe_report(iter + 1);
 
-            // Save history of move scores for adjudication
-            vector<int> move_hist_scores;
+            // Write out one sfen.
+            sfen_writer.write(th.thread_idx(), sfen);
+        }
 
-            auto flush_psv = [&](int8_t result) {
-                quit = commit_psv(th, a_psv, result, counter, limit);
-            };
+        return false;
+    }
 
-            for (int ply = 0; ; ++ply)
+    void Gensfen::report(uint64_t done, uint64_t new_done)
+    {
+        const auto now_time = now();
+        const TimePoint elapsed = now_time - last_stats_report_time + 1;
+
+        out
+            << endl
+            << done << " sfens, "
+            << new_done * 1000 / elapsed << " sfens/second, "
+            << "at " << now_string() << sync_endl;
+
+        last_stats_report_time = now_time;
+
+        out = sync_region_cout.new_region();
+    }
+
+    void Gensfen::maybe_report(uint64_t done)
+    {
+        if (done % REPORT_DOT_EVERY == 0)
+        {
+            std::lock_guard lock(stats_mutex);
+
+            if (last_stats_report_time == 0)
             {
-                // Current search depth
-                const int depth = params.search_depth_min + (int)prng.rand(params.search_depth_max - params.search_depth_min + 1);
+                last_stats_report_time = now();
+                out = sync_region_cout.new_region();
+            }
 
-                // Starting search calls init_for_search
-                auto [search_value, search_pv] = Search::search(pos, depth, 1, params.nodes);
+            if (done != 0)
+            {
+                out << '.';
 
-                // This has to be performed after search because it needs to know
-                // rootMoves which are filled in init_for_search.
-                const auto result = get_current_game_result(pos, move_hist_scores);
-                if (result.has_value())
+                if (done % REPORT_STATS_EVERY == 0)
                 {
-                    flush_psv(result.value());
-                    break;
+                    report(done, REPORT_STATS_EVERY);
                 }
-
-                // Always adjudivate by eval limit.
-                // Also because of this we don't have to check for TB/MATE scores
-                if (abs(search_value) >= params.eval_limit)
-                {
-                    resign_counter++;
-                    if ((should_resign && resign_counter >= 4) || abs(search_value) >= VALUE_KNOWN_WIN) {
-                        flush_psv((search_value >= params.eval_limit) ? 1 : -1);
-                        break;
-                    }
-                }
-                else
-                {
-                    resign_counter = 0;
-                }
-
-                // In case there is no PV and the game was not ended here
-                // there is nothing we can do, we can't continue the game,
-                // we don't know the result, so discard this game.
-                if (search_pv.empty())
-                {
-                    break;
-                }
-
-                // Save the move score for adjudication.
-                move_hist_scores.push_back(search_value);
-
-                // Discard stuff before write_minply is reached
-                // because it can harm training due to overfitting.
-                // Initial positions would be too common.
-                if (ply >= params.write_minply && !was_seen_before(pos))
-                {
-                    a_psv.emplace_back(PackedSfenValue());
-
-                    auto& psv = a_psv.back();
-
-                    // Here we only write the position data.
-                    // Result is added after the whole game is done.
-                    pos.sfen_pack(psv.sfen);
-
-                    psv.score = search_value;
-                    psv.gamePly = ply;
-                    psv.move = search_pv[0];
-                }
-
-                // Update the next move according to best search result or random move.
-                auto random_move = choose_random_move(pos, random_move_flag, ply, actual_random_move_count);
-                const Move next_move = random_move.has_value() ? *random_move : search_pv[0];
-
-                // We don't have the whole game yet, but it ended,
-                // so the writing process ends and the next game starts.
-                // This shouldn't really happen.
-                if (!is_ok(next_move))
-                {
-                    break;
-                }
-
-                // Do move.
-                pos.do_move(next_move, states[ply]);
-
             }
         }
     }
 
-    void set_gensfen_search_limits()
-    {
-        // About Search::Limits
-        // Be careful because this member variable is global and affects other threads.
-        auto& limits = Search::Limits;
-
-        // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
-        limits.infinite = true;
-
-        // Since PV is an obstacle when displayed, erase it.
-        limits.silent = true;
-
-        // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
-        limits.nodes = 0;
-
-        // depth is also processed by the one passed as an argument of Learner::search().
-        limits.depth = 0;
-    }
-
-    // -----------------------------------
-    // Command to generate a game record (master thread)
-    // -----------------------------------
-
     // Command to generate a game record
     void gensfen(istringstream& is)
     {
         // Number of generated game records default = 8 billion phases (Ponanza specification)
         uint64_t loop_max = 8000000000UL;
 
-        MultiThinkGenSfen::Params params;
+        Gensfen::Params params;
 
         // Add a random number to the end of the file name.
         bool random_file_name = false;
@@ -978,9 +970,7 @@ namespace Learner
             else if (sfen_format == "binpack")
                 params.sfen_format = SfenOutputType::Binpack;
             else
-            {
                 cout << "WARNING: Unknown sfen format `" << sfen_format << "`. Using bin\n";
-            }
         }
 
         if (random_file_name)
@@ -988,9 +978,11 @@ namespace Learner
             // Give a random number to output_file_name at this point.
             // Do not use std::random_device().  Because it always the same integers on MinGW.
             PRNG r(params.seed);
+
             // Just in case, reassign the random numbers.
             for (int i = 0; i < 10; ++i)
                 r.rand(1);
+
             auto to_hex = [](uint64_t u) {
                 std::stringstream ss;
                 ss << std::hex << u;
@@ -1034,10 +1026,8 @@ namespace Learner
 
         Threads.main()->ponder = false;
 
-        set_gensfen_search_limits();
-
-        MultiThinkGenSfen multi_think(params);
-        multi_think.gensfen(loop_max);
+        Gensfen gensfen(params);
+        gensfen.generate(loop_max);
 
         std::cout << "INFO: Gensfen finished." << endl;
     }

From 65e443954a14bb38b7e68c7827c148313fc78176 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 25 Oct 2020 19:14:19 +0100
Subject: [PATCH 272/398] Update expected gensfen finished responses.

---
 tests/instrumented_learn.sh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
index 50b6e4ae..07f5f98b 100755
--- a/tests/instrumented_learn.sh
+++ b/tests/instrumented_learn.sh
@@ -79,11 +79,11 @@ cat << EOF > gensfen01.exp
  send "setoption name Use NNUE value false\n"
  send "isready\n"
  send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.bin sfen_format bin\n"
- expect "gensfen finished."
+ expect "INFO: Gensfen finished."
  send "convert_plain targetfile training_data/training_data.bin output_file_name training_data.txt\n"
  expect "all done"
  send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.binpack sfen_format binpack\n"
- expect "gensfen finished."
+ expect "INFO: Gensfen finished."
 
  send "quit\n"
  expect eof
@@ -105,9 +105,9 @@ cat << EOF > gensfen02.exp
  send "setoption name Use NNUE value true\n"
  send "isready\n"
  send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/validation_data.bin sfen_format bin\n"
- expect "gensfen finished."
+ expect "INFO: Gensfen finished."
  send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/validation_data.binpack sfen_format binpack\n"
- expect "gensfen finished."
+ expect "INFO: Gensfen finished."
 
  send "quit\n"
  expect eof
@@ -129,7 +129,7 @@ cat << EOF > learn01.exp
  send "isready\n"
  send "learn targetdir training_data epochs 1 sfen_read_size 100 thread_buffer_size 10 batchsize 100 use_draw_in_training 1 use_draw_in_validation 1 lr 1 eval_limit 32000 nn_batch_size 30 newbob_decay 0.5 eval_save_interval 30 loss_output_interval 10 validation_set_file_name validation_data/validation_data.bin\n"
 
- expect "INFO (save_eval): Saving current evaluation file in"
+ expect "INFO (save_eval): Finished saving evaluation file in"
 
  send "quit\n"
  expect eof

From e515f1f61f880caddef0cf7a09a767d0204d0dd4 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 22:56:42 +0200
Subject: [PATCH 273/398] Move SfenWriter to a separate file

---
 src/learn/gensfen.cpp   | 186 +-----------------------------------
 src/learn/sfen_writer.h | 206 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 208 insertions(+), 184 deletions(-)
 create mode 100644 src/learn/sfen_writer.h

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 1dddac5a..4accb882 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -1,7 +1,7 @@
 ﻿#include "gensfen.h"
 
+#include "sfen_writer.h"
 #include "packed_sfen.h"
-#include "sfen_stream.h"
 
 #include "misc.h"
 #include "position.h"
@@ -16,6 +16,7 @@
 
 #include "syzygy/tbprobe.h"
 
+#include <atomic>
 #include <chrono>
 #include <climits>
 #include <cmath>
@@ -28,7 +29,6 @@
 #include <memory>
 #include <optional>
 #include <random>
-#include <regex>
 #include <shared_mutex>
 #include <sstream>
 #include <unordered_set>
@@ -37,188 +37,6 @@ using namespace std;
 
 namespace Learner
 {
-    // Helper class for exporting Sfen
-    struct SfenWriter
-    {
-        // Amount of sfens required to flush the buffer.
-        static constexpr size_t SFEN_WRITE_SIZE = 5000;
-
-        // File name to write and number of threads to create
-        SfenWriter(string filename_, int thread_num, uint64_t save_count, SfenOutputType sfen_output_type)
-        {
-            sfen_buffers_pool.reserve((size_t)thread_num * 10);
-            sfen_buffers.resize(thread_num);
-
-            auto out = sync_region_cout.new_region();
-            out << "INFO (sfen_writer): Creating new data file at " << filename_ << endl;
-
-            sfen_format = sfen_output_type;
-            output_file_stream = create_new_sfen_output(filename_, sfen_format);
-            filename = filename_;
-            save_every = save_count;
-
-            finished = false;
-
-            file_worker_thread = std::thread([&] { this->file_write_worker(); });
-        }
-
-        ~SfenWriter()
-        {
-            flush();
-
-            finished = true;
-            file_worker_thread.join();
-            output_file_stream.reset();
-
-#if !defined(NDEBUG)
-            {
-                // All buffers should be empty since file_worker_thread
-                // should have written everything before exiting.
-                for (const auto& p : sfen_buffers) { assert(p == nullptr); (void)p ; }
-                assert(sfen_buffers_pool.empty());
-            }
-#endif
-        }
-
-        void write(size_t thread_id, const PackedSfenValue& psv)
-        {
-            // We have a buffer for each thread and add it there.
-            // If the buffer overflows, write it to a file.
-
-            // This buffer is prepared for each thread.
-            auto& buf = sfen_buffers[thread_id];
-
-            // Secure since there is no buf at the first time
-            // and immediately after writing the thread buffer.
-            if (!buf)
-            {
-                buf = std::make_unique<PSVector>();
-                buf->reserve(SFEN_WRITE_SIZE);
-            }
-
-            // Buffer is exclusive to this thread.
-            // There is no need for a critical section.
-            buf->push_back(psv);
-
-            if (buf->size() >= SFEN_WRITE_SIZE)
-            {
-                // If you load it in sfen_buffers_pool, the worker will do the rest.
-
-                // Critical section since sfen_buffers_pool is shared among threads.
-                std::unique_lock<std::mutex> lk(mutex);
-                sfen_buffers_pool.emplace_back(std::move(buf));
-            }
-        }
-
-        void flush()
-        {
-            for (size_t i = 0; i < sfen_buffers.size(); ++i)
-            {
-                flush(i);
-            }
-        }
-
-        // Move what remains in the buffer for your thread to a buffer for writing to a file.
-        void flush(size_t thread_id)
-        {
-            std::unique_lock<std::mutex> lk(mutex);
-
-            auto& buf = sfen_buffers[thread_id];
-
-            // There is a case that buf==nullptr, so that check is necessary.
-            if (buf && buf->size() != 0)
-            {
-                sfen_buffers_pool.emplace_back(std::move(buf));
-            }
-        }
-
-        // Dedicated thread to write to file
-        void file_write_worker()
-        {
-            while (!finished || sfen_buffers_pool.size())
-            {
-                vector<std::unique_ptr<PSVector>> buffers;
-                {
-                    std::unique_lock<std::mutex> lk(mutex);
-
-                    // Atomically swap take the filled buffers and
-                    // create a new buffer pool for threads to fill.
-                    buffers = std::move(sfen_buffers_pool);
-                    sfen_buffers_pool = std::vector<std::unique_ptr<PSVector>>();
-                }
-
-                if (!buffers.size())
-                {
-                    // Poor man's condition variable.
-                    sleep(100);
-                }
-                else
-                {
-                    for (auto& buf : buffers)
-                    {
-                        output_file_stream->write(*buf);
-
-                        sfen_write_count += buf->size();
-
-                        // Add the processed number here, and if it exceeds save_every,
-                        // change the file name and reset this counter.
-                        sfen_write_count_current_file += buf->size();
-                        if (sfen_write_count_current_file >= save_every)
-                        {
-                            sfen_write_count_current_file = 0;
-
-                            // Sequential number attached to the file
-                            int n = (int)(sfen_write_count / save_every);
-
-                            // Rename the file and open it again.
-                            // Add ios::app in consideration of overwriting.
-                            // (Depending on the operation, it may not be necessary.)
-                            string new_filename = filename + "_" + std::to_string(n);
-                            output_file_stream = create_new_sfen_output(new_filename, sfen_format);
-
-                            auto out = sync_region_cout.new_region();
-                            out << "INFO (sfen_writer): Creating new data file at " << new_filename << endl;
-                        }
-                    }
-                }
-            }
-        }
-
-    private:
-
-        std::unique_ptr<BasicSfenOutputStream> output_file_stream;
-
-        // A new net is saved after every save_every sfens are processed.
-        uint64_t save_every = std::numeric_limits<uint64_t>::max();
-
-        // File name passed in the constructor
-        std::string filename;
-
-        // Thread to write to the file
-        std::thread file_worker_thread;
-
-        // Flag that all threads have finished
-        atomic<bool> finished;
-
-        SfenOutputType sfen_format;
-
-        // buffer before writing to file
-        // sfen_buffers is the buffer for each thread
-        // sfen_buffers_pool is a buffer for writing.
-        // After loading the phase in the former buffer by SFEN_WRITE_SIZE,
-        // transfer it to the latter.
-        std::vector<std::unique_ptr<PSVector>> sfen_buffers;
-        std::vector<std::unique_ptr<PSVector>> sfen_buffers_pool;
-
-        // Mutex required to access sfen_buffers_pool
-        std::mutex mutex;
-
-        // Number of sfens written in total, and the
-        // number of sfens written in the current file.
-        uint64_t sfen_write_count = 0;
-        uint64_t sfen_write_count_current_file = 0;
-    };
-
     // Class to generate sfen with multiple threads
     struct Gensfen
     {
diff --git a/src/learn/sfen_writer.h b/src/learn/sfen_writer.h
new file mode 100644
index 00000000..1bbd916c
--- /dev/null
+++ b/src/learn/sfen_writer.h
@@ -0,0 +1,206 @@
+#include "packed_sfen.h"
+#include "sfen_stream.h"
+
+#include "misc.h"
+
+#include "extra/nnue_data_binpack_format.h"
+
+#include "syzygy/tbprobe.h"
+
+#include <cstring>
+#include <filesystem>
+#include <fstream>
+#include <limits>
+#include <list>
+#include <memory>
+#include <optional>
+#include <shared_mutex>
+#include <thread>
+#include <atomic>
+
+using namespace std;
+
+namespace Learner {
+
+    // Helper class for exporting Sfen
+    struct SfenWriter
+    {
+        // Amount of sfens required to flush the buffer.
+        static constexpr size_t SFEN_WRITE_SIZE = 5000;
+
+        // File name to write and number of threads to create
+        SfenWriter(string filename_, int thread_num, uint64_t save_count, SfenOutputType sfen_output_type)
+        {
+            sfen_buffers_pool.reserve((size_t)thread_num * 10);
+            sfen_buffers.resize(thread_num);
+
+            auto out = sync_region_cout.new_region();
+            out << "INFO (sfen_writer): Creating new data file at " << filename_ << endl;
+
+            sfen_format = sfen_output_type;
+            output_file_stream = create_new_sfen_output(filename_, sfen_format);
+            filename = filename_;
+            save_every = save_count;
+
+            finished = false;
+
+            file_worker_thread = std::thread([&] { this->file_write_worker(); });
+        }
+
+        ~SfenWriter()
+        {
+            flush();
+
+            finished = true;
+            file_worker_thread.join();
+            output_file_stream.reset();
+
+#if !defined(NDEBUG)
+            {
+                // All buffers should be empty since file_worker_thread
+                // should have written everything before exiting.
+                for (const auto& p : sfen_buffers) { assert(p == nullptr); (void)p ; }
+                assert(sfen_buffers_pool.empty());
+            }
+#endif
+        }
+
+        void write(size_t thread_id, const PackedSfenValue& psv)
+        {
+            // We have a buffer for each thread and add it there.
+            // If the buffer overflows, write it to a file.
+
+            // This buffer is prepared for each thread.
+            auto& buf = sfen_buffers[thread_id];
+
+            // Secure since there is no buf at the first time
+            // and immediately after writing the thread buffer.
+            if (!buf)
+            {
+                buf = std::make_unique<PSVector>();
+                buf->reserve(SFEN_WRITE_SIZE);
+            }
+
+            // Buffer is exclusive to this thread.
+            // There is no need for a critical section.
+            buf->push_back(psv);
+
+            if (buf->size() >= SFEN_WRITE_SIZE)
+            {
+                // If you load it in sfen_buffers_pool, the worker will do the rest.
+
+                // Critical section since sfen_buffers_pool is shared among threads.
+                std::unique_lock<std::mutex> lk(mutex);
+                sfen_buffers_pool.emplace_back(std::move(buf));
+            }
+        }
+
+        void flush()
+        {
+            for (size_t i = 0; i < sfen_buffers.size(); ++i)
+            {
+                flush(i);
+            }
+        }
+
+        // Move what remains in the buffer for your thread to a buffer for writing to a file.
+        void flush(size_t thread_id)
+        {
+            std::unique_lock<std::mutex> lk(mutex);
+
+            auto& buf = sfen_buffers[thread_id];
+
+            // There is a case that buf==nullptr, so that check is necessary.
+            if (buf && buf->size() != 0)
+            {
+                sfen_buffers_pool.emplace_back(std::move(buf));
+            }
+        }
+
+        // Dedicated thread to write to file
+        void file_write_worker()
+        {
+            while (!finished || sfen_buffers_pool.size())
+            {
+                vector<std::unique_ptr<PSVector>> buffers;
+                {
+                    std::unique_lock<std::mutex> lk(mutex);
+
+                    // Atomically swap take the filled buffers and
+                    // create a new buffer pool for threads to fill.
+                    buffers = std::move(sfen_buffers_pool);
+                    sfen_buffers_pool = std::vector<std::unique_ptr<PSVector>>();
+                }
+
+                if (!buffers.size())
+                {
+                    // Poor man's condition variable.
+                    sleep(100);
+                }
+                else
+                {
+                    for (auto& buf : buffers)
+                    {
+                        output_file_stream->write(*buf);
+
+                        sfen_write_count += buf->size();
+
+                        // Add the processed number here, and if it exceeds save_every,
+                        // change the file name and reset this counter.
+                        sfen_write_count_current_file += buf->size();
+                        if (sfen_write_count_current_file >= save_every)
+                        {
+                            sfen_write_count_current_file = 0;
+
+                            // Sequential number attached to the file
+                            int n = (int)(sfen_write_count / save_every);
+
+                            // Rename the file and open it again.
+                            // Add ios::app in consideration of overwriting.
+                            // (Depending on the operation, it may not be necessary.)
+                            string new_filename = filename + "_" + std::to_string(n);
+                            output_file_stream = create_new_sfen_output(new_filename, sfen_format);
+
+                            auto out = sync_region_cout.new_region();
+                            out << "INFO (sfen_writer): Creating new data file at " << new_filename << endl;
+                        }
+                    }
+                }
+            }
+        }
+
+    private:
+
+        std::unique_ptr<BasicSfenOutputStream> output_file_stream;
+
+        // A new net is saved after every save_every sfens are processed.
+        uint64_t save_every = std::numeric_limits<uint64_t>::max();
+
+        // File name passed in the constructor
+        std::string filename;
+
+        // Thread to write to the file
+        std::thread file_worker_thread;
+
+        // Flag that all threads have finished
+        atomic<bool> finished;
+
+        SfenOutputType sfen_format;
+
+        // buffer before writing to file
+        // sfen_buffers is the buffer for each thread
+        // sfen_buffers_pool is a buffer for writing.
+        // After loading the phase in the former buffer by SFEN_WRITE_SIZE,
+        // transfer it to the latter.
+        std::vector<std::unique_ptr<PSVector>> sfen_buffers;
+        std::vector<std::unique_ptr<PSVector>> sfen_buffers_pool;
+
+        // Mutex required to access sfen_buffers_pool
+        std::mutex mutex;
+
+        // Number of sfens written in total, and the
+        // number of sfens written in the current file.
+        uint64_t sfen_write_count = 0;
+        uint64_t sfen_write_count_current_file = 0;
+    };
+}

From e01397c674a843e2f90623b50d92dca6712b3f63 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 25 Oct 2020 10:43:45 +0100
Subject: [PATCH 274/398] Remove multi_think

---
 src/Makefile              |   3 +-
 src/learn/convert.cpp     |   2 -
 src/learn/multi_think.cpp |  98 ------------------------
 src/learn/multi_think.h   | 152 --------------------------------------
 4 files changed, 1 insertion(+), 254 deletions(-)
 delete mode 100644 src/learn/multi_think.cpp
 delete mode 100644 src/learn/multi_think.h

diff --git a/src/Makefile b/src/Makefile
index 0b2f99ed..f2c4d269 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -59,8 +59,7 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	learn/sfen_packer.cpp \
 	learn/learn.cpp \
 	learn/gensfen.cpp \
-	learn/convert.cpp \
-	learn/multi_think.cpp
+	learn/convert.cpp
 
 OBJS = $(notdir $(SRCS:.cpp=.o))
 
diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
index a7528b02..dfd30509 100644
--- a/src/learn/convert.cpp
+++ b/src/learn/convert.cpp
@@ -1,7 +1,5 @@
 #include "convert.h"
 
-#include "multi_think.h"
-
 #include "uci.h"
 #include "misc.h"
 #include "thread.h"
diff --git a/src/learn/multi_think.cpp b/src/learn/multi_think.cpp
deleted file mode 100644
index bf1ab29b..00000000
--- a/src/learn/multi_think.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-﻿#include "multi_think.h"
-
-#include "tt.h"
-#include "uci.h"
-#include "types.h"
-#include "search.h"
-
-#include "nnue/evaluate_nnue.h"
-
-#include <thread>
-
-void MultiThink::go_think()
-{
-    // Call the derived class's init().
-    init();
-
-    // The loop upper limit is set with set_loop_max().
-    loop_count = 0;
-    done_count = 0;
-
-    // Create threads as many as Options["Threads"] and start thinking.
-    std::vector<std::thread> threads;
-    auto thread_num = (size_t)Options["Threads"];
-
-    // Secure end flag of worker thread
-        threads_finished=0;
-
-    // start worker thread
-    for (size_t i = 0; i < thread_num; ++i)
-    {
-        threads.push_back(std::thread([i, this]
-        {
-            // exhaust all processor threads.
-            WinProcGroup::bindThisThread(i);
-
-            // execute the overridden process
-            this->thread_worker(i);
-
-            // Set the end flag because the thread has ended
-            this->threads_finished++;
-        }));
-    }
-
-    // wait for all threads to finish
-    // for (auto& th :threads)
-    // th.join();
-    // If you write like, the thread will rush here while it is still working,
-    // During that time, callback_func() cannot be called and you cannot save.
-    // Therefore, you need to check the end flag yourself.
-
-    // function to determine if all threads have finished
-    auto threads_done = [&]()
-    {
-        return threads_finished == thread_num;
-    };
-
-    // Call back if the callback function is set.
-    auto do_a_callback = [&]()
-    {
-        if (callback_func)
-            callback_func();
-    };
-
-
-    for (uint64_t i = 0 ; ; )
-    {
-        // If all threads have finished, exit the loop.
-        if (threads_done())
-            break;
-
-        sleep(1000);
-
-        // callback_func() is called every callback_seconds.
-        if (++i == callback_seconds)
-        {
-            do_a_callback();
-            // Since I am returning from ↑, I reset the counter, so
-            // no matter how long it takes to save() etc. in do_a_callback()
-            // The next call will take a certain amount of time.
-            i = 0;
-        }
-    }
-
-    // Last save.
-    std::cout << std::endl << "finalize..";
-
-    // do_a_callback();
-    // → It should be saved by the caller, so I feel that it is not necessary here.
-
-    // It is possible that the exit code of the thread is running but the exit code of the thread is running, so
-    // We need to wait for the end with join().
-    for (auto& th : threads)
-        th.join();
-
-    // The file writing thread etc. are still running only when all threads are finished
-    // Since the work itself may not have completed, output only that all threads have finished.
-    std::cout << "all threads are joined." << std::endl;
-}
diff --git a/src/learn/multi_think.h b/src/learn/multi_think.h
deleted file mode 100644
index 7e541909..00000000
--- a/src/learn/multi_think.h
+++ /dev/null
@@ -1,152 +0,0 @@
-﻿#ifndef _MULTI_THINK_
-#define _MULTI_THINK_
-
-#include "learn.h"
-
-#include "misc.h"
-#include "thread_win32_osx.h"
-
-#include <atomic>
-#include <limits>
-#include <functional>
-#include <mutex>
-#include <string>
-#include <cstdint>
-
-
-// Learning from a game record, when making yourself think and generating a fixed track, etc.
-// Helper class used when multiple threads want to call Search::think() individually.
-// Derive and use this class.
-struct MultiThink
-{
-    static constexpr std::uint64_t LOOP_COUNT_FINISHED = std::numeric_limits<std::uint64_t>::max();
-
-    MultiThink() : prng{}, loop_count(0) { }
-
-    MultiThink(std::uint64_t seed) : prng(seed), loop_count(0) { }
-
-    MultiThink(const std::string& seed) : prng(seed), loop_count(0) { }
-
-    // Call this function from the master thread, each thread will think,
-    // Return control when the thought ending condition is satisfied.
-    // Do something else.
-    // ・It is safe for each thread to call Learner::search(),qsearch()
-    // Separates the substitution table for each thread. (It will be restored after the end.)
-    // ・Book is not thread safe when in on the fly mode, so temporarily change this mode.
-    // Turn it off.
-    // [Requirements]
-    // 1) Override thread_worker()
-    // 2) Set the loop count with set_loop_max()
-    // 3) set a function to be called back periodically (if necessary)
-    // callback_func and callback_interval
-    void go_think();
-
-    // If there is something you want to initialize on the derived class side, override this,
-    // Called when initialization is completed with go_think().
-    // It is better to read the fixed trace at that timing.
-    virtual void init() {}
-
-    // A thread worker that is called by creating a thread when you go_think()
-    // Override and use this.
-    virtual void thread_worker(size_t thread_id) = 0;
-
-    // Called back every callback_seconds [seconds] when go_think().
-    std::function<void()> callback_func;
-    uint64_t callback_seconds = 600;
-
-    // Set the number of times worker processes (calls Search::think()).
-    void set_loop_max(uint64_t loop_max_) { loop_max = loop_max_; }
-
-    // Get the value set by set_loop_max().
-    uint64_t get_loop_max() const { return loop_max; }
-
-    // [ASYNC] Take the value of the loop counter and add the loop counter after taking it out.
-    // If the loop counter has reached loop_max, return UINT64_MAX.
-    // If you want to generate a phase, you must call this function at the time of generating the phase,
-    // Please note that the number of generated phases and the value of the counter will not match.
-    uint64_t get_next_loop_count() {
-        std::unique_lock<std::mutex> lk(loop_mutex);
-        if (loop_count >= loop_max)
-            return LOOP_COUNT_FINISHED;
-        return loop_count++;
-    }
-
-    // [ASYNC] For returning the processed number. Each time it is called, it returns a counter that is incremented.
-    uint64_t get_done_count() {
-        std::unique_lock<std::mutex> lk(loop_mutex);
-        return ++done_count;
-    }
-
-    // Mutex when worker thread accesses I/O
-    std::mutex io_mutex;
-
-protected:
-    // Random number generator body
-    AsyncPRNG prng;
-
-private:
-    // number of times worker processes (calls Search::think())
-    std::atomic<uint64_t> loop_max;
-    // number of times the worker has processed (calls Search::think())
-    std::atomic<uint64_t> loop_count;
-    // To return the number of times it has been processed.
-    std::atomic<uint64_t> done_count;
-
-    // Mutex when changing the variables in ↑
-    std::mutex loop_mutex;
-
-    // Thread end flag.
-    std::atomic<uint64_t> threads_finished;
-};
-
-// Mechanism to process task during idle time.
-// master passes the task with push_task_async() whenever you like.
-// When slave executes on_idle() in its spare time, it retrieves one task and continues execution until there is no queue.
-// Convenient to use when you want to write MultiThink thread worker in master-slave method.
-struct TaskDispatcher
-{
-    typedef std::function<void(size_t /* thread_id */)> Task;
-
-    // slave calls this function during idle.
-    void on_idle(size_t thread_id)
-    {
-        Task task;
-        while ((task = get_task_async()) != nullptr)
-            task(thread_id);
-
-        sleep(1);
-    }
-
-    // Stack [ASYNC] task.
-    void push_task_async(Task task)
-    {
-        std::unique_lock<std::mutex> lk(task_mutex);
-        tasks.push_back(task);
-    }
-
-    // Allocate size array elements for task in advance.
-    void task_reserve(size_t size)
-    {
-        tasks.reserve(size);
-    }
-
-protected:
-    // set of tasks
-    std::vector<Task> tasks;
-
-    // Take out one [ASYNC] task. Called from on_idle().
-    Task get_task_async()
-    {
-        std::unique_lock<std::mutex> lk(task_mutex);
-        if (tasks.size() == 0)
-            return nullptr;
-        Task task = *tasks.rbegin();
-        tasks.pop_back();
-        return task;
-    }
-
-    // a mutex for accessing tasks
-    std::mutex task_mutex;
-};
-
-#endif

From ba390a7f9a0a0243531a2489ab4f4303a26deca4 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 26 Oct 2020 13:52:35 +0100
Subject: [PATCH 275/398] Print the used factorizer when intializing training.

---
 src/nnue/evaluate_nnue_learner.cpp               |  5 +++++
 src/nnue/trainer/features/factorizer.h           |  8 ++++++++
 .../trainer/features/factorizer_feature_set.h    | 16 ++++++++++++++++
 src/nnue/trainer/features/factorizer_half_kp.h   |  8 ++++++++
 4 files changed, 37 insertions(+)

diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 7a72ea19..6775707d 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -68,6 +68,11 @@ namespace Eval::NNUE {
 
         out << std::endl;
 
+        out << "Factorizers:\n"
+            << Features::Factorizer<RawFeatures>::get_factorizers_string() << std::endl;
+
+        out << std::endl;
+
         assert(feature_transformer);
         assert(network);
 
diff --git a/src/nnue/trainer/features/factorizer.h b/src/nnue/trainer/features/factorizer.h
index 49a2fe26..15ce8022 100644
--- a/src/nnue/trainer/features/factorizer.h
+++ b/src/nnue/trainer/features/factorizer.h
@@ -13,6 +13,14 @@ namespace Eval::NNUE::Features {
     template <typename FeatureType>
     class Factorizer {
     public:
+        static constexpr std::string get_name() {
+            return std::string("No factorizer");
+        }
+
+        static constexpr std::string get_factorizers_string() {
+            return "  - " + get_name();
+        }
+
         // Get the dimensionality of the learning feature
         static constexpr IndexType get_dimensions() {
             return FeatureType::kDimensions;
diff --git a/src/nnue/trainer/features/factorizer_feature_set.h b/src/nnue/trainer/features/factorizer_feature_set.h
index 032a449b..f5ee3c5c 100644
--- a/src/nnue/trainer/features/factorizer_feature_set.h
+++ b/src/nnue/trainer/features/factorizer_feature_set.h
@@ -21,6 +21,14 @@ namespace Eval::NNUE::Features {
         static constexpr IndexType kBaseDimensions =
             FeatureSet<FirstFeatureType, RemainingFeatureTypes...>::kDimensions;
 
+        static constexpr std::string get_factorizers_string() {
+            std::string str = "  - ";
+            str += Head::get_name();
+            str += '\n';
+            str += Tail::get_factorizers_string();
+            return str;
+        }
+
         // Get the dimensionality of the learning feature
         static constexpr IndexType get_dimensions() {
             return Head::get_dimensions() + Tail::get_dimensions();
@@ -73,6 +81,14 @@ namespace Eval::NNUE::Features {
         // number of dimensions of original input features
         static constexpr IndexType kBaseDimensions = FeatureType::kDimensions;
 
+        static constexpr std::string get_name() {
+            return FeatureType::kName;
+        }
+
+        static constexpr std::string get_factorizers_string() {
+            return "  - " + get_name();
+        }
+
         // Get the dimensionality of the learning feature
         static constexpr IndexType get_dimensions() {
             return Factorizer<FeatureType>::get_dimensions();
diff --git a/src/nnue/trainer/features/factorizer_half_kp.h b/src/nnue/trainer/features/factorizer_half_kp.h
index 152722ac..601ddfa5 100644
--- a/src/nnue/trainer/features/factorizer_half_kp.h
+++ b/src/nnue/trainer/features/factorizer_half_kp.h
@@ -45,6 +45,14 @@ namespace Eval::NNUE::Features {
         static_assert(get_array_length(kProperties) == kNumTrainingFeatureTypes, "");
 
     public:
+        static constexpr std::string get_name() {
+            return std::string("Factorizer<") + FeatureType::kName + ">";
+        }
+
+        static constexpr std::string get_factorizers_string() {
+            return "  - " + get_name();
+        }
+
         // Get the dimensionality of the learning feature
         static constexpr IndexType get_dimensions() {
             return get_active_dimensions(kProperties);

From f7de49eb66b07a0ab65c32184d22af4abeced378 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 23:16:11 +0200
Subject: [PATCH 276/398] Create a collective parameter struct for learner.

---
 src/learn/learn.cpp | 374 +++++++++++++++++++-------------------------
 1 file changed, 162 insertions(+), 212 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 3faab0ea..e9eb1141 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -77,10 +77,6 @@ T operator -= (std::atomic<T>& x, const T rhs) { return x += -rhs; }
 
 namespace Learner
 {
-    static bool use_draw_games_in_training = true;
-    static bool use_draw_games_in_validation = true;
-    static bool skip_duplicated_positions_in_training = true;
-
     static double winning_probability_coefficient = 1.0 / PawnValueEg / 4.0 * std::log(10.0);
 
     // Score scale factors. ex) If we set src_score_min_value = 0.0,
@@ -373,37 +369,94 @@ namespace Learner
     // Class to generate sfen with multiple threads
     struct LearnerThink
     {
+        struct Params
+        {
+            // Mini batch size size. Be sure to set it on the side that uses this class.
+            uint64_t mini_batch_size = LEARN_MINI_BATCH_SIZE;
+
+            // Option to exclude early stage from learning
+            int reduction_gameply = 1;
+
+            // If the absolute value of the evaluation value of the deep search
+            // of the teacher phase exceeds this value, discard the teacher phase.
+            int eval_limit = 32000;
+
+            // Flag whether to dig a folder each time the evaluation function is saved.
+            // If true, do not dig the folder.
+            bool save_only_once = false;
+
+            bool shuffle = true;
+
+            bool verbose = false;
+
+            double newbob_decay = 0.5;
+            int newbob_num_trials = 4;
+            uint64_t auto_lr_drop = 0;
+
+            std::string best_nn_directory;
+
+            uint64_t eval_save_interval = LEARN_EVAL_SAVE_INTERVAL;
+            uint64_t loss_output_interval = 1'000'000;
+
+            size_t sfen_read_size = SfenReader::DEFAULT_SFEN_READ_SIZE;
+            size_t thread_buffer_size = SfenReader::DEFAULT_THREAD_BUFFER_SIZE;
+
+            bool use_draw_games_in_training = true;
+            bool use_draw_games_in_validation = true;
+            bool skip_duplicated_positions_in_training = true;
+
+            string validation_set_file_name;
+            string seed;
+
+            std::vector<std::string> filenames;
+
+            uint64_t num_threads;
+
+            void enforce_constraints()
+            {
+                num_threads = Options["Threads"];
+
+                if (loss_output_interval == 0)
+                {
+                    loss_output_interval = LEARN_RMSE_OUTPUT_INTERVAL * mini_batch_size;
+                }
+
+                // If reduction_gameply is set to 0, rand(0) will be divided by 0, so correct it to 1.
+                reduction_gameply = max(reduction_gameply, 1);
+
+                if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
+                    // Save the current net to [EvalSaveDir]\original.
+                    Eval::NNUE::save_eval("original");
+
+                    // Set the folder above to best_nn_directory so that the trainer can
+                    // resotre the network parameters from the original net file.
+                    best_nn_directory =
+                        Path::combine(Options["EvalSaveDir"], "original");
+                }
+            }
+        };
+
         // Number of phases used for calculation such as mse
         // mini-batch size = 1M is standard, so 0.2% of that should be negligible in terms of time.
         // Since search() is performed with depth = 1 in calculation of
         // move match rate, simple comparison is not possible...
         static constexpr uint64_t sfen_for_mse_size = 2000;
 
-        LearnerThink(
-            const std::vector<std::string>& filenames,
-            bool shuffle,
-            uint64_t thread_num,
-            const std::string& seed,
-            size_t read_size,
-            size_t buffer_size
-        ) :
-            prng(seed),
+        LearnerThink(const Params& prm) :
+            params(prm),
+            prng(prm.seed),
             sr(
-                filenames,
-                shuffle,
+                prm.filenames,
+                prm.shuffle,
                 SfenReaderMode::Cyclic,
-                thread_num,
+                prm.num_threads,
                 std::to_string(prng.next_random_seed()),
-                read_size,
-                buffer_size),
+                prm.sfen_read_size,
+                prm.thread_buffer_size),
             learn_loss_sum{}
         {
-            save_only_once = false;
             save_count = 0;
             loss_output_count = 0;
-            newbob_decay = 1.0;
-            newbob_num_trials = 2;
-            auto_lr_drop = 0;
             last_lr_drop = 0;
             best_loss = std::numeric_limits<double>::infinity();
             latest_loss_sum = 0.0;
@@ -413,34 +466,6 @@ namespace Learner
 
         void learn(uint64_t epochs);
 
-
-        std::string validation_set_file_name;
-
-        // Mini batch size size. Be sure to set it on the side that uses this class.
-        uint64_t mini_batch_size = LEARN_MINI_BATCH_SIZE;
-
-        // Option to exclude early stage from learning
-        int reduction_gameply;
-
-        // If the absolute value of the evaluation value of the deep search
-        // of the teacher phase exceeds this value, discard the teacher phase.
-        int eval_limit;
-
-        // Flag whether to dig a folder each time the evaluation function is saved.
-        // If true, do not dig the folder.
-        bool save_only_once;
-
-        bool verbose;
-
-        double newbob_decay;
-        int newbob_num_trials;
-        uint64_t auto_lr_drop;
-
-        std::string best_nn_directory;
-
-        uint64_t eval_save_interval;
-        uint64_t loss_output_interval;
-
     private:
         void learn_worker(Thread& th, std::atomic<uint64_t>& counter, uint64_t limit);
 
@@ -462,6 +487,8 @@ namespace Learner
         // save merit function parameters to a file
         bool save(bool is_final = false);
 
+        Params params;
+
         PRNG prng;
 
         // sfen reader
@@ -493,11 +520,14 @@ namespace Learner
         Eval::NNUE::verify_any_net_loaded();
 
         const PSVector sfen_for_mse =
-            validation_set_file_name.empty()
+            params.validation_set_file_name.empty()
             ? sr.read_for_mse(sfen_for_mse_size)
-            : sr.read_validation_set(validation_set_file_name, eval_limit, use_draw_games_in_validation);
+            : sr.read_validation_set(
+                params.validation_set_file_name,
+                params.eval_limit,
+                params.use_draw_games_in_validation);
 
-        if (validation_set_file_name.empty()
+        if (params.validation_set_file_name.empty()
             && sfen_for_mse.size() != sfen_for_mse_size)
         {
             auto out = sync_region_cout.new_region();
@@ -508,7 +538,7 @@ namespace Learner
             return;
         }
 
-        if (newbob_decay != 1.0) {
+        if (params.newbob_decay != 1.0) {
 
             calc_loss(sfen_for_mse, 0);
 
@@ -527,10 +557,10 @@ namespace Learner
             std::atomic<uint64_t> counter{0};
 
             Threads.execute_with_workers([this, &counter](auto& th){
-                learn_worker(th, counter, mini_batch_size);
+                learn_worker(th, counter, params.mini_batch_size);
             });
 
-            total_done += mini_batch_size;
+            total_done += params.mini_batch_size;
 
             Threads.wait_for_workers_finished();
 
@@ -574,14 +604,14 @@ namespace Learner
                 break;
             }
 
-            if (eval_limit < abs(ps.score))
+            if (params.eval_limit < abs(ps.score))
                 goto RETRY_READ;
 
-            if (!use_draw_games_in_training && ps.game_result == 0)
+            if (!params.use_draw_games_in_training && ps.game_result == 0)
                 goto RETRY_READ;
 
             // Skip over the opening phase
-            if (ps.gamePly < prng.rand(reduction_gameply))
+            if (ps.gamePly < prng.rand(params.reduction_gameply))
                 goto RETRY_READ;
 
             StateInfo si;
@@ -647,10 +677,10 @@ namespace Learner
         // should be no real issues happening since
         // the read/write phases are isolated.
         atomic_thread_fence(memory_order_seq_cst);
-        Eval::NNUE::update_parameters(epoch, verbose);
+        Eval::NNUE::update_parameters(epoch, params.verbose);
         atomic_thread_fence(memory_order_seq_cst);
 
-        if (++save_count * mini_batch_size >= eval_save_interval)
+        if (++save_count * params.mini_batch_size >= params.eval_save_interval)
         {
             save_count = 0;
 
@@ -662,7 +692,7 @@ namespace Learner
             }
         }
 
-        if (++loss_output_count * mini_batch_size >= loss_output_interval)
+        if (++loss_output_count * params.mini_batch_size >= params.loss_output_interval)
         {
             loss_output_count = 0;
 
@@ -829,7 +859,7 @@ namespace Learner
         // Each time you save, change the extension part of the file name like "0","1","2",..
         // (Because I want to compare the winning rate for each evaluation function parameter later)
 
-        if (save_only_once)
+        if (params.save_only_once)
         {
             // When EVAL_SAVE_ONLY_ONCE is defined,
             // Do not dig a subfolder because I want to save it only once.
@@ -846,50 +876,49 @@ namespace Learner
             const std::string dir_name = std::to_string(dir_number++);
             Eval::NNUE::save_eval(dir_name);
 
-            if (newbob_decay != 1.0 && latest_loss_count > 0) {
-                static int trials = newbob_num_trials;
+            if (params.newbob_decay != 1.0 && latest_loss_count > 0) {
+                static int trials = params.newbob_num_trials;
                 const double latest_loss = latest_loss_sum / latest_loss_count;
                 latest_loss_sum = 0.0;
                 latest_loss_count = 0;
                 cout << "INFO (learning_rate):" << endl;
                 cout << "  - loss = " << latest_loss;
                 auto tot = total_done;
-                if (auto_lr_drop)
+                if (params.auto_lr_drop)
                 {
                     cout << " < best (" << best_loss << "), accepted" << endl;
                     best_loss = latest_loss;
-                    best_nn_directory = Path::combine((std::string)Options["EvalSaveDir"], dir_name);
-                    trials = newbob_num_trials;
+                    trials = params.newbob_num_trials;
 
-                    if (tot >= last_lr_drop + auto_lr_drop)
+                    if (tot >= last_lr_drop + params.auto_lr_drop)
                     {
                         last_lr_drop = tot;
-                        global_learning_rate *= newbob_decay;
+                        global_learning_rate *= params.newbob_decay;
                     }
                 }
                 else if (latest_loss < best_loss)
                 {
                     cout << " < best (" << best_loss << "), accepted" << endl;
                     best_loss = latest_loss;
-                    best_nn_directory = Path::combine((std::string)Options["EvalSaveDir"], dir_name);
-                    trials = newbob_num_trials;
+                    trials = params.newbob_num_trials;
                 }
                 else
                 {
                     cout << " >= best (" << best_loss << "), rejected" << endl;
-                    best_nn_directory = Path::combine((std::string)Options["EvalSaveDir"], dir_name);
 
                     if (--trials > 0 && !is_final)
                     {
                         cout
                             << "  - reducing learning rate from " << global_learning_rate
-                            << " to " << (global_learning_rate * newbob_decay)
+                            << " to " << (global_learning_rate * params.newbob_decay)
                             << " (" << trials << " more trials)" << endl;
 
-                        global_learning_rate *= newbob_decay;
+                        global_learning_rate *= params.newbob_decay;
                     }
                 }
 
+                params.best_nn_directory = Path::combine((std::string)Options["EvalSaveDir"], dir_name);
+
                 if (trials == 0)
                 {
                     cout << "  - converged" << endl;
@@ -924,12 +953,7 @@ namespace Learner
     // Learning from the generated game record
     void learn(Position&, istringstream& is)
     {
-        const auto thread_num = (int)Options["Threads"];
-
-        vector<string> filenames;
-
-        // mini_batch_size 1M aspect by default. This can be increased.
-        auto mini_batch_size = LEARN_MINI_BATCH_SIZE;
+        LearnerThink::Params params;
 
         // Number of epochs
         uint64_t epochs = std::numeric_limits<uint64_t>::max();
@@ -938,21 +962,6 @@ namespace Learner
         string base_dir;
         string target_dir;
 
-        // If the absolute value of the evaluation value
-        // in the deep search of the teacher phase exceeds this value,
-        // that phase is discarded.
-        int eval_limit = 32000;
-
-        // Flag to save the evaluation function file only once near the end.
-        bool save_only_once = false;
-
-        // Shuffle about what you are pre-reading on the teacher aspect.
-        // (Shuffle of about 10 million phases)
-        // Turn on if you want to pass a pre-shuffled file.
-        bool no_shuffle = false;
-
-        bool verbose = false;
-
         global_learning_rate = 1.0;
 
         // elmo lambda
@@ -960,26 +969,9 @@ namespace Learner
         ELMO_LAMBDA2 = 1.0;
         ELMO_LAMBDA_LIMIT = 32000;
 
-        // if (gamePly <rand(reduction_gameply)) continue;
-        // An option to exclude the early stage from the learning target moderately like
-        // If set to 1, rand(1)==0, so nothing is excluded.
-        int reduction_gameply = 1;
-
         uint64_t nn_batch_size = 1000;
-        double newbob_decay = 0.5;
-        int newbob_num_trials = 4;
-        uint64_t auto_lr_drop = 0;
         string nn_options;
 
-        uint64_t eval_save_interval = LEARN_EVAL_SAVE_INTERVAL;
-        uint64_t loss_output_interval = 1'000'000;
-
-        size_t sfen_read_size = SfenReader::DEFAULT_SFEN_READ_SIZE;
-        size_t thread_buffer_size = SfenReader::DEFAULT_THREAD_BUFFER_SIZE;
-
-        string validation_set_file_name;
-        string seed;
-
         auto out = sync_region_cout.new_region();
 
         // Assume the filenames are staggered.
@@ -994,8 +986,8 @@ namespace Learner
             // specify the number of phases of mini-batch
             if (option == "bat")
             {
-                is >> mini_batch_size;
-                mini_batch_size *= 10000; // Unit is ten thousand
+                is >> params.mini_batch_size;
+                params.mini_batch_size *= 10000; // Unit is ten thousand
             }
 
             // Specify the folder in which the game record is stored and make it the rooting target.
@@ -1004,72 +996,73 @@ namespace Learner
             {
                 std::string filename;
                 is >> filename;
-                filenames.push_back(filename);
+                params.filenames.push_back(filename);
             }
 
             // Specify the number of loops
-            else if (option == "epochs")      is >> epochs;
+            else if (option == "epochs") is >> epochs;
 
             // Game file storage folder (get game file with relative path from here)
-            else if (option == "basedir")   is >> base_dir;
+            else if (option == "basedir") is >> base_dir;
 
             // Mini batch size
-            else if (option == "batchsize") is >> mini_batch_size;
+            else if (option == "batchsize") is >> params.mini_batch_size;
 
             // learning rate
-            else if (option == "lr")        is >> global_learning_rate;
+            else if (option == "lr") is >> global_learning_rate;
 
             // Accept also the old option name.
             else if (option == "use_draw_in_training"
                   || option == "use_draw_games_in_training")
-                is >> use_draw_games_in_training;
+                is >> params.use_draw_games_in_training;
 
             // Accept also the old option name.
             else if (option == "use_draw_in_validation"
                   || option == "use_draw_games_in_validation")
-                is >> use_draw_games_in_validation;
+                is >> params.use_draw_games_in_validation;
 
             // Accept also the old option name.
             else if (option == "use_hash_in_training"
                   || option == "skip_duplicated_positions_in_training")
-                is >> skip_duplicated_positions_in_training;
+                is >> params.skip_duplicated_positions_in_training;
 
-            else if (option == "winning_probability_coefficient") is >> winning_probability_coefficient;
+            else if (option == "winning_probability_coefficient")
+                is >> winning_probability_coefficient;
 
             // Using WDL with win rate model instead of sigmoid
             else if (option == "use_wdl") is >> use_wdl;
 
 
             // LAMBDA
-            else if (option == "lambda")       is >> ELMO_LAMBDA;
-            else if (option == "lambda2")      is >> ELMO_LAMBDA2;
+            else if (option == "lambda") is >> ELMO_LAMBDA;
+            else if (option == "lambda2") is >> ELMO_LAMBDA2;
             else if (option == "lambda_limit") is >> ELMO_LAMBDA_LIMIT;
 
-            else if (option == "reduction_gameply") is >> reduction_gameply;
+            else if (option == "reduction_gameply") is >> params.reduction_gameply;
 
-            else if (option == "eval_limit") is >> eval_limit;
-            else if (option == "save_only_once") save_only_once = true;
-            else if (option == "no_shuffle") no_shuffle = true;
+            else if (option == "eval_limit") is >> params.eval_limit;
+            else if (option == "save_only_once") params.save_only_once = true;
+            else if (option == "no_shuffle") params.shuffle = false;
 
             else if (option == "nn_batch_size") is >> nn_batch_size;
-            else if (option == "newbob_decay") is >> newbob_decay;
-            else if (option == "newbob_num_trials") is >> newbob_num_trials;
+            else if (option == "newbob_decay") is >> params.newbob_decay;
+            else if (option == "newbob_num_trials") is >> params.newbob_num_trials;
             else if (option == "nn_options") is >> nn_options;
-            else if (option == "auto_lr_drop") is >> auto_lr_drop;
+            else if (option == "auto_lr_drop") is >> params.auto_lr_drop;
 
-            else if (option == "eval_save_interval") is >> eval_save_interval;
-            else if (option == "loss_output_interval") is >> loss_output_interval;
-            else if (option == "validation_set_file_name") is >> validation_set_file_name;
+            else if (option == "eval_save_interval") is >> params.eval_save_interval;
+            else if (option == "loss_output_interval") is >> params.loss_output_interval;
+            else if (option == "validation_set_file_name") is >> params.validation_set_file_name;
 
             else if (option == "src_score_min_value") is >> src_score_min_value;
             else if (option == "src_score_max_value") is >> src_score_max_value;
             else if (option == "dest_score_min_value") is >> dest_score_min_value;
             else if (option == "dest_score_max_value") is >> dest_score_max_value;
 
-            else if (option == "sfen_read_size") is >> sfen_read_size;
-            else if (option == "thread_buffer_size") is >> thread_buffer_size;
+            else if (option == "sfen_read_size") is >> params.sfen_read_size;
+            else if (option == "thread_buffer_size") is >> params.thread_buffer_size;
 
-            else if (option == "seed") is >> seed;
+            else if (option == "seed") is >> params.seed;
             else if (option == "set_recommended_uci_options")
             {
                 UCI::setoption("Use NNUE", "pure");
@@ -1082,21 +1075,13 @@ namespace Learner
                 UCI::setoption("PruneAtShallowDepth", "false");
                 UCI::setoption("EnableTranspositionTable", "false");
             }
-            else if (option == "verbose") verbose = true;
+            else if (option == "verbose") params.verbose = true;
             else
             {
                 out << "INFO: Unknown option: " << option << ". Ignoring.\n";
             }
         }
 
-        if (loss_output_interval == 0)
-        {
-            loss_output_interval = LEARN_RMSE_OUTPUT_INTERVAL * mini_batch_size;
-        }
-
-        // If reduction_gameply is set to 0, rand(0) will be divided by 0, so correct it to 1.
-        reduction_gameply = max(reduction_gameply, 1);
-
         out << "INFO: Executing learn command\n";
 
         // Issue a warning if OpenMP is disabled.
@@ -1104,40 +1089,42 @@ namespace Learner
         out << "WARNING: OpenMP disabled." << endl;
 #endif
 
+        params.enforce_constraints();
+
         // Right now we only have the individual files.
         // We need to apply base_dir here
         if (!target_dir.empty())
         {
-            append_files_from_dir(filenames, base_dir, target_dir);
+            append_files_from_dir(params.filenames, base_dir, target_dir);
         }
-        rebase_files(filenames, base_dir);
+        rebase_files(params.filenames, base_dir);
 
         out << "INFO: Input files:\n";
-        for (auto s : filenames)
+        for (auto s : params.filenames)
             out << "  - " << s << '\n';
 
         out << "INFO: Parameters:\n";
-        if (!validation_set_file_name.empty())
+        if (!params.validation_set_file_name.empty())
         {
-            out << "  - validation set           : " << validation_set_file_name << endl;
+            out << "  - validation set           : " << params.validation_set_file_name << endl;
         }
 
         out << "  - epochs                   : " << epochs << endl;
-        out << "  - epochs * minibatch size  : " << epochs * mini_batch_size << endl;
-        out << "  - eval_limit               : " << eval_limit << endl;
-        out << "  - save_only_once           : " << (save_only_once ? "true" : "false") << endl;
-        out << "  - shuffle on read          : " << (no_shuffle ? "false" : "true") << endl;
+        out << "  - epochs * minibatch size  : " << epochs * params.mini_batch_size << endl;
+        out << "  - eval_limit               : " << params.eval_limit << endl;
+        out << "  - save_only_once           : " << (params.save_only_once ? "true" : "false") << endl;
+        out << "  - shuffle on read          : " << (params.shuffle ? "true" : "false") << endl;
 
         out << "  - Loss Function            : " << LOSS_FUNCTION << endl;
-        out << "  - minibatch size           : " << mini_batch_size << endl;
+        out << "  - minibatch size           : " << params.mini_batch_size << endl;
 
         out << "  - nn_batch_size            : " << nn_batch_size << endl;
         out << "  - nn_options               : " << nn_options << endl;
 
         out << "  - learning rate            : " << global_learning_rate << endl;
-        out << "  - use draws in training    : " << use_draw_games_in_training << endl;
-        out << "  - use draws in validation  : " << use_draw_games_in_validation << endl;
-        out << "  - skip repeated positions  : " << skip_duplicated_positions_in_training << endl;
+        out << "  - use draws in training    : " << params.use_draw_games_in_training << endl;
+        out << "  - use draws in validation  : " << params.use_draw_games_in_validation << endl;
+        out << "  - skip repeated positions  : " << params.skip_duplicated_positions_in_training << endl;
 
         out << "  - winning prob coeff       : " << winning_probability_coefficient << endl;
         out << "  - use_wdl                  : " << use_wdl << endl;
@@ -1147,27 +1134,27 @@ namespace Learner
         out << "  - dest_score_min_value     : " << dest_score_min_value << endl;
         out << "  - dest_score_max_value     : " << dest_score_max_value << endl;
 
-        out << "  - reduction_gameply        : " << reduction_gameply << endl;
+        out << "  - reduction_gameply        : " << params.reduction_gameply << endl;
 
         out << "  - LAMBDA                   : " << ELMO_LAMBDA << endl;
         out << "  - LAMBDA2                  : " << ELMO_LAMBDA2 << endl;
         out << "  - LAMBDA_LIMIT             : " << ELMO_LAMBDA_LIMIT << endl;
-        out << "  - eval_save_interval       : " << eval_save_interval << " sfens" << endl;
-        out << "  - loss_output_interval     : " << loss_output_interval << " sfens" << endl;
+        out << "  - eval_save_interval       : " << params.eval_save_interval << " sfens" << endl;
+        out << "  - loss_output_interval     : " << params.loss_output_interval << " sfens" << endl;
 
-        out << "  - sfen_read_size           : " << sfen_read_size << endl;
-        out << "  - thread_buffer_size       : " << thread_buffer_size << endl;
+        out << "  - sfen_read_size           : " << params.sfen_read_size << endl;
+        out << "  - thread_buffer_size       : " << params.thread_buffer_size << endl;
 
-        out << "  - seed                     : " << seed << endl;
-        out << "  - verbose                  : " << (verbose ? "true" : "false") << endl;
+        out << "  - seed                     : " << params.seed << endl;
+        out << "  - verbose                  : " << (params.verbose ? "true" : "false") << endl;
 
-        if (auto_lr_drop) {
-            out << "  - learning rate scheduling : every " << auto_lr_drop << " sfens" << endl;
+        if (params.auto_lr_drop) {
+            out << "  - learning rate scheduling : every " << params.auto_lr_drop << " sfens" << endl;
         }
-        else if (newbob_decay != 1.0) {
+        else if (params.newbob_decay != 1.0) {
             out << "  - learning rate scheduling : newbob with decay" << endl;
-            out << "  - newbob_decay             : " << newbob_decay << endl;
-            out << "  - newbob_num_trials        : " << newbob_num_trials << endl;
+            out << "  - newbob_decay             : " << params.newbob_decay << endl;
+            out << "  - newbob_num_trials        : " << params.newbob_num_trials << endl;
         }
         else {
             out << "  - learning rate scheduling : fixed learning rate" << endl;
@@ -1175,54 +1162,17 @@ namespace Learner
 
         out << endl;
 
-        // -----------------------------------
-        // various initialization
-        // -----------------------------------
-
         out << "INFO: Started initialization." << endl;
 
         Threads.main()->ponder = false;
 
         set_learning_search_limits();
 
-        Eval::NNUE::initialize_training(seed, out);
+        Eval::NNUE::initialize_training(params.seed, out);
         Eval::NNUE::set_batch_size(nn_batch_size);
         Eval::NNUE::set_options(nn_options);
 
-        LearnerThink learn_think(
-            filenames,
-            !no_shuffle,
-            thread_num,
-            seed,
-            sfen_read_size,
-            thread_buffer_size);
-
-        if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
-            // Save the current net to [EvalSaveDir]\original.
-            Eval::NNUE::save_eval("original");
-
-            // Set the folder above to best_nn_directory so that the trainer can
-            // resotre the network parameters from the original net file.
-            learn_think.best_nn_directory =
-                Path::combine(Options["EvalSaveDir"], "original");
-        }
-
-        // Reflect other option settings.
-        learn_think.eval_limit = eval_limit;
-        learn_think.save_only_once = save_only_once;
-        learn_think.reduction_gameply = reduction_gameply;
-
-        learn_think.newbob_decay = newbob_decay;
-        learn_think.newbob_num_trials = newbob_num_trials;
-        learn_think.auto_lr_drop = auto_lr_drop;
-
-        learn_think.eval_save_interval = eval_save_interval;
-        learn_think.loss_output_interval = loss_output_interval;
-
-        learn_think.mini_batch_size = mini_batch_size;
-        learn_think.validation_set_file_name = validation_set_file_name;
-
-        learn_think.verbose = verbose;
+        LearnerThink learn_think(params);
 
         out << "Finished initialization." << endl;
 

From a8066cd4a959ac6c5b1fb18ffd8b16c75572f6ab Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 23:22:47 +0200
Subject: [PATCH 277/398] Rename elmo lambdas

---
 src/learn/learn.cpp | 43 +++++++++++++++++++------------------------
 1 file changed, 19 insertions(+), 24 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index e9eb1141..c03e425c 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -87,6 +87,14 @@ namespace Learner
     static double dest_score_min_value = 0.0;
     static double dest_score_max_value = 1.0;
 
+    // A constant used in elmo (WCSC27). Adjustment required.
+    // Since elmo does not internally divide the expression, the value is different.
+    // You can set this value with the learn command.
+    // 0.33 is equivalent to the constant (0.5) used in elmo (WCSC27)
+    static double elmo_lambda_low = 1.0;
+    static double elmo_lambda_high = 1.0;
+    static double elmo_lambda_limit = 32000;
+
     // Using stockfish's WDL with win rate model instead of sigmoid
     static bool use_wdl = false;
 
@@ -239,14 +247,6 @@ namespace Learner
         return ((y2 - y1) / epsilon) / winning_probability_coefficient;
     }
 
-    // A constant used in elmo (WCSC27). Adjustment required.
-    // Since elmo does not internally divide the expression, the value is different.
-    // You can set this value with the learn command.
-    // 0.33 is equivalent to the constant (0.5) used in elmo (WCSC27)
-    double ELMO_LAMBDA = 0.33;
-    double ELMO_LAMBDA2 = 0.33;
-    double ELMO_LAMBDA_LIMIT = 32000;
-
     // Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
     double get_scaled_signal(double signal)
     {
@@ -274,12 +274,12 @@ namespace Learner
 
     double calculate_lambda(double teacher_signal)
     {
-        // If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT
-        // then apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
+        // If the evaluation value in deep search exceeds elmo_lambda_limit
+        // then apply elmo_lambda_high instead of elmo_lambda_low.
         const double lambda =
-            (std::abs(teacher_signal) >= ELMO_LAMBDA_LIMIT)
-            ? ELMO_LAMBDA2
-            : ELMO_LAMBDA;
+            (std::abs(teacher_signal) >= elmo_lambda_limit)
+            ? elmo_lambda_high
+            : elmo_lambda_low;
 
         return lambda;
     }
@@ -964,11 +964,6 @@ namespace Learner
 
         global_learning_rate = 1.0;
 
-        // elmo lambda
-        ELMO_LAMBDA = 1.0;
-        ELMO_LAMBDA2 = 1.0;
-        ELMO_LAMBDA_LIMIT = 32000;
-
         uint64_t nn_batch_size = 1000;
         string nn_options;
 
@@ -1034,9 +1029,9 @@ namespace Learner
 
 
             // LAMBDA
-            else if (option == "lambda") is >> ELMO_LAMBDA;
-            else if (option == "lambda2") is >> ELMO_LAMBDA2;
-            else if (option == "lambda_limit") is >> ELMO_LAMBDA_LIMIT;
+            else if (option == "lambda") is >> elmo_lambda_low;
+            else if (option == "lambda2") is >> elmo_lambda_high;
+            else if (option == "lambda_limit") is >> elmo_lambda_limit;
 
             else if (option == "reduction_gameply") is >> params.reduction_gameply;
 
@@ -1136,9 +1131,9 @@ namespace Learner
 
         out << "  - reduction_gameply        : " << params.reduction_gameply << endl;
 
-        out << "  - LAMBDA                   : " << ELMO_LAMBDA << endl;
-        out << "  - LAMBDA2                  : " << ELMO_LAMBDA2 << endl;
-        out << "  - LAMBDA_LIMIT             : " << ELMO_LAMBDA_LIMIT << endl;
+        out << "  - elmo_lambda_low          : " << elmo_lambda_low << endl;
+        out << "  - elmo_lambda_high         : " << elmo_lambda_high << endl;
+        out << "  - elmo_lambda_limit        : " << elmo_lambda_limit << endl;
         out << "  - eval_save_interval       : " << params.eval_save_interval << " sfens" << endl;
         out << "  - loss_output_interval     : " << params.loss_output_interval << " sfens" << endl;
 

From c229929d266e1e8f4354742223e7b1121b0b8dc2 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 22:58:09 +0200
Subject: [PATCH 278/398] Remove the position parameter from learn.

---
 src/learn/learn.cpp | 2 +-
 src/learn/learn.h   | 2 +-
 src/uci.cpp         | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index c03e425c..90e6cb0f 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -951,7 +951,7 @@ namespace Learner
     }
 
     // Learning from the generated game record
-    void learn(Position&, istringstream& is)
+    void learn(istringstream& is)
     {
         LearnerThink::Params params;
 
diff --git a/src/learn/learn.h b/src/learn/learn.h
index 5efeb516..008ca7af 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -67,7 +67,7 @@ namespace Learner
     double calc_grad(Value shallow, const PackedSfenValue& psv);
 
     // Learning from the generated game record
-    void learn(Position& pos, std::istringstream& is);
+    void learn(std::istringstream& is);
 }
 
 #endif // ifndef _LEARN_H_
diff --git a/src/uci.cpp b/src/uci.cpp
index dbef05bf..e6b45c02 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -339,7 +339,7 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "compiler") sync_cout << compiler_info() << sync_endl;
 
       else if (token == "gensfen") Learner::gensfen(is);
-      else if (token == "learn") Learner::learn(pos, is);
+      else if (token == "learn") Learner::learn(is);
       else if (token == "convert") Learner::convert(is);
       else if (token == "convert_bin") Learner::convert_bin(is);
       else if (token == "convert_plain") Learner::convert_plain(is);

From e4868cb59e83baf3a3950ae043bd5d04c75acc2f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 23:17:56 +0200
Subject: [PATCH 279/398] Move setting learn search limits to learner.

---
 src/learn/learn.cpp | 53 +++++++++++++++++++++++----------------------
 1 file changed, 27 insertions(+), 26 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 90e6cb0f..5bb41213 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -467,6 +467,8 @@ namespace Learner
         void learn(uint64_t epochs);
 
     private:
+        static void set_learning_search_limits();
+
         void learn_worker(Thread& th, std::atomic<uint64_t>& counter, uint64_t limit);
 
         void update_weights(const PSVector& psv, uint64_t epoch);
@@ -510,13 +512,37 @@ namespace Learner
         AtomicLoss learn_loss_sum;
     };
 
+    void LearnerThink::set_learning_search_limits()
+    {
+        Threads.main()->ponder = false;
+
+        // About Search::Limits
+        // Be careful because this member variable is global and affects other threads.
+        auto& limits = Search::Limits;
+
+        limits.startTime = now();
+
+        // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
+        limits.infinite = true;
+
+        // Since PV is an obstacle when displayed, erase it.
+        limits.silent = true;
+
+        // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
+        limits.nodes = 0;
+
+        // depth is also processed by the one passed as an argument of Learner::search().
+        limits.depth = 0;
+    }
+
     void LearnerThink::learn(uint64_t epochs)
     {
-
 #if defined(_OPENMP)
         omp_set_num_threads((int)Options["Threads"]);
 #endif
 
+        set_learning_search_limits();
+
         Eval::NNUE::verify_any_net_loaded();
 
         const PSVector sfen_for_mse =
@@ -929,27 +955,6 @@ namespace Learner
         return false;
     }
 
-    static void set_learning_search_limits()
-    {
-        // About Search::Limits
-        // Be careful because this member variable is global and affects other threads.
-        auto& limits = Search::Limits;
-
-        limits.startTime = now();
-
-        // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
-        limits.infinite = true;
-
-        // Since PV is an obstacle when displayed, erase it.
-        limits.silent = true;
-
-        // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
-        limits.nodes = 0;
-
-        // depth is also processed by the one passed as an argument of Learner::search().
-        limits.depth = 0;
-    }
-
     // Learning from the generated game record
     void learn(istringstream& is)
     {
@@ -1159,10 +1164,6 @@ namespace Learner
 
         out << "INFO: Started initialization." << endl;
 
-        Threads.main()->ponder = false;
-
-        set_learning_search_limits();
-
         Eval::NNUE::initialize_training(params.seed, out);
         Eval::NNUE::set_batch_size(nn_batch_size);
         Eval::NNUE::set_options(nn_options);

From cde6ec2bf26d46dedf4547580f6e45e34d8b1ab4 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 23:29:32 +0200
Subject: [PATCH 280/398] Make all grad related functions in learn static. Pass
 calc_grad as a parameter.

---
 src/learn/learn.cpp                | 40 +++++++++++++-----------------
 src/learn/learn.h                  |  4 +--
 src/nnue/evaluate_nnue_learner.cpp |  7 +++---
 src/nnue/evaluate_nnue_learner.h   |  2 +-
 4 files changed, 23 insertions(+), 30 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 5bb41213..b0f77e89 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -185,7 +185,7 @@ namespace Learner
     }
 
     // A function that converts the evaluation value to the winning rate [0,1]
-    double winning_percentage(double value)
+    static double winning_percentage(double value)
     {
         // 1/(1+10^(-Eval/4))
         // = 1/(1+e^(-Eval/4*ln(10))
@@ -194,7 +194,7 @@ namespace Learner
     }
 
     // A function that converts the evaluation value to the winning rate [0,1]
-    double winning_percentage_wdl(double value, int ply)
+    static double winning_percentage_wdl(double value, int ply)
     {
         constexpr double wdl_total = 1000.0;
         constexpr double draw_score = 0.5;
@@ -207,7 +207,7 @@ namespace Learner
     }
 
     // A function that converts the evaluation value to the winning rate [0,1]
-    double winning_percentage(double value, int ply)
+    static double winning_percentage(double value, int ply)
     {
         if (use_wdl)
         {
@@ -219,7 +219,7 @@ namespace Learner
         }
     }
 
-    double calc_cross_entropy_of_winning_percentage(
+    static double calc_cross_entropy_of_winning_percentage(
         double deep_win_rate,
         double shallow_eval,
         int ply)
@@ -229,7 +229,7 @@ namespace Learner
         return -p * std::log(q) - (1.0 - p) * std::log(1.0 - q);
     }
 
-    double calc_d_cross_entropy_of_winning_percentage(
+    static double calc_d_cross_entropy_of_winning_percentage(
         double deep_win_rate,
         double shallow_eval,
         int ply)
@@ -248,7 +248,7 @@ namespace Learner
     }
 
     // Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
-    double get_scaled_signal(double signal)
+    static double get_scaled_signal(double signal)
     {
         double scaled_signal = signal;
 
@@ -266,13 +266,13 @@ namespace Learner
     }
 
     // Teacher winning probability.
-    double calculate_p(double teacher_signal, int ply)
+    static double calculate_p(double teacher_signal, int ply)
     {
         const double scaled_teacher_signal = get_scaled_signal(teacher_signal);
         return winning_percentage(scaled_teacher_signal, ply);
     }
 
-    double calculate_lambda(double teacher_signal)
+    static double calculate_lambda(double teacher_signal)
     {
         // If the evaluation value in deep search exceeds elmo_lambda_limit
         // then apply elmo_lambda_high instead of elmo_lambda_low.
@@ -284,7 +284,7 @@ namespace Learner
         return lambda;
     }
 
-    double calculate_t(int game_result)
+    static double calculate_t(int game_result)
     {
         // Use 1 as the correction term if the expected win rate is 1,
         // 0 if you lose, and 0.5 if you draw.
@@ -294,20 +294,20 @@ namespace Learner
         return t;
     }
 
-    double calc_grad(Value teacher_signal, Value shallow, const PackedSfenValue& psv)
+    static double calc_grad(Value shallow, Value teacher_signal, int result, int ply)
     {
         // elmo (WCSC27) method
         // Correct with the actual game wins and losses.
-        const double q = winning_percentage(shallow, psv.gamePly);
-        const double p = calculate_p(teacher_signal, psv.gamePly);
-        const double t = calculate_t(psv.game_result);
+        const double q = winning_percentage(shallow, ply);
+        const double p = calculate_p(teacher_signal, ply);
+        const double t = calculate_t(result);
         const double lambda = calculate_lambda(teacher_signal);
 
         double grad;
         if (use_wdl)
         {
-            const double dce_p = calc_d_cross_entropy_of_winning_percentage(p, shallow, psv.gamePly);
-            const double dce_t = calc_d_cross_entropy_of_winning_percentage(t, shallow, psv.gamePly);
+            const double dce_p = calc_d_cross_entropy_of_winning_percentage(p, shallow, ply);
+            const double dce_t = calc_d_cross_entropy_of_winning_percentage(t, shallow, ply);
             grad = lambda * dce_p + (1.0 - lambda) * dce_t;
         }
         else
@@ -324,7 +324,7 @@ namespace Learner
     // The individual cross entropy of the win/loss term and win
     // rate term of the elmo expression is returned
     // to the arguments cross_entropy_eval and cross_entropy_win.
-    Loss calc_cross_entropy(
+    static Loss calc_cross_entropy(
         Value teacher_signal,
         Value shallow,
         const PackedSfenValue& psv)
@@ -360,12 +360,6 @@ namespace Learner
         return loss;
     }
 
-    // Other objective functions may be considered in the future...
-    double calc_grad(Value shallow, const PackedSfenValue& psv)
-    {
-        return calc_grad((Value)psv.score, shallow, psv);
-    }
-
     // Class to generate sfen with multiple threads
     struct LearnerThink
     {
@@ -703,7 +697,7 @@ namespace Learner
         // should be no real issues happening since
         // the read/write phases are isolated.
         atomic_thread_fence(memory_order_seq_cst);
-        Eval::NNUE::update_parameters(epoch, params.verbose);
+        Eval::NNUE::update_parameters(epoch, params.verbose, calc_grad);
         atomic_thread_fence(memory_order_seq_cst);
 
         if (++save_count * params.mini_batch_size >= params.eval_save_interval)
diff --git a/src/learn/learn.h b/src/learn/learn.h
index 008ca7af..6ce476e5 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -64,10 +64,10 @@ namespace Learner
     // rmse calculation is done in one thread, so it takes some time, so reducing the output is effective.
     constexpr std::size_t LEARN_RMSE_OUTPUT_INTERVAL = 1;
 
-    double calc_grad(Value shallow, const PackedSfenValue& psv);
-
     // Learning from the generated game record
     void learn(std::istringstream& is);
+
+    using CalcGradFunc = double(Value, Value, int, int);
 }
 
 #endif // ifndef _LEARN_H_
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 6775707d..3e91a7de 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -18,8 +18,6 @@
 #include "misc.h"
 #include "thread_win32_osx.h"
 
-#include "learn/learn.h"
-
 // Learning rate scale
 double global_learning_rate;
 
@@ -183,7 +181,7 @@ namespace Eval::NNUE {
     }
 
     // update the evaluation function parameters
-    void update_parameters(uint64_t epoch, bool verbose) {
+    void update_parameters(uint64_t epoch, bool verbose, Learner::CalcGradFunc calc_grad) {
         assert(batch_size > 0);
 
         const auto learning_rate = static_cast<LearnFloatType>(
@@ -210,7 +208,8 @@ namespace Eval::NNUE {
                     batch[b].sign * network_output[b] * kPonanzaConstant));
                 const auto discrete = batch[b].sign * batch[b].discrete_nn_eval;
                 const auto& psv = batch[b].psv;
-                const double gradient = batch[b].sign * Learner::calc_grad(shallow, psv);
+                const double gradient =
+                    batch[b].sign * calc_grad(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
                 gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
 
 
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 91d2aa99..8a9786e5 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -31,7 +31,7 @@ namespace Eval::NNUE {
         double weight);
 
     // update the evaluation function parameters
-    void update_parameters(uint64_t epoch, bool verbose);
+    void update_parameters(uint64_t epoch, bool verbose, Learner::CalcGradFunc calc_grad);
 
     // Check if there are any problems with learning
     void check_health();

From f81fa3d7127a21d58853192fd59fad5a12589ec1 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 23:35:34 +0200
Subject: [PATCH 281/398] Replace global_learning_rate with learning_rate local
 to the learner and passed to update_parameters as a parameter.

---
 src/learn/learn.cpp                | 22 ++++++++++------------
 src/nnue/evaluate_nnue_learner.cpp | 13 +++++++------
 src/nnue/evaluate_nnue_learner.h   |  6 +++++-
 3 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index b0f77e89..6cd54b13 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -56,8 +56,6 @@
 #include <omp.h>
 #endif
 
-extern double global_learning_rate;
-
 using namespace std;
 
 template <typename T>
@@ -399,6 +397,8 @@ namespace Learner
             bool use_draw_games_in_validation = true;
             bool skip_duplicated_positions_in_training = true;
 
+            double learning_rate = 1.0;
+
             string validation_set_file_name;
             string seed;
 
@@ -697,7 +697,7 @@ namespace Learner
         // should be no real issues happening since
         // the read/write phases are isolated.
         atomic_thread_fence(memory_order_seq_cst);
-        Eval::NNUE::update_parameters(epoch, params.verbose, calc_grad);
+        Eval::NNUE::update_parameters(epoch, params.verbose, params.learning_rate, calc_grad);
         atomic_thread_fence(memory_order_seq_cst);
 
         if (++save_count * params.mini_batch_size >= params.eval_save_interval)
@@ -737,7 +737,7 @@ namespace Learner
              << ", epoch " << epoch
              << endl;
 
-        out << "  - learning rate = " << global_learning_rate << endl;
+        out << "  - learning rate = " << params.learning_rate << endl;
 
         // For calculation of verification data loss
         AtomicLoss test_loss_sum{};
@@ -913,7 +913,7 @@ namespace Learner
                     if (tot >= last_lr_drop + params.auto_lr_drop)
                     {
                         last_lr_drop = tot;
-                        global_learning_rate *= params.newbob_decay;
+                        params.learning_rate *= params.newbob_decay;
                     }
                 }
                 else if (latest_loss < best_loss)
@@ -929,11 +929,11 @@ namespace Learner
                     if (--trials > 0 && !is_final)
                     {
                         cout
-                            << "  - reducing learning rate from " << global_learning_rate
-                            << " to " << (global_learning_rate * params.newbob_decay)
+                            << "  - reducing learning rate from " << params.learning_rate
+                            << " to " << (params.learning_rate * params.newbob_decay)
                             << " (" << trials << " more trials)" << endl;
 
-                        global_learning_rate *= params.newbob_decay;
+                        params.learning_rate *= params.newbob_decay;
                     }
                 }
 
@@ -961,8 +961,6 @@ namespace Learner
         string base_dir;
         string target_dir;
 
-        global_learning_rate = 1.0;
-
         uint64_t nn_batch_size = 1000;
         string nn_options;
 
@@ -1003,7 +1001,7 @@ namespace Learner
             else if (option == "batchsize") is >> params.mini_batch_size;
 
             // learning rate
-            else if (option == "lr") is >> global_learning_rate;
+            else if (option == "lr") is >> params.learning_rate;
 
             // Accept also the old option name.
             else if (option == "use_draw_in_training"
@@ -1115,7 +1113,7 @@ namespace Learner
         out << "  - nn_batch_size            : " << nn_batch_size << endl;
         out << "  - nn_options               : " << nn_options << endl;
 
-        out << "  - learning rate            : " << global_learning_rate << endl;
+        out << "  - learning rate            : " << params.learning_rate << endl;
         out << "  - use draws in training    : " << params.use_draw_games_in_training << endl;
         out << "  - use draws in validation  : " << params.use_draw_games_in_validation << endl;
         out << "  - skip repeated positions  : " << params.skip_duplicated_positions_in_training << endl;
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 3e91a7de..2a1fd6cb 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -18,9 +18,6 @@
 #include "misc.h"
 #include "thread_win32_osx.h"
 
-// Learning rate scale
-double global_learning_rate;
-
 // Code for learning NNUE evaluation function
 namespace Eval::NNUE {
 
@@ -181,11 +178,15 @@ namespace Eval::NNUE {
     }
 
     // update the evaluation function parameters
-    void update_parameters(uint64_t epoch, bool verbose, Learner::CalcGradFunc calc_grad) {
+    void update_parameters(
+        uint64_t epoch,
+        bool verbose,
+        double learning_rate,
+        Learner::CalcGradFunc calc_grad)
+    {
         assert(batch_size > 0);
 
-        const auto learning_rate = static_cast<LearnFloatType>(
-            global_learning_rate / batch_size);
+        learning_rate /= batch_size;
 
         std::lock_guard<std::mutex> lock(examples_mutex);
         std::shuffle(examples.begin(), examples.end(), rng);
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 8a9786e5..d350691b 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -31,7 +31,11 @@ namespace Eval::NNUE {
         double weight);
 
     // update the evaluation function parameters
-    void update_parameters(uint64_t epoch, bool verbose, Learner::CalcGradFunc calc_grad);
+    void update_parameters(
+        uint64_t epoch,
+        bool verbose,
+        double learning_rate,
+        Learner::CalcGradFunc calc_grad);
 
     // Check if there are any problems with learning
     void check_health();

From 680654b254dc2c6357199825c9399998d6bfd777 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 27 Oct 2020 20:58:58 +0100
Subject: [PATCH 282/398] Add dots to output every epoch for progress
 visualization.

---
 src/nnue/evaluate_nnue_learner.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 2a1fd6cb..a97b45c7 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -244,6 +244,10 @@ namespace Eval::NNUE {
                 << " , batch_size = " << batch_size
                 << " , grad_norm = " << gradient_norm
                 << std::endl;
+        } else {
+            // Display some progress but don't synchronize as
+            // we can't really decide when to release the output lock here
+            std::cout << '.';
         }
 
         send_messages({{"quantize_parameters"}});

From bde3505758417c6cd77f2e09edac5bbd5f58b570 Mon Sep 17 00:00:00 2001
From: FauziAkram <fauzi.dabat@hotmail.com>
Date: Sat, 24 Oct 2020 02:01:04 +0300
Subject: [PATCH 283/398] Bishop Pawns based on Files

Passed STC:
https://tests.stockfishchess.org/tests/view/5f8cc8145a4eacb45305da3c
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 132544 W: 27795 L: 27328 D: 77421
Ptnml(0-2): 2756, 15558, 29272, 15835, 2851

Passed LTC:
https://tests.stockfishchess.org/tests/view/5f8df614bacb75a4f9a4721e
LLR: 2.94 (-2.94,2.94) {0.25,1.25}
Total: 169608 W: 23257 L: 22622 D: 123729
Ptnml(0-2): 1408, 16316, 48758, 16877, 1445

closes https://github.com/official-stockfish/Stockfish/pull/3194

Bench: 4067106
---
 src/evaluate.cpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 425ba6f8..030d1017 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -222,6 +222,12 @@ namespace {
       S(112,178), S(114,185), S(114,187), S(119,221) }
   };
 
+  // BishopPawns[distance from edge] contains a file-dependent penalty for pawns on
+  // squares of the same color as our bishop.
+  constexpr Score BishopPawns[int(FILE_NB) / 2] = {
+    S(3, 8), S(3, 9), S(1, 8), S(3, 7)
+  };
+
   // KingProtector[knight/bishop] contains penalty for each distance unit to own king
   constexpr Score KingProtector[] = { S(8, 9), S(6, 9) };
 
@@ -252,7 +258,6 @@ namespace {
   // Assorted bonuses and penalties
   constexpr Score BadOutpost          = S( -7, 36);
   constexpr Score BishopOnKingRing    = S( 24,  0);
-  constexpr Score BishopPawns         = S(  3,  7);
   constexpr Score BishopXRayPawns     = S(  4,  5);
   constexpr Score CorneredBishop      = S( 50, 50);
   constexpr Score FlankAttacks        = S(  8,  0);
@@ -453,7 +458,7 @@ namespace {
                 // when the bishop is outside the pawn chain.
                 Bitboard blocked = pos.pieces(Us, PAWN) & shift<Down>(pos.pieces());
 
-                score -= BishopPawns * pos.pawns_on_same_color_squares(Us, s)
+                score -= BishopPawns[edge_distance(file_of(s))] * pos.pawns_on_same_color_squares(Us, s)
                                      * (!(attackedBy[Us][PAWN] & s) + popcount(blocked & CenterFiles));
 
                 // Penalty for all enemy pawns x-rayed
@@ -906,7 +911,7 @@ namespace {
                                                         : pos.count<BISHOP>(WHITE) + pos.count<KNIGHT>(WHITE));
         else
             sf = std::min(sf, 36 + 7 * pos.count<PAWN>(strongSide)) - 4 * !pawnsOnBothFlanks;
-      
+
         sf -= 4 * !pawnsOnBothFlanks;
     }
 

From 6328135264d3b13a2cef3f0c835a27192cae0f40 Mon Sep 17 00:00:00 2001
From: SFisGOD <jonathandumale@gmail.com>
Date: Wed, 28 Oct 2020 04:24:55 +0800
Subject: [PATCH 284/398] Update default net to nn-2eb2e0707c2b.nnue

Optimization of the net weights of the 32 x 32 layer (1024 parameters) and net biases of the 512 x 32 layer (32 parameters) using SPSA.

Tuning of 32 x 32 Layer (800,000 games, 5 seconds time control):
https://tests.stockfishchess.org/tests/view/5f942040d3978d7e86f1aa05

Tuning of 512 x 32 Layer (80,000 games, 20 seconds time control):
https://tests.stockfishchess.org/tests/view/5f8f926d2c92c7fe3a8c608b

STC:
LLR: 2.96 (-2.94,2.94) {-0.25,1.25}
Total: 17336 W: 1918 L: 1754 D: 13664
Ptnml(0-2): 79, 1344, 5672, 1480, 93
https://tests.stockfishchess.org/tests/view/5f9882346a2c112b60691b34

LTC:
LLR: 2.94 (-2.94,2.94) {0.25,1.25}
Total: 37304 W: 1822 L: 1651 D: 33831
Ptnml(0-2): 27, 1461, 15501, 1640, 23
https://tests.stockfishchess.org/tests/view/5f98a4b36a2c112b60691b40

closes https://github.com/official-stockfish/Stockfish/pull/3201

Bench: 3403528
---
 src/evaluate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/evaluate.h b/src/evaluate.h
index 6a8603ad..6e5db6a3 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -36,7 +36,7 @@ namespace Eval {
   // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work. Do not change the
   // name of the macro, as it is used in the Makefile.
-  #define EvalFileDefaultName   "nn-eba324f53044.nnue"
+  #define EvalFileDefaultName   "nn-2eb2e0707c2b.nnue"
 
   namespace NNUE {
 

From 317fda251602ceb5af90b9134539f28210392184 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 23:56:29 +0200
Subject: [PATCH 285/398] Cleanup eval saving and lr scheduling.

---
 src/learn/learn.cpp                | 136 ++++++++++++++++++-----------
 src/nnue/evaluate_nnue_learner.cpp |   2 +-
 2 files changed, 87 insertions(+), 51 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 6cd54b13..93262b42 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -456,6 +456,8 @@ namespace Learner
             latest_loss_sum = 0.0;
             latest_loss_count = 0;
             total_done = 0;
+            trials = params.newbob_num_trials;
+            dir_number = 0;
         }
 
         void learn(uint64_t epochs);
@@ -480,6 +482,8 @@ namespace Learner
 
         Value get_shallow_value(Position& pos);
 
+        bool check_progress();
+
         // save merit function parameters to a file
         bool save(bool is_final = false);
 
@@ -502,6 +506,9 @@ namespace Learner
         double latest_loss_sum;
         uint64_t latest_loss_count;
 
+        int trials;
+        int dir_number;
+
         // For calculation of learning data loss
         AtomicLoss learn_loss_sum;
     };
@@ -873,12 +880,84 @@ namespace Learner
         return shallow_value;
     }
 
+    bool LearnerThink::check_progress()
+    {
+        auto out = sync_region_cout.new_region();
+
+        const double latest_loss = latest_loss_sum / latest_loss_count;
+        bool converged = false;
+        latest_loss_sum = 0.0;
+        latest_loss_count = 0;
+
+        auto drop_lr = [&]() {
+            last_lr_drop = total_done;
+
+            out
+                << "  - reducing learning rate from " << params.learning_rate
+                << " to " << (params.learning_rate * params.newbob_decay)
+                << " (" << trials << " more trials)" << endl;
+
+            params.learning_rate *= params.newbob_decay;
+        };
+
+        auto accept = [&]() {
+            out << "  - loss = " << latest_loss << " < best (" << best_loss << "), accepted" << endl;
+
+            best_loss = latest_loss;
+            trials = params.newbob_num_trials;
+        };
+
+        auto reject = [&]() {
+            out << "  - loss = " << latest_loss << " >= best (" << best_loss << "), rejected" << endl;
+
+            --trials;
+            if (trials > 0)
+            {
+                drop_lr();
+                return false;
+            }
+            else
+            {
+                return true;
+            }
+        };
+
+        out << "INFO (learning_rate):" << endl;
+
+        if (params.auto_lr_drop)
+        {
+            accept();
+
+            if (total_done >= last_lr_drop + params.auto_lr_drop)
+            {
+                drop_lr();
+            }
+        }
+        else if (latest_loss < best_loss)
+        {
+            accept();
+        }
+        else
+        {
+            converged = reject();
+        }
+
+        if (converged)
+        {
+            out << "  - converged" << endl;
+        }
+
+        return converged;
+    }
+
     // Write evaluation function file.
     bool LearnerThink::save(bool is_final)
     {
         // Each time you save, change the extension part of the file name like "0","1","2",..
         // (Because I want to compare the winning rate for each evaluation function parameter later)
 
+        bool converged = false;
+
         if (params.save_only_once)
         {
             // When EVAL_SAVE_ONLY_ONCE is defined,
@@ -888,65 +967,22 @@ namespace Learner
         else if (is_final)
         {
             Eval::NNUE::save_eval("final");
-            return true;
+            converged = true;
         }
         else
         {
-            static int dir_number = 0;
+            // TODO: consider naming the output directory by epoch.
             const std::string dir_name = std::to_string(dir_number++);
             Eval::NNUE::save_eval(dir_name);
 
-            if (params.newbob_decay != 1.0 && latest_loss_count > 0) {
-                static int trials = params.newbob_num_trials;
-                const double latest_loss = latest_loss_sum / latest_loss_count;
-                latest_loss_sum = 0.0;
-                latest_loss_count = 0;
-                cout << "INFO (learning_rate):" << endl;
-                cout << "  - loss = " << latest_loss;
-                auto tot = total_done;
-                if (params.auto_lr_drop)
-                {
-                    cout << " < best (" << best_loss << "), accepted" << endl;
-                    best_loss = latest_loss;
-                    trials = params.newbob_num_trials;
-
-                    if (tot >= last_lr_drop + params.auto_lr_drop)
-                    {
-                        last_lr_drop = tot;
-                        params.learning_rate *= params.newbob_decay;
-                    }
-                }
-                else if (latest_loss < best_loss)
-                {
-                    cout << " < best (" << best_loss << "), accepted" << endl;
-                    best_loss = latest_loss;
-                    trials = params.newbob_num_trials;
-                }
-                else
-                {
-                    cout << " >= best (" << best_loss << "), rejected" << endl;
-
-                    if (--trials > 0 && !is_final)
-                    {
-                        cout
-                            << "  - reducing learning rate from " << params.learning_rate
-                            << " to " << (params.learning_rate * params.newbob_decay)
-                            << " (" << trials << " more trials)" << endl;
-
-                        params.learning_rate *= params.newbob_decay;
-                    }
-                }
-
+            if (params.newbob_decay != 1.0 && latest_loss_count > 0)
+            {
+                converged = check_progress();
                 params.best_nn_directory = Path::combine((std::string)Options["EvalSaveDir"], dir_name);
-
-                if (trials == 0)
-                {
-                    cout << "  - converged" << endl;
-                    return true;
-                }
             }
         }
-        return false;
+
+        return converged;
     }
 
     // Learning from the generated game record
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index a97b45c7..0cd61a41 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -280,6 +280,6 @@ namespace Eval::NNUE {
 #ifndef NDEBUG
         assert(result);
 #endif
-        out << "INFO (save_eval): Saving current evaluation file in " << eval_dir << std::endl;
+        out << "INFO (save_eval): Finished saving evaluation file in " << eval_dir << std::endl;
     }
 }  // namespace Eval::NNUE
\ No newline at end of file

From 0f6c08c73f516873b312cb8fce0d824a2167b075 Mon Sep 17 00:00:00 2001
From: syzygy1 <3028851+syzygy1@users.noreply.github.com>
Date: Tue, 27 Oct 2020 19:22:41 +0100
Subject: [PATCH 286/398] Do not skip non-recapture ttMove when in check

The qsearch() MovePicker incorrectly skips a non-recapture ttMove
when in check (if depth <= DEPTH_QS_RECAPTURES). This is clearly not
intended and can cause qsearch() to return a mate score when there
is no mate. Introduced in cad300c and 6596f0e, as observed by
joergoster in #3171 and #3198.

This PR fixes the bug by not skipping the non-recapture ttMove when in check.

Passed non-regression STC:
https://tests.stockfishchess.org/tests/view/5f9867ea6a2c112b60691b10
LLR: 2.98 (-2.94,2.94) {-1.25,0.25}
Total: 27112 W: 2943 L: 2842 D: 21327
Ptnml(0-2): 127, 2170, 8878, 2237, 144

Passed non-regression LTC:
https://tests.stockfishchess.org/tests/view/5f9967326a2c112b60691bb0
LLR: 2.99 (-2.94,2.94) {-0.75,0.25}
Total: 18392 W: 807 L: 738 D: 16847
Ptnml(0-2): 9, 655, 7802, 718, 12

closes https://github.com/official-stockfish/Stockfish/pull/3199
closes https://github.com/official-stockfish/Stockfish/pull/3198

Bench: 3870606
---
 src/movepick.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/movepick.cpp b/src/movepick.cpp
index 153d323e..f5e02385 100644
--- a/src/movepick.cpp
+++ b/src/movepick.cpp
@@ -73,8 +73,9 @@ MovePicker::MovePicker(const Position& p, Move ttm, Depth d, const ButterflyHist
   assert(d <= 0);
 
   stage = (pos.checkers() ? EVASION_TT : QSEARCH_TT) +
-           !(ttm && (depth > DEPTH_QS_RECAPTURES || to_sq(ttm) == recaptureSquare)
-                 && pos.pseudo_legal(ttm));
+          !(   ttm
+            && (pos.checkers() || depth > DEPTH_QS_RECAPTURES || to_sq(ttm) == recaptureSquare)
+            && pos.pseudo_legal(ttm));
 }
 
 /// MovePicker constructor for ProbCut: we generate captures with SEE greater

From ec9e49e875f06c450d1511964886cd2df17c72ca Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 28 Oct 2020 15:24:29 +0100
Subject: [PATCH 287/398] Add a HalfKA architecture (a product of K - king, and
 A - any piece) along with all required infrastructure. HalfKA doesn't
 discriminate kings compared to HalfKP. Keep old architecture as the default
 one.

---
 src/Makefile                                  |  3 +
 src/nnue/architectures/halfka_256x2-32-32.h   | 54 +++++++++++
 src/nnue/evaluate_nnue_learner.cpp            |  1 +
 src/nnue/features/a.cpp                       | 50 ++++++++++
 src/nnue/features/a.h                         | 54 +++++++++++
 src/nnue/features/half_ka.cpp                 | 89 ++++++++++++++++++
 src/nnue/features/half_ka.h                   | 75 +++++++++++++++
 src/nnue/features/half_relative_ka.cpp        | 86 +++++++++++++++++
 src/nnue/features/half_relative_ka.h          | 68 ++++++++++++++
 .../trainer/features/factorizer_half_ka.h     | 93 +++++++++++++++++++
 10 files changed, 573 insertions(+)
 create mode 100644 src/nnue/architectures/halfka_256x2-32-32.h
 create mode 100644 src/nnue/features/a.cpp
 create mode 100644 src/nnue/features/a.h
 create mode 100644 src/nnue/features/half_ka.cpp
 create mode 100644 src/nnue/features/half_ka.h
 create mode 100644 src/nnue/features/half_relative_ka.cpp
 create mode 100644 src/nnue/features/half_relative_ka.h
 create mode 100644 src/nnue/trainer/features/factorizer_half_ka.h

diff --git a/src/Makefile b/src/Makefile
index f2c4d269..45d27ef2 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -50,9 +50,12 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	nnue/evaluate_nnue.cpp \
 	nnue/evaluate_nnue_learner.cpp \
 	nnue/features/half_kp.cpp \
+	nnue/features/half_ka.cpp \
 	nnue/features/half_relative_kp.cpp \
+	nnue/features/half_relative_ka.cpp \
 	nnue/features/k.cpp \
 	nnue/features/p.cpp \
+	nnue/features/a.cpp \
 	nnue/features/castling_right.cpp \
 	nnue/features/enpassant.cpp \
 	nnue/nnue_test_command.cpp \
diff --git a/src/nnue/architectures/halfka_256x2-32-32.h b/src/nnue/architectures/halfka_256x2-32-32.h
new file mode 100644
index 00000000..c108ef5d
--- /dev/null
+++ b/src/nnue/architectures/halfka_256x2-32-32.h
@@ -0,0 +1,54 @@
+/*
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// Definition of input features and network structure used in NNUE evaluation function
+
+#ifndef NNUE_HALFKA_256X2_32_32_H_INCLUDED
+#define NNUE_HALFKA_256X2_32_32_H_INCLUDED
+
+#include "nnue/features/feature_set.h"
+#include "nnue/features/half_ka.h"
+
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"
+
+namespace Eval::NNUE {
+
+    // Input features used in evaluation function
+    using RawFeatures = Features::FeatureSet<
+        Features::HalfKA<Features::Side::kFriend>>;
+
+    // Number of input feature dimensions after conversion
+    constexpr IndexType kTransformedFeatureDimensions = 256;
+
+    namespace Layers {
+
+        // Define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+
+    }  // namespace Layers
+
+    using Network = Layers::OutputLayer;
+
+}  // namespace Eval::NNUE
+
+#endif // #ifndef NNUE_HALFA_256X2_32_32_H_INCLUDED
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 0cd61a41..4de939c5 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -7,6 +7,7 @@
 
 #include "trainer/features/factorizer_feature_set.h"
 #include "trainer/features/factorizer_half_kp.h"
+#include "trainer/features/factorizer_half_ka.h"
 #include "trainer/trainer_feature_transformer.h"
 #include "trainer/trainer_input_slice.h"
 #include "trainer/trainer_affine_transform.h"
diff --git a/src/nnue/features/a.cpp b/src/nnue/features/a.cpp
new file mode 100644
index 00000000..6ceb4efa
--- /dev/null
+++ b/src/nnue/features/a.cpp
@@ -0,0 +1,50 @@
+﻿#include "a.h"
+#include "index_list.h"
+
+// Definition of input feature A of NNUE evaluation function
+namespace Eval::NNUE::Features {
+
+    // Orient a square according to perspective (flip rank for black)
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+    }
+
+    // Find the index of the feature quantity from the king position and PieceSquare
+    inline IndexType A::make_index(
+        Color perspective, Square s, Piece pc) {
+        return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
+    }
+
+    // Get a list of indices with a value of 1 among the features
+    void A::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
+
+        Bitboard bb = pos.pieces();
+        while (bb) {
+            Square s = pop_lsb(&bb);
+            active->push_back(make_index(perspective, s, pos.piece_on(s)));
+        }
+    }
+
+    // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+    void A::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
+
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];
+
+            if (dp.from[i] != SQ_NONE)
+              removed->push_back(make_index(perspective, dp.from[i], pc));
+
+            if (dp.to[i] != SQ_NONE)
+              added->push_back(make_index(perspective, dp.to[i], pc));
+        }
+    }
+
+}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/a.h b/src/nnue/features/a.h
new file mode 100644
index 00000000..50a0d8be
--- /dev/null
+++ b/src/nnue/features/a.h
@@ -0,0 +1,54 @@
+﻿#ifndef _NNUE_FEATURES_A_H_
+#define _NNUE_FEATURES_A_H_
+
+#include "features_common.h"
+
+#include "evaluate.h"
+
+// Definition of input feature A of NNUE evaluation function
+// A is a union of P features and K features, so technically the
+// same effect can be achieved by including both P and K features
+// but it would result in slower index appending because
+// P would conditionally exclude K features and vice versa,
+// where A doesn't have any conditionals.
+namespace Eval::NNUE::Features {
+
+    // Feature P: PieceSquare of pieces other than balls
+    class A {
+    public:
+        // feature quantity name
+        static constexpr const char* kName = "A";
+
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue = 0x7A4C414Cu;
+
+        // number of feature dimensions
+        static constexpr IndexType kDimensions = PS_END2;
+
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions = 32;
+
+        // Timing of full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+
+        // Get a list of indices with a value of 1 among the features
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
+
+        // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+
+    private:
+        // Index of a feature for a given piece on some square
+        static IndexType make_index(Color perspective, Square s, Piece pc);
+    };
+
+}  // namespace Eval::NNUE::Features
+
+#endif // #ifndef _NNUE_FEATURES_UNION_P_K_H_
diff --git a/src/nnue/features/half_ka.cpp b/src/nnue/features/half_ka.cpp
new file mode 100644
index 00000000..83e59067
--- /dev/null
+++ b/src/nnue/features/half_ka.cpp
@@ -0,0 +1,89 @@
+/*
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+//Definition of input features HalfKA of NNUE evaluation function
+
+#include "half_ka.h"
+#include "index_list.h"
+
+namespace Eval::NNUE::Features {
+
+    // Orient a square according to perspective (flip rank for black)
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+    }
+
+    // Find the index of the feature quantity from the king position and PieceSquare
+    template <Side AssociatedKing>
+    inline IndexType HalfKA<AssociatedKing>::make_index(
+        Color perspective,
+        Square s,
+        Piece pc,
+        Square ksq) {
+
+        return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective] + PS_END2 * ksq);
+    }
+
+    // Get a list of indices for active features
+    template <Side AssociatedKing>
+    void HalfKA<AssociatedKing>::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
+
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));
+
+        Bitboard bb = pos.pieces();
+        while (bb) {
+            Square s = pop_lsb(&bb);
+            active->push_back(make_index(perspective, s, pos.piece_on(s), ksq));
+        }
+    }
+
+    // Get a list of indices for recently changed features
+    template <Side AssociatedKing>
+    void HalfKA<AssociatedKing>::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
+
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));
+
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];
+
+            if (dp.from[i] != SQ_NONE)
+                removed->push_back(make_index(perspective, dp.from[i], pc, ksq));
+
+            if (dp.to[i] != SQ_NONE)
+                added->push_back(make_index(perspective, dp.to[i], pc, ksq));
+        }
+    }
+
+    template class HalfKA<Side::kFriend>;
+    template class HalfKA<Side::kEnemy>;
+
+}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/half_ka.h b/src/nnue/features/half_ka.h
new file mode 100644
index 00000000..2839357e
--- /dev/null
+++ b/src/nnue/features/half_ka.h
@@ -0,0 +1,75 @@
+/*
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef NNUE_FEATURES_HALF_KA_H_INCLUDED
+#define NNUE_FEATURES_HALF_KA_H_INCLUDED
+
+#include "features_common.h"
+
+#include "evaluate.h"
+
+//Definition of input features HalfKPK of NNUE evaluation function
+namespace Eval::NNUE::Features {
+
+    // Feature HalfKPK: Combination of the position of own king
+    // and the position of pieces other than kings
+    template <Side AssociatedKing>
+    class HalfKA {
+
+    public:
+        // Feature name
+        static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
+            "HalfKA(Friend)" : "HalfKA(Enemy)";
+
+        // Hash value embedded in the evaluation file
+        static constexpr std::uint32_t kHashValue =
+            0x5F134CB9u ^ (AssociatedKing == Side::kFriend);
+
+        // Number of feature dimensions
+        static constexpr IndexType kDimensions =
+            static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_END2);
+
+        // Maximum number of simultaneously active features
+        static constexpr IndexType kMaxActiveDimensions = 32;
+
+        // Trigger for full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger =
+            (AssociatedKing == Side::kFriend) ?
+            TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+
+        // Get a list of indices for active features
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
+
+        // Get a list of indices for recently changed features
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+
+    private:
+        // Index of a feature for a given king position and another piece on some square
+        static IndexType make_index(Color perspective, Square s, Piece pc, Square sq_k);
+    };
+
+}  // namespace Eval::NNUE::Features
+
+#endif // #ifndef NNUE_FEATURES_HALF_KA_H_INCLUDED
diff --git a/src/nnue/features/half_relative_ka.cpp b/src/nnue/features/half_relative_ka.cpp
new file mode 100644
index 00000000..ba3edbcf
--- /dev/null
+++ b/src/nnue/features/half_relative_ka.cpp
@@ -0,0 +1,86 @@
+﻿#include "half_relative_ka.h"
+#include "index_list.h"
+
+//Definition of input features HalfRelativeKA of NNUE evaluation function
+namespace Eval::NNUE::Features {
+
+    // Orient a square according to perspective (flip rank for black)
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+    }
+
+    // Find the index of the feature quantity from the ball position and PieceSquare
+    template <Side AssociatedKing>
+    inline IndexType HalfRelativeKA<AssociatedKing>::make_index(
+        Color perspective,
+        Square s,
+        Piece pc,
+        Square sq_k) {
+
+        const IndexType p = IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
+        return make_index(sq_k, p);
+    }
+
+    // Find the index of the feature quantity from the ball position and PieceSquare
+    template <Side AssociatedKing>
+    inline IndexType HalfRelativeKA<AssociatedKing>::make_index(
+        Square sq_k,
+        IndexType p) {
+
+        constexpr IndexType W = kBoardWidth;
+        constexpr IndexType H = kBoardHeight;
+        const IndexType piece_index = (p - PS_W_PAWN) / SQUARE_NB;
+        const Square sq_p = static_cast<Square>((p - PS_W_PAWN) % SQUARE_NB);
+        const IndexType relative_file = file_of(sq_p) - file_of(sq_k) + (W / 2);
+        const IndexType relative_rank = rank_of(sq_p) - rank_of(sq_k) + (H / 2);
+        return H * W * piece_index + H * relative_file + relative_rank;
+    }
+
+    // Get a list of indices with a value of 1 among the features
+    template <Side AssociatedKing>
+    void HalfRelativeKA<AssociatedKing>::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
+
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));
+
+        Bitboard bb = pos.pieces();
+        while (bb) {
+            Square s = pop_lsb(&bb);
+            active->push_back(make_index(perspective, s, pos.piece_on(s), ksq));
+        }
+    }
+
+    // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+    template <Side AssociatedKing>
+    void HalfRelativeKA<AssociatedKing>::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
+
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));
+
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];
+
+            if (dp.from[i] != SQ_NONE)
+                removed->push_back(make_index(perspective, dp.from[i], pc, ksq));
+
+            if (dp.to[i] != SQ_NONE)
+                added->push_back(make_index(perspective, dp.to[i], pc, ksq));
+        }
+    }
+
+    template class HalfRelativeKA<Side::kFriend>;
+    template class HalfRelativeKA<Side::kEnemy>;
+
+}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/half_relative_ka.h b/src/nnue/features/half_relative_ka.h
new file mode 100644
index 00000000..f42661e9
--- /dev/null
+++ b/src/nnue/features/half_relative_ka.h
@@ -0,0 +1,68 @@
+﻿#ifndef _NNUE_FEATURES_HALF_RELATIVE_KA_H_
+#define _NNUE_FEATURES_HALF_RELATIVE_KA_H_
+
+#include "features_common.h"
+
+#include "evaluate.h"
+
+// Definition of input features HalfRelativeKA of NNUE evaluation function
+// K - King
+// A - Any piece
+// KA - product of K and A
+namespace Eval::NNUE::Features {
+
+    // Feature HalfRelativeKA: Relative position of each piece other than the ball based on own ball or enemy ball
+    template <Side AssociatedKing>
+    class HalfRelativeKA {
+    public:
+        // feature quantity name
+        static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
+            "HalfRelativeKA(Friend)" : "HalfRelativeKA(Enemy)";
+
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue =
+            0xA123051Fu ^ (AssociatedKing == Side::kFriend);
+
+        static constexpr IndexType kNumPieceKinds = 6 * 2;
+
+        // width of the virtual board with the ball in the center
+        static constexpr IndexType kBoardWidth = FILE_NB * 2 - 1;
+
+        // height of a virtual board with balls in the center
+        static constexpr IndexType kBoardHeight = RANK_NB * 2 - 1;
+
+        // number of feature dimensions
+        static constexpr IndexType kDimensions =
+            kNumPieceKinds * kBoardHeight * kBoardWidth;
+
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions = 32;
+
+        // Timing of full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger =
+            (AssociatedKing == Side::kFriend) ?
+            TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+
+        // Get a list of indices with a value of 1 among the features
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
+
+        // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+
+        // Find the index of the feature quantity from the ball position and PieceSquare
+        static IndexType make_index(Square s, IndexType p);
+
+        // Find the index of the feature quantity from the ball position and PieceSquare
+        static IndexType make_index(Color perspective, Square s, Piece pc, Square sq_k);
+    };
+
+}  // namespace Eval::NNUE::Features
+
+#endif // #ifndef _NNUE_FEATURES_HALF_RELATIVE_KA_H_
diff --git a/src/nnue/trainer/features/factorizer_half_ka.h b/src/nnue/trainer/features/factorizer_half_ka.h
new file mode 100644
index 00000000..90bd9d97
--- /dev/null
+++ b/src/nnue/trainer/features/factorizer_half_ka.h
@@ -0,0 +1,93 @@
+﻿#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_
+#define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_
+
+#include "factorizer.h"
+
+#include "nnue/features/half_ka.h"
+#include "nnue/features/a.h"
+#include "nnue/features/half_relative_ka.h"
+
+// Specialization of NNUE evaluation function feature conversion class template for HalfKA
+namespace Eval::NNUE::Features {
+
+    // Class template that converts input features into learning features
+    // Specialization for HalfKA
+    template <Side AssociatedKing>
+    class Factorizer<HalfKA<AssociatedKing>> {
+    private:
+        using FeatureType = HalfKA<AssociatedKing>;
+
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions =
+            FeatureType::kMaxActiveDimensions;
+
+        // Type of learning feature
+        enum TrainingFeatureType {
+            kFeaturesHalfKA,
+            kFeaturesA,
+            kFeaturesHalfRelativeKA,
+            kNumTrainingFeatureTypes,
+        };
+
+        // Learning feature information
+        static constexpr FeatureProperties kProperties[] = {
+            // kFeaturesHalfKPK
+            {true, FeatureType::kDimensions},
+            // kFeaturesPK
+            {true, Factorizer<A>::get_dimensions()},
+            // kFeaturesHalfRelativeKPK
+            {true, Factorizer<HalfRelativeKA<AssociatedKing>>::get_dimensions()},
+        };
+
+        static_assert(get_array_length(kProperties) == kNumTrainingFeatureTypes, "");
+
+    public:
+        static constexpr std::string get_name() {
+            return std::string("Factorizer<") + FeatureType::kName + ">";
+        }
+
+        static constexpr std::string get_factorizers_string() {
+            return "  - " + get_name();
+        }
+
+        // Get the dimensionality of the learning feature
+        static constexpr IndexType get_dimensions() {
+            return get_active_dimensions(kProperties);
+        }
+
+        // Get index of learning feature and scale of learning rate
+        static void append_training_features(
+            IndexType base_index, std::vector<TrainingFeature>* training_features) {
+
+            // kFeaturesHalfKPK
+            IndexType index_offset = append_base_feature<FeatureType>(
+                kProperties[kFeaturesHalfKA], base_index, training_features);
+
+            const auto sq_k = static_cast<Square>(base_index / PS_END2);
+            const auto a = static_cast<IndexType>(base_index % PS_END2);
+
+            // kFeaturesPK
+            index_offset += inherit_features_if_required<A>(
+                index_offset, kProperties[kFeaturesA], a, training_features);
+
+            // kFeaturesHalfRelativeKPK
+            if (a >= PS_W_PAWN) {
+                index_offset += inherit_features_if_required<HalfRelativeKA<AssociatedKing>>(
+                    index_offset, kProperties[kFeaturesHalfRelativeKA],
+                    HalfRelativeKA<AssociatedKing>::make_index(sq_k, a),
+                    training_features);
+            }
+            else {
+                index_offset += skip_features(kProperties[kFeaturesHalfRelativeKA]);
+            }
+
+            assert(index_offset == get_dimensions());
+        }
+    };
+
+    template <Side AssociatedKing>
+    constexpr FeatureProperties Factorizer<HalfKA<AssociatedKing>>::kProperties[];
+
+}  // namespace Eval::NNUE::Features
+
+#endif // #ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_

From 8fac468259e9bcd667c9d44cad48fc736b1bb98d Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 27 Oct 2020 19:37:06 +0100
Subject: [PATCH 288/398] Add a cache line aligned allocator.

---
 src/misc.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/src/misc.h b/src/misc.h
index 9f250b6e..be9b4c38 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -563,6 +563,23 @@ public:
   void deallocate(T* p, std::size_t ) { std_aligned_free(p); }
 };
 
+template <typename T>
+class CacheLineAlignedAllocator {
+public:
+    using value_type = T;
+
+    constexpr static uint64_t cache_line_size = 64;
+
+    CacheLineAlignedAllocator() {}
+    CacheLineAlignedAllocator(const CacheLineAlignedAllocator&) {}
+    CacheLineAlignedAllocator(CacheLineAlignedAllocator&&) {}
+
+    template <typename U> CacheLineAlignedAllocator(const CacheLineAlignedAllocator<U>&) {}
+
+    T* allocate(std::size_t n) { return (T*)std_aligned_alloc(cache_line_size, n * sizeof(T)); }
+    void deallocate(T* p, std::size_t) { std_aligned_free(p); }
+};
+
 // --------------------
 //  Dependency Wrapper
 // --------------------

From f1e96cab55a7825a00ce6fdc7cae49ee77adbdd7 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 27 Oct 2020 19:37:19 +0100
Subject: [PATCH 289/398] Align trainer arrays to cache line.

---
 src/nnue/trainer/trainer_affine_transform.h    | 12 ++++++------
 src/nnue/trainer/trainer_clipped_relu.h        |  4 ++--
 src/nnue/trainer/trainer_feature_transformer.h | 10 +++++-----
 src/nnue/trainer/trainer_input_slice.h         |  6 +++---
 src/nnue/trainer/trainer_sum.h                 |  2 +-
 5 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index 3179aeb0..449a0a11 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -353,18 +353,18 @@ namespace Eval::NNUE {
         LayerType* const target_layer_;
 
         // parameter
-        LearnFloatType biases_[kOutputDimensions];
-        LearnFloatType weights_[kOutputDimensions * kInputDimensions];
+        alignas(kCacheLineSize) LearnFloatType biases_[kOutputDimensions];
+        alignas(kCacheLineSize) LearnFloatType weights_[kOutputDimensions * kInputDimensions];
 
         // Buffer used for updating parameters
-        LearnFloatType biases_diff_[kOutputDimensions];
-        LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
+        alignas(kCacheLineSize) LearnFloatType biases_diff_[kOutputDimensions];
+        alignas(kCacheLineSize) LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
 
         // Forward propagation buffer
-        std::vector<LearnFloatType> output_;
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
 
         // buffer for back propagation
-        std::vector<LearnFloatType> gradients_;
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
 
         // hyper parameter
         LearnFloatType momentum_;
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index 57e9bac4..5f2ff065 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -149,10 +149,10 @@ namespace Eval::NNUE {
         LayerType* const target_layer_;
 
         // Forward propagation buffer
-        std::vector<LearnFloatType> output_;
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
 
         // buffer for back propagation
-        std::vector<LearnFloatType> gradients_;
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
 
         // Health check statistics
         LearnFloatType min_activations_[kOutputDimensions];
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 97b19c46..9f0648d2 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -421,11 +421,11 @@ namespace Eval::NNUE {
             LearnFloatType weights_[kHalfDimensions * kInputDimensions];
 
         // Buffer used for updating parameters
-        LearnFloatType biases_diff_[kHalfDimensions];
-        std::vector<LearnFloatType> gradients_;
+        alignas(kCacheLineSize) LearnFloatType biases_diff_[kHalfDimensions];
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
 
         // Forward propagation buffer
-        std::vector<LearnFloatType> output_;
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
 
         // Features that appeared in the training data
         std::bitset<kInputDimensions> observed_features;
@@ -437,8 +437,8 @@ namespace Eval::NNUE {
         // Health check statistics
         LearnFloatType min_pre_activation_;
         LearnFloatType max_pre_activation_;
-        LearnFloatType min_activations_[kHalfDimensions];
-        LearnFloatType max_activations_[kHalfDimensions];
+        alignas(kCacheLineSize) LearnFloatType min_activations_[kHalfDimensions];
+        alignas(kCacheLineSize) LearnFloatType max_activations_[kHalfDimensions];
     };
 
 }  // namespace Eval::NNUE
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index 43968776..9b8e5e13 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -163,7 +163,7 @@ namespace Eval::NNUE {
         const LearnFloatType* output_;
 
         // buffer for back propagation
-        std::vector<LearnFloatType> gradients_;
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
     };
 
     // Learning: Input layer
@@ -256,10 +256,10 @@ namespace Eval::NNUE {
         const std::shared_ptr<SharedInputTrainer> shared_input_trainer_;
 
         // Forward propagation buffer
-        std::vector<LearnFloatType> output_;
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
 
         // buffer for back propagation
-        std::vector<LearnFloatType> gradients_;
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
     };
 
 }  // namespace Eval::NNUE
diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h
index c2e40b1c..b35420d6 100644
--- a/src/nnue/trainer/trainer_sum.h
+++ b/src/nnue/trainer/trainer_sum.h
@@ -184,7 +184,7 @@ namespace Eval::NNUE {
         LayerType* const target_layer_;
 
         // Forward propagation buffer
-        std::vector<LearnFloatType> output_;
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
     };
 
 }  // namespace Eval::NNUE

From ee0917a3459ee90a27cab4b519e571ca4fc22ac1 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 26 Oct 2020 15:06:15 +0100
Subject: [PATCH 290/398] Pass ThreadPool to update_parameters, propagate, and
 backpropagate.

---
 src/learn/learn.cpp                           |  2 +-
 src/nnue/evaluate_nnue_learner.cpp            |  6 +++--
 src/nnue/evaluate_nnue_learner.h              |  3 +++
 src/nnue/trainer/trainer_affine_transform.h   | 11 ++++++----
 src/nnue/trainer/trainer_clipped_relu.h       | 11 ++++++----
 .../trainer/trainer_feature_transformer.h     | 11 ++++++++--
 src/nnue/trainer/trainer_input_slice.h        | 22 +++++++++++--------
 src/nnue/trainer/trainer_sum.h                | 15 ++++++++-----
 8 files changed, 53 insertions(+), 28 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 93262b42..66461cc5 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -704,7 +704,7 @@ namespace Learner
         // should be no real issues happening since
         // the read/write phases are isolated.
         atomic_thread_fence(memory_order_seq_cst);
-        Eval::NNUE::update_parameters(epoch, params.verbose, params.learning_rate, calc_grad);
+        Eval::NNUE::update_parameters(Threads, epoch, params.verbose, params.learning_rate, calc_grad);
         atomic_thread_fence(memory_order_seq_cst);
 
         if (++save_count * params.mini_batch_size >= params.eval_save_interval)
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 4de939c5..6294865d 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -18,6 +18,7 @@
 #include "uci.h"
 #include "misc.h"
 #include "thread_win32_osx.h"
+#include "thread.h"
 
 // Code for learning NNUE evaluation function
 namespace Eval::NNUE {
@@ -180,6 +181,7 @@ namespace Eval::NNUE {
 
     // update the evaluation function parameters
     void update_parameters(
+        ThreadPool& thread_pool,
         uint64_t epoch,
         bool verbose,
         double learning_rate,
@@ -202,7 +204,7 @@ namespace Eval::NNUE {
             std::vector<Example> batch(examples.end() - batch_size, examples.end());
             examples.resize(examples.size() - batch_size);
 
-            const auto network_output = trainer->propagate(batch);
+            const auto network_output = trainer->propagate(thread_pool, batch);
 
             std::vector<LearnFloatType> gradients(batch.size());
             for (std::size_t b = 0; b < batch.size(); ++b) {
@@ -226,7 +228,7 @@ namespace Eval::NNUE {
                 }
             }
 
-            trainer->backpropagate(gradients.data(), learning_rate);
+            trainer->backpropagate(thread_pool, gradients.data(), learning_rate);
 
             collect_stats = false;
         }
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index d350691b..8633f713 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -5,6 +5,8 @@
 
 #include "misc.h"
 
+struct ThreadPool;
+
 // Interface used for learning NNUE evaluation function
 namespace Eval::NNUE {
 
@@ -32,6 +34,7 @@ namespace Eval::NNUE {
 
     // update the evaluation function parameters
     void update_parameters(
+        ThreadPool& thread_pool,
         uint64_t epoch,
         bool verbose,
         double learning_rate,
diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index 449a0a11..5d2f29c9 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -7,6 +7,8 @@
 
 #include "nnue/layers/affine_transform.h"
 
+#include "thread.h"
+
 #include <random>
 
 // Specialization of NNUE evaluation function learning class template for AffineTransform
@@ -88,14 +90,14 @@ namespace Eval::NNUE {
         }
 
         // forward propagation
-        const LearnFloatType* propagate(const std::vector<Example>& batch) {
+        const LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
             if (output_.size() < kOutputDimensions * batch.size()) {
                 output_.resize(kOutputDimensions * batch.size());
                 gradients_.resize(kInputDimensions * batch.size());
             }
 
             batch_size_ = static_cast<IndexType>(batch.size());
-            batch_input_ = previous_layer_trainer_->propagate(batch);
+            batch_input_ = previous_layer_trainer_->propagate(thread_pool, batch);
 #if defined(USE_BLAS)
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
@@ -127,7 +129,8 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void backpropagate(const LearnFloatType* gradients,
+        void backpropagate(ThreadPool& thread_pool,
+                           const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
             const LearnFloatType local_learning_rate =
@@ -211,7 +214,7 @@ namespace Eval::NNUE {
             }
             num_weights_diffs_ += kOutputDimensions * kInputDimensions;
 
-            previous_layer_trainer_->backpropagate(gradients_.data(), learning_rate);
+            previous_layer_trainer_->backpropagate(thread_pool, gradients_.data(), learning_rate);
         }
 
     private:
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index 5f2ff065..8e29e4a1 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -7,6 +7,8 @@
 
 #include "nnue/layers/clipped_relu.h"
 
+#include "thread.h"
+
 // Specialization of NNUE evaluation function learning class template for ClippedReLU
 namespace Eval::NNUE {
 
@@ -41,13 +43,13 @@ namespace Eval::NNUE {
         }
 
         // forward propagation
-        const LearnFloatType* propagate(const std::vector<Example>& batch) {
+        const LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
             if (output_.size() < kOutputDimensions * batch.size()) {
               output_.resize(kOutputDimensions * batch.size());
               gradients_.resize(kInputDimensions * batch.size());
             }
 
-            const auto input = previous_layer_trainer_->propagate(batch);
+            const auto input = previous_layer_trainer_->propagate(thread_pool, batch);
             batch_size_ = static_cast<IndexType>(batch.size());
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
@@ -63,7 +65,8 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void backpropagate(const LearnFloatType* gradients,
+        void backpropagate(ThreadPool& thread_pool,
+                           const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
             for (IndexType b = 0; b < batch_size_; ++b) {
@@ -77,7 +80,7 @@ namespace Eval::NNUE {
             }
             num_total_ += batch_size_ * kOutputDimensions;
 
-            previous_layer_trainer_->backpropagate(gradients_.data(), learning_rate);
+            previous_layer_trainer_->backpropagate(thread_pool, gradients_.data(), learning_rate);
         }
 
     private:
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 9f0648d2..a778f956 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -9,6 +9,8 @@
 
 #include "nnue/nnue_feature_transformer.h"
 
+#include "thread.h"
+
 #include <array>
 #include <bitset>
 #include <numeric>
@@ -90,12 +92,14 @@ namespace Eval::NNUE {
         }
 
         // forward propagation
-        const LearnFloatType* propagate(const std::vector<Example>& batch) {
+        const LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
             if (output_.size() < kOutputDimensions * batch.size()) {
                 output_.resize(kOutputDimensions * batch.size());
                 gradients_.resize(kOutputDimensions * batch.size());
             }
 
+            (void)thread_pool;
+
             batch_ = &batch;
             // affine transform
 #pragma omp parallel for
@@ -143,9 +147,12 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void backpropagate(const LearnFloatType* gradients,
+        void backpropagate(ThreadPool& thread_pool,
+                           const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
+            (void)thread_pool;
+
             const LearnFloatType local_learning_rate =
                 learning_rate * learning_rate_scale_;
 
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index 9b8e5e13..4bb38104 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -7,6 +7,8 @@
 
 #include "nnue/layers/input_slice.h"
 
+#include "thread.h"
+
 // Specialization of NNUE evaluation function learning class template for InputSlice
 namespace Eval::NNUE {
 
@@ -60,7 +62,7 @@ namespace Eval::NNUE {
         }
 
         // forward propagation
-        const LearnFloatType* propagate(const std::vector<Example>& batch) {
+        const LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
             if (gradients_.size() < kInputDimensions * batch.size()) {
                 gradients_.resize(kInputDimensions * batch.size());
             }
@@ -69,7 +71,7 @@ namespace Eval::NNUE {
 
             if (num_calls_ == 0) {
                 current_operation_ = Operation::kPropagate;
-                output_ = feature_transformer_trainer_->propagate(batch);
+                output_ = feature_transformer_trainer_->propagate(thread_pool, batch);
             }
 
             assert(current_operation_ == Operation::kPropagate);
@@ -83,11 +85,12 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void backpropagate(const LearnFloatType* gradients,
+        void backpropagate(ThreadPool& thread_pool,
+                           const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
             if (num_referrers_ == 1) {
-                feature_transformer_trainer_->backpropagate(gradients, learning_rate);
+                feature_transformer_trainer_->backpropagate(thread_pool, gradients, learning_rate);
                 return;
             }
 
@@ -112,7 +115,7 @@ namespace Eval::NNUE {
 
             if (++num_calls_ == num_referrers_) {
                 feature_transformer_trainer_->backpropagate(
-                    gradients_.data(), learning_rate);
+                    thread_pool, gradients_.data(), learning_rate);
                 num_calls_ = 0;
                 current_operation_ = Operation::kNone;
             }
@@ -193,7 +196,7 @@ namespace Eval::NNUE {
         }
 
         // forward propagation
-        const LearnFloatType* propagate(const std::vector<Example>& batch) {
+        const LearnFloatType* propagate(ThreadPool& thread_pool,const std::vector<Example>& batch) {
             if (output_.size() < kOutputDimensions * batch.size()) {
               output_.resize(kOutputDimensions * batch.size());
               gradients_.resize(kInputDimensions * batch.size());
@@ -201,7 +204,7 @@ namespace Eval::NNUE {
 
             batch_size_ = static_cast<IndexType>(batch.size());
 
-            const auto input = shared_input_trainer_->propagate(batch);
+            const auto input = shared_input_trainer_->propagate(thread_pool, batch);
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType input_offset = kInputDimensions * b;
                 const IndexType output_offset = kOutputDimensions * b;
@@ -219,7 +222,8 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void backpropagate(const LearnFloatType* gradients,
+        void backpropagate(ThreadPool& thread_pool,
+                           const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
             for (IndexType b = 0; b < batch_size_; ++b) {
@@ -233,7 +237,7 @@ namespace Eval::NNUE {
                     }
                 }
             }
-            shared_input_trainer_->backpropagate(gradients_.data(), learning_rate);
+            shared_input_trainer_->backpropagate(thread_pool, gradients_.data(), learning_rate);
         }
 
     private:
diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h
index b35420d6..6defb95f 100644
--- a/src/nnue/trainer/trainer_sum.h
+++ b/src/nnue/trainer/trainer_sum.h
@@ -7,6 +7,8 @@
 
 #include "nnue/layers/sum.h"
 
+#include "thread.h"
+
 // Specialization of NNUE evaluation function learning class template for Sum
 namespace Eval::NNUE {
 
@@ -45,10 +47,10 @@ namespace Eval::NNUE {
         }
 
         // forward propagation
-        /*const*/ LearnFloatType* propagate(const std::vector<Example>& batch) {
+        /*const*/ LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
             batch_size_ = static_cast<IndexType>(batch.size());
-            auto output = Tail::propagate(batch);
-            const auto head_output = previous_layer_trainer_->propagate(batch);
+            auto output = Tail::propagate(thread_pool, batch);
+            const auto head_output = previous_layer_trainer_->propagate(thread_pool, batch);
 
 #if defined(USE_BLAS)
             cblas_saxpy(kOutputDimensions * batch_size_, 1.0,
@@ -66,11 +68,12 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void backpropagate(const LearnFloatType* gradients,
+        void backpropagate(ThreadPool& thread_pool,
+                           const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
-            Tail::backpropagate(gradients, learning_rate);
-            previous_layer_trainer_->backpropagate(gradients, learning_rate);
+            Tail::backpropagate(thread_pool, gradients, learning_rate);
+            previous_layer_trainer_->backpropagate(thread_pool, gradients, learning_rate);
         }
 
     private:

From c56a4a36eb92e8fd32b8923a52896352465f93b0 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 28 Oct 2020 14:41:51 +0100
Subject: [PATCH 291/398] Add our own blas-like routines that use stockfish's
 thread pool for parallelization.

---
 src/Makefile                 |    1 +
 src/extra/stockfish_blas.cpp | 1033 ++++++++++++++++++++++++++++++++++
 src/extra/stockfish_blas.h   |  130 +++++
 src/thread.h                 |   29 +
 src/uci.cpp                  |    9 +
 5 files changed, 1202 insertions(+)
 create mode 100644 src/extra/stockfish_blas.cpp
 create mode 100644 src/extra/stockfish_blas.h

diff --git a/src/Makefile b/src/Makefile
index 45d27ef2..cba4e351 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -47,6 +47,7 @@ PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 output_file_name $(PGO_TRAINING_
 SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
 	material.cpp misc.cpp movegen.cpp movepick.cpp pawns.cpp position.cpp psqt.cpp \
 	search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
+	extra/stockfish_blas.cpp \
 	nnue/evaluate_nnue.cpp \
 	nnue/evaluate_nnue_learner.cpp \
 	nnue/features/half_kp.cpp \
diff --git a/src/extra/stockfish_blas.cpp b/src/extra/stockfish_blas.cpp
new file mode 100644
index 00000000..0ba40b49
--- /dev/null
+++ b/src/extra/stockfish_blas.cpp
@@ -0,0 +1,1033 @@
+#include "stockfish_blas.h"
+
+#include "thread.h"
+
+#include <cstring>
+#include <random>
+#include <iostream>
+#include <vector>
+#include <algorithm>
+#include <cmath>
+#include <atomic>
+#include <chrono>
+
+#if defined(USE_SSE2)
+#include <xmmintrin.h>
+#endif
+
+#if defined (USE_SSE3)
+#include <pmmintrin.h>
+#endif
+
+#if defined(USE_BLAS)
+#include <cblas.h>
+#endif
+
+namespace Blas {
+    void scopy(
+        const int N,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    )
+    {
+        std::memcpy(Y, X, sizeof(float) * N);
+    }
+
+    void scopy(
+        const int N,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    )
+    {
+        if (incX == 1 && incY == 1)
+        {
+            scopy(N, X, Y);
+        }
+        else
+        {
+            for(int i = 0; i < N; ++i)
+            {
+                *Y = *X;
+                X += incX;
+                Y += incY;
+            }
+        }
+    }
+
+    void scopy(
+        ThreadPool&,
+        const int N,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    )
+    {
+        scopy(N, X, Y);
+    }
+
+    void scopy(
+        ThreadPool&,
+        const int N,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    )
+    {
+        scopy(N, X, incX, Y, incY);
+    }
+
+    void sscal(
+        const int N,
+        const float alpha,
+        float * SF_BLAS_RESTRICT X
+    )
+    {
+#if defined (USE_SSE2)
+
+        const __m128 alpha4 = _mm_set1_ps(alpha);
+
+        int i = 0;
+        for(; i < N - 31; i += 32)
+        {
+            __m128 x0 = _mm_loadu_ps(X + i +  0);
+            __m128 x1 = _mm_loadu_ps(X + i +  4);
+            __m128 x2 = _mm_loadu_ps(X + i +  8);
+            __m128 x3 = _mm_loadu_ps(X + i + 12);
+            __m128 x4 = _mm_loadu_ps(X + i + 16);
+            __m128 x5 = _mm_loadu_ps(X + i + 20);
+            __m128 x6 = _mm_loadu_ps(X + i + 24);
+            __m128 x7 = _mm_loadu_ps(X + i + 28);
+
+            x0 = _mm_mul_ps(x0, alpha4);
+            x1 = _mm_mul_ps(x1, alpha4);
+            x2 = _mm_mul_ps(x2, alpha4);
+            x3 = _mm_mul_ps(x3, alpha4);
+            x4 = _mm_mul_ps(x4, alpha4);
+            x5 = _mm_mul_ps(x5, alpha4);
+            x6 = _mm_mul_ps(x6, alpha4);
+            x7 = _mm_mul_ps(x7, alpha4);
+
+            _mm_storeu_ps(X + i +  0, x0);
+            _mm_storeu_ps(X + i +  4, x1);
+            _mm_storeu_ps(X + i +  8, x2);
+            _mm_storeu_ps(X + i + 12, x3);
+            _mm_storeu_ps(X + i + 16, x4);
+            _mm_storeu_ps(X + i + 20, x5);
+            _mm_storeu_ps(X + i + 24, x6);
+            _mm_storeu_ps(X + i + 28, x7);
+        }
+
+        for(; i < N; ++i)
+        {
+            X[i] *= alpha;
+        }
+
+#else
+
+        for(int i = 0; i < N; ++i)
+        {
+            X[i] *= alpha;
+        }
+
+#endif
+    }
+
+    void sscal(
+        const int N,
+        const float alpha,
+        float * SF_BLAS_RESTRICT X, const int incX
+    )
+    {
+        if (incX == 1)
+        {
+            sscal(N, alpha, X);
+        }
+        else
+        {
+            for(int i = 0; i < N; ++i)
+            {
+                *X *= alpha;
+                X += incX;
+            }
+        }
+    }
+
+    void sscal(
+        ThreadPool&,
+        const int N,
+        const float alpha,
+        float * SF_BLAS_RESTRICT X
+    )
+    {
+        sscal(N, alpha, X);
+    }
+
+    void sscal(
+        ThreadPool&,
+        const int N,
+        const float alpha,
+        float *X, const int incX
+    )
+    {
+        sscal(N, alpha, X, incX);
+    }
+
+    void saxpy(
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    )
+    {
+
+#if defined (USE_SSE2)
+
+        const __m128 alpha4 = _mm_set1_ps(alpha);
+
+        int i = 0;
+        for(; i < N - 15; i += 16)
+        {
+            __m128 x0 = _mm_loadu_ps(X + i +  0);
+            __m128 x1 = _mm_loadu_ps(X + i +  4);
+            __m128 x2 = _mm_loadu_ps(X + i +  8);
+            __m128 x3 = _mm_loadu_ps(X + i + 12);
+
+            __m128 y0 = _mm_loadu_ps(Y + i +  0);
+            __m128 y1 = _mm_loadu_ps(Y + i +  4);
+            __m128 y2 = _mm_loadu_ps(Y + i +  8);
+            __m128 y3 = _mm_loadu_ps(Y + i + 12);
+
+            x0 = _mm_mul_ps(x0, alpha4);
+            x1 = _mm_mul_ps(x1, alpha4);
+            x2 = _mm_mul_ps(x2, alpha4);
+            x3 = _mm_mul_ps(x3, alpha4);
+
+            x0 = _mm_add_ps(x0, y0);
+            x1 = _mm_add_ps(x1, y1);
+            x2 = _mm_add_ps(x2, y2);
+            x3 = _mm_add_ps(x3, y3);
+
+            _mm_storeu_ps(Y + i +  0, x0);
+            _mm_storeu_ps(Y + i +  4, x1);
+            _mm_storeu_ps(Y + i +  8, x2);
+            _mm_storeu_ps(Y + i + 12, x3);
+        }
+
+        for(; i < N; ++i)
+        {
+            Y[i] += X[i] * alpha;
+        }
+
+#else
+
+        for(int i = 0; i < N; ++i)
+        {
+            Y[i] += X[i] * alpha;
+        }
+
+#endif
+
+    }
+
+    void saxpy(
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    )
+    {
+        if (incX == 1 && incY == 1)
+        {
+            saxpy(N, alpha, X, Y);
+        }
+        else
+        {
+            for(int i = 0; i < N; ++i)
+            {
+                *Y += *X * alpha;
+                Y += incY;
+                X += incX;
+            }
+        }
+    }
+
+    void saxpy(
+        ThreadPool&,
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    )
+    {
+        saxpy(N, alpha, X, Y);
+    }
+
+    void saxpy(
+        ThreadPool&,
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    )
+    {
+        saxpy(N, alpha, X, incX, Y, incY);
+    }
+
+#if defined (USE_SSE3)
+    inline __m128 m128_hadd_ps(__m128 a, __m128 b, __m128 c, __m128 d)
+    {
+        const __m128 t0 = _mm_hadd_ps(a, b);
+        const __m128 t1 = _mm_hadd_ps(c, d);
+        return _mm_hadd_ps(t0, t1);
+    }
+#endif
+
+#if defined (USE_SSE2)
+
+    inline void transpose4x4_sse2(
+        const float* SF_BLAS_RESTRICT A, const int lda,
+        float* SF_BLAS_RESTRICT B, const int ldb
+    )
+    {
+        __m128 row1 = _mm_loadu_ps(&A[0 * lda]);
+        __m128 row2 = _mm_loadu_ps(&A[1 * lda]);
+        __m128 row3 = _mm_loadu_ps(&A[2 * lda]);
+        __m128 row4 = _mm_loadu_ps(&A[3 * lda]);
+
+        _MM_TRANSPOSE4_PS(row1, row2, row3, row4);
+
+        _mm_storeu_ps(&B[0 * ldb], row1);
+        _mm_storeu_ps(&B[1 * ldb], row2);
+        _mm_storeu_ps(&B[2 * ldb], row3);
+        _mm_storeu_ps(&B[3 * ldb], row4);
+    }
+
+    void transpose_sse2(
+        const int N, const int M,
+        const float* SF_BLAS_RESTRICT A, const int lda,
+        float* SF_BLAS_RESTRICT B, const int ldb
+    )
+    {
+        static constexpr int block_size = 16;
+
+        for (int n = 0; n < N; n += block_size)
+        {
+            for (int m = 0; m < M; m += block_size)
+            {
+                const int max_n2 = n + block_size < N ? n + block_size : N;
+                const int max_m2 = m + block_size < M ? m + block_size : M;
+
+                int n2 = n;
+                for (; n2 < max_n2 - 3; n2 += 4)
+                {
+                    int m2 = m;
+                    for (; m2 < max_m2 - 3; m2 += 4)
+                    {
+                        transpose4x4_sse2(
+                            &A[n2 * lda + m2], lda,
+                            &B[m2 * ldb + n2], ldb
+                        );
+                    }
+
+                    for (; m2 < max_m2; ++m2)
+                    {
+                        B[m2 * ldb + n2 + 0] = A[(n2 + 0) * lda + m2];
+                        B[m2 * ldb + n2 + 1] = A[(n2 + 1) * lda + m2];
+                        B[m2 * ldb + n2 + 2] = A[(n2 + 2) * lda + m2];
+                        B[m2 * ldb + n2 + 3] = A[(n2 + 3) * lda + m2];
+                    }
+                }
+
+                for (; n2 < max_n2; ++n2)
+                {
+                    for (int m2 = m; m2 < max_m2; ++m2)
+                    {
+                        B[m2 * ldb + n2] = A[n2 * lda + m2];
+                    }
+                }
+            }
+        }
+    }
+#endif
+
+    void transpose(
+        const int N, const int M,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        float* SF_BLAS_RESTRICT B, const int ldb
+    )
+    {
+#if defined (USE_SSE2)
+
+        transpose_sse2(
+            N, M,
+            A, lda,
+            B, ldb
+        );
+
+#else
+
+        for(int r = 0; r < N; ++r)
+        {
+            for (int c = 0; c < M; ++c)
+            {
+                B[c*ldb + r] = A[r*lda + c];
+            }
+        }
+
+#endif
+    }
+
+    void sgemm_row_major_transpose_right(
+        ThreadPool& thread_pool,
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    )
+    {
+
+#if defined(USE_SSE3)
+
+        const __m128 alpha4 = _mm_set1_ps(alpha);
+        const __m128 beta4 = _mm_set1_ps(beta);
+
+        std::atomic<int> m_atomic = 0;
+        thread_pool.execute_with_workers(
+            [
+                M, N, K,
+                alpha, alpha4,
+                A, lda,
+                B, ldb,
+                beta, beta4,
+                C, ldc,
+                &m_atomic
+            ](Thread&) {
+                for (;;)
+                {
+                    const int m = m_atomic.fetch_add(2);
+                    if (m >= M - 1)
+                        break;
+
+                    int n = 0;
+                    for (; n < N - 3; n += 4)
+                    {
+                        //        mn
+                        __m128 sum00 = _mm_setzero_ps();
+                        __m128 sum01 = _mm_setzero_ps();
+                        __m128 sum02 = _mm_setzero_ps();
+                        __m128 sum03 = _mm_setzero_ps();
+                        __m128 sum10 = _mm_setzero_ps();
+                        __m128 sum11 = _mm_setzero_ps();
+                        __m128 sum12 = _mm_setzero_ps();
+                        __m128 sum13 = _mm_setzero_ps();
+
+                        // Horizontal sum of elements in sum[m][n] corresponds to
+                        // the final element in the C.
+
+                        int k = 0;
+                        for (; k < K - 3; k += 4)
+                        {
+                            const __m128 a0 = _mm_loadu_ps(&A[(m+0)*lda+k+0]);
+                            const __m128 a1 = _mm_loadu_ps(&A[(m+1)*lda+k+0]);
+
+                            const __m128 b0 = _mm_loadu_ps(&B[(n+0)*ldb+k+0]);
+                            const __m128 b1 = _mm_loadu_ps(&B[(n+1)*ldb+k+0]);
+                            const __m128 b2 = _mm_loadu_ps(&B[(n+2)*ldb+k+0]);
+                            const __m128 b3 = _mm_loadu_ps(&B[(n+3)*ldb+k+0]);
+
+                            sum00 = _mm_add_ps(sum00, _mm_mul_ps(a0, b0));
+                            sum01 = _mm_add_ps(sum01, _mm_mul_ps(a0, b1));
+                            sum02 = _mm_add_ps(sum02, _mm_mul_ps(a0, b2));
+                            sum03 = _mm_add_ps(sum03, _mm_mul_ps(a0, b3));
+                            sum10 = _mm_add_ps(sum10, _mm_mul_ps(a1, b0));
+                            sum11 = _mm_add_ps(sum11, _mm_mul_ps(a1, b1));
+                            sum12 = _mm_add_ps(sum12, _mm_mul_ps(a1, b2));
+                            sum13 = _mm_add_ps(sum13, _mm_mul_ps(a1, b3));
+                        }
+
+                        for(; k < K; k += 1)
+                        {
+                            const float a0 = A[(m+0)*lda+k+0];
+                            const float a1 = A[(m+1)*lda+k+0];
+
+                            const float b0 = B[(n+0)*ldb+k+0];
+                            const float b1 = B[(n+1)*ldb+k+0];
+                            const float b2 = B[(n+2)*ldb+k+0];
+                            const float b3 = B[(n+3)*ldb+k+0];
+
+                            // Since all will be summed vertically anyway we can
+                            // just add to the first element.
+                            // Other elements are left unmodified.
+                            sum00 = _mm_add_ss(sum00, _mm_set_ss(a0 * b0));
+                            sum01 = _mm_add_ss(sum01, _mm_set_ss(a0 * b1));
+                            sum02 = _mm_add_ss(sum02, _mm_set_ss(a0 * b2));
+                            sum03 = _mm_add_ss(sum03, _mm_set_ss(a0 * b3));
+                            sum10 = _mm_add_ss(sum10, _mm_set_ss(a1 * b0));
+                            sum11 = _mm_add_ss(sum11, _mm_set_ss(a1 * b1));
+                            sum12 = _mm_add_ss(sum12, _mm_set_ss(a1 * b2));
+                            sum13 = _mm_add_ss(sum13, _mm_set_ss(a1 * b3));
+                        }
+
+                        __m128 s0 = m128_hadd_ps(sum00, sum01, sum02, sum03);
+                        __m128 s1 = m128_hadd_ps(sum10, sum11, sum12, sum13);
+                        s0 = _mm_mul_ps(s0, alpha4);
+                        s1 = _mm_mul_ps(s1, alpha4);
+
+                        __m128 c0 = _mm_loadu_ps(&C[(m+0)*ldc+(n+0)]);
+                        __m128 c1 = _mm_loadu_ps(&C[(m+1)*ldc+(n+0)]);
+                        c0 = _mm_mul_ps(c0, beta4);
+                        c1 = _mm_mul_ps(c1, beta4);
+
+                        c0 = _mm_add_ps(c0, s0);
+                        c1 = _mm_add_ps(c1, s1);
+
+                        _mm_storeu_ps(&C[(m+0)*ldc+(n+0)], c0);
+                        _mm_storeu_ps(&C[(m+1)*ldc+(n+0)], c1);
+                    }
+
+                    for(; n < N; n += 1)
+                    {
+                        float sum0 = 0.0f;
+                        float sum1 = 0.0f;
+
+                        for (int k = 0; k < K; ++k)
+                        {
+                            const float a0 = A[(m+0)*lda+k+0];
+                            const float a1 = A[(m+1)*lda+k+0];
+
+                            const float b0 = B[(n+0)*ldb+k+0];
+
+                            sum0 += a0 * b0;
+                            sum1 += a1 * b0;
+                        }
+
+                        C[(m+0)*ldc+(n+0)] = C[(m+0)*ldc+(n+0)] * beta + sum0 * alpha;
+                        C[(m+1)*ldc+(n+0)] = C[(m+1)*ldc+(n+0)] * beta + sum1 * alpha;
+                    }
+                }
+            }
+        );
+
+        int m = M - (M % 2);
+        for (; m < M; m += 1)
+        {
+            for (int n = 0; n < N; n += 1)
+            {
+                float sum = 0.0f;
+
+                for (int k = 0; k < K; k += 1)
+                {
+                    sum += A[m*lda + k] * B[n*ldb + k];
+                }
+
+                C[m*ldc + n] = C[m*ldc + n] * beta + sum * alpha;
+            }
+        }
+
+        thread_pool.wait_for_workers_finished();
+
+#else
+
+        thread_pool.for_each_index_with_workers(
+            0, M,
+            [&](Thread&, int m) {
+                for (int n = 0; n < N; n += 1)
+                {
+                    float sum = 0.0f;
+
+                    for (int k = 0; k < K; k += 1)
+                    {
+                        sum += A[m*lda + k] * B[n*ldb + k];
+                    }
+
+                    C[m*ldc + n] = C[m*ldc + n] * beta + sum * alpha;
+                }
+            }
+        );
+        thread_pool.wait_for_workers_finished();
+
+#endif
+    }
+
+    // The pointer to the storage returned by this function
+    // is valid until the next call to this function from
+    // the same thread with the same idx.
+    // This is an unsafe function and should be used with caution
+    // and only within this translation unit.
+    // The number of buffers available is just enough to make
+    // all functions here work.
+    float* get_thread_local_temporary_storage(
+        int requested_size, int idx
+    )
+    {
+        static constexpr int MAX_NUM_BUFFERS = 2;
+
+        static thread_local int s_data_size[MAX_NUM_BUFFERS] = {0};
+        static thread_local std::unique_ptr<float[]> s_data[MAX_NUM_BUFFERS];
+
+        if (requested_size > s_data_size[idx])
+        {
+            s_data[idx] = std::make_unique<float[]>(requested_size);
+            s_data_size[idx] = requested_size;
+        }
+
+        return s_data[idx].get();
+    }
+
+    void sgemm_row_major_transpose_none(
+        ThreadPool& thread_pool,
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    )
+    {
+        constexpr static int temporary_buffer_index = 1;
+
+        auto B_tr = get_thread_local_temporary_storage(K * N, temporary_buffer_index);
+
+        transpose(
+            K, N,
+            B, ldb,
+            B_tr, K
+        );
+
+        sgemm_row_major_transpose_right(
+            thread_pool,
+            M, N, K,
+            alpha,
+            A, lda,
+            B_tr, K,
+            beta,
+            C, ldc
+        );
+    }
+
+    void sgemm_row_major(
+        ThreadPool& thread_pool,
+        MatrixTranspose TransA, MatrixTranspose TransB,
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    )
+    {
+        constexpr static int temporary_buffer_index = 0;
+
+        if (TransA == MatrixTranspose::Trans && TransB == MatrixTranspose::Trans)
+        {
+            auto A_tr = get_thread_local_temporary_storage(K * M, temporary_buffer_index);
+
+            transpose(
+                K, M,
+                A, lda,
+                A_tr, K
+            );
+
+            sgemm_row_major_transpose_right(
+                thread_pool,
+                M, N, K,
+                alpha,
+                A_tr, K,
+                B, ldb,
+                beta,
+                C, ldc
+            );
+        }
+        else if (TransA == MatrixTranspose::NoTrans && TransB == MatrixTranspose::Trans)
+        {
+            sgemm_row_major_transpose_right(
+                thread_pool,
+                M, N, K,
+                alpha,
+                A, lda,
+                B, ldb,
+                beta,
+                C, ldc
+            );
+        }
+        else if (TransA == MatrixTranspose::Trans && TransB == MatrixTranspose::NoTrans)
+        {
+            auto A_tr = get_thread_local_temporary_storage(K * M, temporary_buffer_index);
+
+            transpose(
+                K, M,
+                A, lda,
+                A_tr, K
+            );
+
+            sgemm_row_major_transpose_none(
+                thread_pool,
+                M, N, K,
+                alpha,
+                A_tr, K,
+                B, ldb,
+                beta,
+                C, ldc
+            );
+        }
+        else // no transpositions
+        {
+            sgemm_row_major_transpose_none(
+                thread_pool,
+                M, N, K,
+                alpha,
+                A, lda,
+                B, ldb,
+                beta,
+                C, ldc
+            );
+        }
+    }
+
+    void sgemm(
+        ThreadPool& thread_pool,
+        MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    )
+    {
+        if (layout == MatrixLayout::RowMajor)
+        {
+            sgemm_row_major(
+                thread_pool,
+                TransA, TransB,
+                M, N, K,
+                alpha,
+                A, lda,
+                B, ldb,
+                beta,
+                C, ldc
+            );
+        }
+        else
+        {
+            sgemm_row_major(
+                thread_pool,
+                TransB, TransA,
+                N, M, K,
+                alpha,
+                B, ldb,
+                A, lda,
+                beta,
+                C, ldc
+            );
+        }
+    }
+
+    std::vector<float> generate_random_matrix(int rows, int cols)
+    {
+        std::vector<float> m(rows * cols);
+
+        std::mt19937_64 rng;
+        std::uniform_real_distribution<float> d(-1.0, 1.0);
+
+        for(auto& v : m)
+        {
+            v = d(rng);
+        }
+
+        return m;
+    }
+
+    std::vector<float> generate_zero_matrix(int rows, int cols)
+    {
+        return std::vector<float>(rows * cols, 0.0f);
+    }
+
+    float matrix_relative_error(
+        const std::vector<float>& ref,
+        const std::vector<float>& our
+    )
+    {
+        double sum = 0.0;
+        double diff_sum = 0.0;
+
+        for(size_t i = 0; i < ref.size(); ++i)
+        {
+            sum += std::abs(ref[i]);
+            diff_sum += std::abs(ref[i] - our[i]);
+        }
+
+        return diff_sum / sum;
+    }
+
+    float norm(
+        const std::vector<float>& v
+    )
+    {
+        double sum = 0.0;
+
+        for(auto& e : v)
+        {
+            sum += e * e;
+        }
+
+        return std::sqrt(sum);
+    }
+
+#if defined (USE_BLAS)
+
+    CBLAS_LAYOUT matrix_layout_to_blas_layout(MatrixLayout layout)
+    {
+        if (layout == MatrixLayout::RowMajor)
+            return CblasRowMajor;
+        else if (layout == MatrixLayout::ColMajor)
+            return CblasColMajor;
+
+        return static_cast<CBLAS_LAYOUT>(-1);
+    }
+
+    const char* matrix_layout_to_string(MatrixLayout layout)
+    {
+        if (layout == MatrixLayout::RowMajor)
+            return "RowMajor";
+        else if (layout == MatrixLayout::ColMajor)
+            return "ColMajor";
+
+        return "INVALID";
+    }
+
+    CBLAS_TRANSPOSE matrix_transpose_to_blas_transpose(MatrixTranspose tr)
+    {
+        if (tr == MatrixTranspose::NoTrans)
+            return CblasNoTrans;
+        else if (tr == MatrixTranspose::Trans)
+            return CblasTrans;
+
+        return static_cast<CBLAS_TRANSPOSE>(-1);
+    }
+
+    const char* matrix_transpose_to_string(MatrixTranspose tr)
+    {
+        if (tr == MatrixTranspose::NoTrans)
+            return "NoTrans";
+        else if (tr == MatrixTranspose::Trans)
+            return "Trans";
+
+        return "INVALID";
+    }
+
+    void test_sgemm(
+        ThreadPool& thread_pool,
+        MatrixLayout layout, MatrixTranspose trA, MatrixTranspose trB,
+        int M, int N, int K
+    )
+    {
+        auto A = generate_random_matrix(M * 2, K * 2);
+        auto B = generate_random_matrix(K * 2, N * 2);
+        auto C_ref = generate_random_matrix(M * 2, N * 2);
+        auto C_our = C_ref;
+
+        std::cout
+            << matrix_layout_to_string(layout) << ' '
+            << matrix_transpose_to_string(trA) << ' '
+            << matrix_transpose_to_string(trB) << '\n';
+
+        std::cout << "A norm: " << norm(A) << '\n';
+        std::cout << "B norm: " << norm(B) << '\n';
+        std::cout << "C norm: " << norm(C_ref) << '\n';
+
+        const int lda = (trA == MatrixTranspose::NoTrans) == (layout == MatrixLayout::RowMajor) ? K * 2 : M * 2;
+        const int ldb = (trB == MatrixTranspose::NoTrans) == (layout == MatrixLayout::RowMajor) ? N * 2 : K * 2;
+        const int ldc = (layout == MatrixLayout::RowMajor) ? N * 2 : M * 2;
+
+        cblas_sgemm(
+            matrix_layout_to_blas_layout(layout),
+            matrix_transpose_to_blas_transpose(trA),
+            matrix_transpose_to_blas_transpose(trB),
+            M, N, K,
+            1.0,
+            A.data(), lda,
+            B.data(), ldb,
+            1.0,
+            C_ref.data(), ldc
+        );
+
+        sgemm(
+            thread_pool,
+            layout, trA, trB,
+            M, N, K,
+            1.0,
+            A.data(), lda,
+            B.data(), ldb,
+            1.0,
+            C_our.data(), ldc
+        );
+
+        std::cout << "C_ref norm: " << norm(C_ref) << '\n';
+        std::cout << "C_our norm: " << norm(C_our) << '\n';
+        std::cout << "Relative error: " << matrix_relative_error(C_ref, C_our) << '\n';
+
+        std::cout << '\n';
+    }
+
+    void test_sgemm(
+        ThreadPool& thread_pool
+    )
+    {
+        constexpr int M = 57;
+        constexpr int N = 127;
+        constexpr int K = 31;
+
+        std::cout << "SGEMM test:\n";
+
+        for(auto layout : { MatrixLayout::RowMajor, MatrixLayout::ColMajor })
+        {
+            for(auto trA : { MatrixTranspose::NoTrans, MatrixTranspose::Trans })
+            {
+                for(auto trB : { MatrixTranspose::NoTrans, MatrixTranspose::Trans })
+                {
+                    test_sgemm(
+                        thread_pool,
+                        layout, trA, trB,
+                        M, N, K
+                    );
+                }
+            }
+        }
+    }
+
+    void bench_sgemm(
+        ThreadPool& thread_pool,
+        MatrixLayout layout, MatrixTranspose trA, MatrixTranspose trB,
+        int M, int N, int K
+    )
+    {
+        constexpr int num_iters = 1000;
+
+        auto A = generate_random_matrix(M * 2, K * 2);
+        auto B = generate_random_matrix(K * 2, N * 2);
+        auto C_ref = generate_random_matrix(M * 2, N * 2);
+        auto C_our = C_ref;
+
+        std::cout
+            << matrix_layout_to_string(layout) << ' '
+            << matrix_transpose_to_string(trA) << ' '
+            << matrix_transpose_to_string(trB) << '\n';
+
+        std::cout << "A norm: " << norm(A) << '\n';
+        std::cout << "B norm: " << norm(B) << '\n';
+        std::cout << "C norm: " << norm(C_ref) << '\n';
+
+        const int lda = (trA == MatrixTranspose::NoTrans) == (layout == MatrixLayout::RowMajor) ? K * 2 : M * 2;
+        const int ldb = (trB == MatrixTranspose::NoTrans) == (layout == MatrixLayout::RowMajor) ? N * 2 : K * 2;
+        const int ldc = (layout == MatrixLayout::RowMajor) ? N * 2 : M * 2;
+
+        auto t0_ref = std::chrono::high_resolution_clock::now();
+        for(int i = 0; i < num_iters; ++i)
+        {
+            cblas_sgemm(
+                matrix_layout_to_blas_layout(layout),
+                matrix_transpose_to_blas_transpose(trA),
+                matrix_transpose_to_blas_transpose(trB),
+                M, N, K,
+                1.0,
+                A.data(), lda,
+                B.data(), ldb,
+                -0.5,
+                C_ref.data(), ldc
+            );
+        }
+        auto t1_ref = std::chrono::high_resolution_clock::now();
+        auto diff_ref = t1_ref - t0_ref;
+
+        auto t0_our = std::chrono::high_resolution_clock::now();
+        for(int i = 0; i < num_iters; ++i)
+        {
+            sgemm(
+                thread_pool,
+                layout, trA, trB,
+                M, N, K,
+                1.0,
+                A.data(), lda,
+                B.data(), ldb,
+                -0.5,
+                C_our.data(), ldc
+            );
+        }
+        auto t1_our = std::chrono::high_resolution_clock::now();
+        auto diff_our = t1_our - t0_our;
+
+        std::cout << "C_ref norm: " << norm(C_ref) << '\n';
+        std::cout << "C_our norm: " << norm(C_our) << '\n';
+        std::cout << "Relative error: " << matrix_relative_error(C_ref, C_our) << '\n';
+        std::cout << "Ref time: " << std::chrono::duration_cast<std::chrono::nanoseconds>(diff_ref).count() << " [ns]\n";
+        std::cout << "Our time: " << std::chrono::duration_cast<std::chrono::nanoseconds>(diff_our).count() << " [ns]\n";
+
+        std::cout << '\n';
+    }
+
+    void bench_sgemm(
+        ThreadPool& thread_pool
+    )
+    {
+        constexpr int M = 107;
+        constexpr int N = 213;
+        constexpr int K = 57;
+
+        std::cout << "SGEMM benchmark:\n";
+
+        for(auto layout : { MatrixLayout::RowMajor, MatrixLayout::ColMajor })
+        {
+            for(auto trA : { MatrixTranspose::NoTrans, MatrixTranspose::Trans })
+            {
+                for(auto trB : { MatrixTranspose::NoTrans, MatrixTranspose::Trans })
+                {
+                    bench_sgemm(
+                        thread_pool,
+                        layout, trA, trB,
+                        M, N, K
+                    );
+                }
+            }
+        }
+    }
+
+#endif
+
+    void print_arch()
+    {
+#if defined (USE_SSE3)
+        std::cout << "Using the sse3 implementation.\n";
+#elif defined (USE_SSE2)
+        std::cout << "Using the sse2 implementation.\n";
+#else
+        std::cout << "Using the base implementation.\n";
+#endif
+    }
+
+    void test(
+        ThreadPool& thread_pool
+    )
+    {
+#if defined (USE_BLAS)
+        print_arch();
+        test_sgemm(thread_pool);
+#else
+        std::cout << "Blas tests are only runnable when USE_BLAS is defined.\n";
+        (void)thread_pool;
+#endif
+    }
+
+    void bench(
+        ThreadPool& thread_pool
+    )
+    {
+#if defined (USE_BLAS)
+        print_arch();
+        bench_sgemm(thread_pool);
+#else
+        std::cout << "Blas benchmarks are only runnable when USE_BLAS is defined.\n";
+        (void)thread_pool;
+#endif
+    }
+}
\ No newline at end of file
diff --git a/src/extra/stockfish_blas.h b/src/extra/stockfish_blas.h
new file mode 100644
index 00000000..65da7e99
--- /dev/null
+++ b/src/extra/stockfish_blas.h
@@ -0,0 +1,130 @@
+#ifndef _STOCKFISH_BLAS_H_
+#define _STOCKFISH_BLAS_H_
+
+struct ThreadPool;
+
+#if defined (_MSC_VER)
+#define SF_BLAS_RESTRICT __restrict
+#elif defined (__INTEL_COMPILER)
+#define SF_BLAS_RESTRICT restrict
+#elif defined (__clang__)
+#define SF_BLAS_RESTRICT __restrict__
+#elif defined (__GNUC__)
+#define SF_BLAS_RESTRICT __restrict__
+#endif
+
+namespace Blas {
+
+    enum struct MatrixLayout {
+        RowMajor = 101,
+        ColMajor = 102
+    };
+
+    enum struct MatrixTranspose {
+        NoTrans = 111,
+        Trans = 112
+    };
+
+    void scopy(
+        const int N,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    );
+
+    void scopy(
+        const int N,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    );
+
+    void scopy(
+        ThreadPool& thread_pool,
+        const int N,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    );
+
+    void scopy(
+        ThreadPool& thread_pool,
+        const int N,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    );
+
+    void sscal(
+        const int N,
+        const float alpha,
+        float * SF_BLAS_RESTRICT X
+    );
+
+    void sscal(
+        const int N,
+        const float alpha,
+        float * SF_BLAS_RESTRICT X, const int incX
+    );
+
+    void sscal(
+        ThreadPool& thread_pool,
+        const int N,
+        const float alpha,
+        float * SF_BLAS_RESTRICT X
+    );
+
+    void sscal(
+        ThreadPool& thread_pool,
+        const int N,
+        const float alpha,
+        float * SF_BLAS_RESTRICT X, const int incX
+    );
+
+    void saxpy(
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    );
+
+    void saxpy(
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    );
+
+    void saxpy(
+        ThreadPool& thread_pool,
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    );
+
+    void saxpy(
+        ThreadPool& thread_pool,
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    );
+
+    void sgemm(
+        ThreadPool& thread_pool,
+        MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    );
+
+    void test(
+        ThreadPool& thread_pool
+    );
+
+    void bench(
+        ThreadPool& thread_pool
+    );
+}
+
+#endif
diff --git a/src/thread.h b/src/thread.h
index c0a01770..3bc00729 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -39,6 +39,15 @@
 /// pointer to an entry its life time is unlimited and we don't have
 /// to care about someone changing the entry under our feet.
 
+namespace Detail {
+
+  template <typename T>
+  struct TypeIdentity {
+    using Type = T;
+  };
+
+}
+
 class Thread {
 
   std::mutex mutex;
@@ -120,6 +129,26 @@ struct ThreadPool : public std::vector<Thread*> {
   // to the state of the `worker` function object.
   void execute_with_workers(const std::function<void(Thread&)>& worker);
 
+  template <typename IndexT, typename FuncT>
+  void for_each_index_with_workers(
+    IndexT begin,
+    typename Detail::TypeIdentity<IndexT>::Type end,
+    FuncT func)
+  {
+    std::atomic<IndexT> i_atomic = begin;
+
+    execute_with_workers(
+      [&i_atomic, end, func](Thread& th) mutable {
+        for(;;) {
+          const auto i = i_atomic.fetch_add(1);
+          if (i >= end)
+            break;
+
+          func(th, i);
+        }
+      });
+  }
+
   void start_thinking(Position&, StateListPtr&, const Search::LimitsType&, bool = false);
   void clear();
   void set(size_t);
diff --git a/src/uci.cpp b/src/uci.cpp
index e6b45c02..ae21a3ae 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -22,6 +22,7 @@
 #include <sstream>
 #include <string>
 
+#include "extra/stockfish_blas.h"
 #include "nnue/evaluate_nnue.h"
 #include "evaluate.h"
 #include "movegen.h"
@@ -354,6 +355,14 @@ void UCI::loop(int argc, char* argv[]) {
           std::cout << th.thread_idx() << '\n';
         });
       }
+      else if (token == "blastest")
+      {
+        Blas::test(Threads);
+      }
+      else if (token == "blasbench")
+      {
+        Blas::bench(Threads);
+      }
 
       // test command
       else if (token == "test") test_cmd(pos, is);

From a56d8124d897ae0704efe66483b9f36b3a0c9203 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 28 Oct 2020 14:52:27 +0100
Subject: [PATCH 292/398] Replace non-blas parts of trainers with our own
 blas-like routines.

---
 src/nnue/trainer/trainer_affine_transform.h   | 166 ++++++++++--------
 .../trainer/trainer_feature_transformer.h     | 120 +++++++++----
 src/nnue/trainer/trainer_input_slice.h        |  20 ++-
 src/nnue/trainer/trainer_sum.h                |  22 ++-
 4 files changed, 207 insertions(+), 121 deletions(-)

diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index 5d2f29c9..610805ca 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -3,6 +3,8 @@
 
 #include "trainer.h"
 
+#include "extra/stockfish_blas.h"
+
 #include "learn/learn.h"
 
 #include "nnue/layers/affine_transform.h"
@@ -98,32 +100,46 @@ namespace Eval::NNUE {
 
             batch_size_ = static_cast<IndexType>(batch.size());
             batch_input_ = previous_layer_trainer_->propagate(thread_pool, batch);
+
 #if defined(USE_BLAS)
+
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
-                cblas_scopy(kOutputDimensions, biases_, 1, &output_[batch_offset], 1);
+                cblas_scopy(
+                    kOutputDimensions, biases_, 1, &output_[batch_offset], 1
+                );
             }
 
-            cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-                        kOutputDimensions, batch_size_, kInputDimensions, 1.0,
-                        weights_, kInputDimensions,
-                        batch_input_, kInputDimensions,
-                        1.0, &output_[0], kOutputDimensions);
+            cblas_sgemm(
+                CblasColMajor, CblasTrans, CblasNoTrans,
+                kOutputDimensions, batch_size_, kInputDimensions,
+                1.0,
+                weights_, kInputDimensions,
+                batch_input_, kInputDimensions,
+                1.0,
+                &output_[0], kOutputDimensions
+            );
 #else
-            for (IndexType b = 0; b < batch_size_; ++b) {
-                const IndexType input_batch_offset = kInputDimensions * b;
-                const IndexType output_batch_offset = kOutputDimensions * b;
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    double sum = biases_[i];
-                    for (IndexType j = 0; j < kInputDimensions; ++j) {
-                        const IndexType index = kInputDimensions * i + j;
-                        sum += weights_[index] * batch_input_[input_batch_offset + j];
-                    }
 
-                    output_[output_batch_offset + i] = static_cast<LearnFloatType>(sum);
-                }
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                Blas::scopy(
+                    thread_pool,
+                    kOutputDimensions, biases_, 1, &output_[batch_offset], 1
+                );
             }
 
+            Blas::sgemm(
+                thread_pool,
+                Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
+                kOutputDimensions, batch_size_, kInputDimensions,
+                1.0,
+                weights_, kInputDimensions,
+                batch_input_, kInputDimensions,
+                1.0,
+                &output_[0], kOutputDimensions
+            );
+
 #endif
             return output_.data();
         }
@@ -137,67 +153,77 @@ namespace Eval::NNUE {
                 learning_rate * learning_rate_scale_;
 
 #if defined(USE_BLAS)
-            // backpropagate
-            cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
-                        kInputDimensions, batch_size_, kOutputDimensions, 1.0,
-                        weights_, kInputDimensions,
-                        gradients, kOutputDimensions,
-                        0.0, &gradients_[0], kInputDimensions);
+
+            cblas_sgemm(
+                CblasColMajor, CblasNoTrans, CblasNoTrans,
+                kInputDimensions, batch_size_, kOutputDimensions,
+                1.0,
+                weights_, kInputDimensions,
+                gradients, kOutputDimensions,
+                0.0,
+                &gradients_[0], kInputDimensions
+            );
 
             // update
-            cblas_sscal(kOutputDimensions, momentum_, biases_diff_, 1);
+            cblas_sscal(
+                kOutputDimensions, momentum_, biases_diff_, 1
+            );
+
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
-                cblas_saxpy(kOutputDimensions, 1.0,
+                cblas_saxpy(
+                    kOutputDimensions, 1.0,
+                    &gradients[batch_offset], 1, biases_diff_, 1
+                );
+            }
+
+            cblas_sgemm(
+                CblasRowMajor, CblasTrans, CblasNoTrans,
+                kOutputDimensions, kInputDimensions, batch_size_,
+                1.0,
+                gradients, kOutputDimensions,
+                batch_input_, kInputDimensions,
+                momentum_,
+                weights_diff_, kInputDimensions
+            );
+
+#else
+
+            // backpropagate
+            Blas::sgemm(
+                thread_pool,
+                Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::NoTrans, Blas::MatrixTranspose::NoTrans,
+                kInputDimensions, batch_size_, kOutputDimensions,
+                1.0,
+                weights_, kInputDimensions,
+                gradients, kOutputDimensions,
+                0.0,
+                &gradients_[0], kInputDimensions
+            );
+
+
+            Blas::sscal(
+                thread_pool,
+                kOutputDimensions, momentum_, biases_diff_, 1
+            );
+
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                Blas::saxpy(thread_pool, kOutputDimensions, 1.0,
                           &gradients[batch_offset], 1, biases_diff_, 1);
             }
 
-            cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
-                        kOutputDimensions, kInputDimensions, batch_size_, 1.0,
-                        gradients, kOutputDimensions,
-                        batch_input_, kInputDimensions,
-                        momentum_, weights_diff_, kInputDimensions);
+            Blas::sgemm(
+                thread_pool,
+                Blas::MatrixLayout::RowMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
+                kOutputDimensions, kInputDimensions, batch_size_,
+                1.0,
+                gradients, kOutputDimensions,
+                batch_input_, kInputDimensions,
+                momentum_,
+                weights_diff_, kInputDimensions
+            );
 
-#else
-            // backpropagate
-            for (IndexType b = 0; b < batch_size_; ++b) {
-                const IndexType input_batch_offset = kInputDimensions * b;
-                const IndexType output_batch_offset = kOutputDimensions * b;
-                for (IndexType j = 0; j < kInputDimensions; ++j) {
-                    double sum = 0.0;
-                    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                        const IndexType index = kInputDimensions * i + j;
-                        sum += weights_[index] * gradients[output_batch_offset + i];
-                    }
-                    gradients_[input_batch_offset + j] = static_cast<LearnFloatType>(sum);
-                }
-            }
-
-            // update
-            for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                biases_diff_[i] *= momentum_;
-            }
-
-            for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-                weights_diff_[i] *= momentum_;
-            }
-
-            for (IndexType b = 0; b < batch_size_; ++b) {
-                const IndexType input_batch_offset = kInputDimensions * b;
-                const IndexType output_batch_offset = kOutputDimensions * b;
-
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    biases_diff_[i] += gradients[output_batch_offset + i];
-                }
-
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    for (IndexType j = 0; j < kInputDimensions; ++j) {
-                        const IndexType index = kInputDimensions * i + j;
-                        weights_diff_[index] += gradients[output_batch_offset + i] *
-                            batch_input_[input_batch_offset + j];
-                    }
-                }
-            }
 #endif
 
             for (IndexType i = 0; i < kOutputDimensions; ++i) {
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index a778f956..8be584e8 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -3,6 +3,8 @@
 
 #include "trainer.h"
 
+#include "extra/stockfish_blas.h"
+
 #include "features/factorizer_feature_set.h"
 
 #include "learn/learn.h"
@@ -107,24 +109,36 @@ namespace Eval::NNUE {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType c = 0; c < 2; ++c) {
                     const IndexType output_offset = batch_offset + kHalfDimensions * c;
+
 #if defined(USE_BLAS)
-                    cblas_scopy(kHalfDimensions, biases_, 1, &output_[output_offset], 1);
+
+                    cblas_scopy(
+                        kHalfDimensions, biases_, 1, &output_[output_offset], 1
+                    );
+
                     for (const auto& feature : batch[b].training_features[c]) {
                         const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                        cblas_saxpy(kHalfDimensions, (float)feature.get_count(),
-                                    &weights_[weights_offset], 1, &output_[output_offset], 1);
+                        cblas_saxpy(
+                            kHalfDimensions, (float)feature.get_count(),
+                            &weights_[weights_offset], 1, &output_[output_offset], 1
+                        );
                     }
+
 #else
-                    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                        output_[output_offset + i] = biases_[i];
-                    }
+
+                    Blas::scopy(
+                        thread_pool,
+                        kHalfDimensions, biases_, 1, &output_[output_offset], 1
+                    );
                     for (const auto& feature : batch[b].training_features[c]) {
                         const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                        for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                            output_[output_offset + i] +=
-                                feature.get_count() * weights_[weights_offset + i];
-                        }
+                        Blas::saxpy(
+                            thread_pool,
+                            kHalfDimensions, (float)feature.get_count(),
+                            &weights_[weights_offset], 1, &output_[output_offset], 1
+                        );
                     }
+
 #endif
                 }
             }
@@ -171,19 +185,27 @@ namespace Eval::NNUE {
             // Correct the learning rate and adjust the scale without using momentum
             const LearnFloatType effective_learning_rate =
                 static_cast<LearnFloatType>(local_learning_rate / (1.0 - momentum_));
+
 #if defined(USE_BLAS)
-            cblas_sscal(kHalfDimensions, momentum_, biases_diff_, 1);
+
+            cblas_sscal(
+                kHalfDimensions, momentum_, biases_diff_, 1
+            );
             for (IndexType b = 0; b < batch_->size(); ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType c = 0; c < 2; ++c) {
                     const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                    cblas_saxpy(kHalfDimensions, 1.0,
-                                &gradients_[output_offset], 1, biases_diff_, 1);
+                    cblas_saxpy(
+                        kHalfDimensions, 1.0,
+                        &gradients_[output_offset], 1, biases_diff_, 1
+                    );
                 }
             }
 
-            cblas_saxpy(kHalfDimensions, -local_learning_rate,
-                        biases_diff_, 1, biases_, 1);
+            cblas_saxpy(
+                kHalfDimensions, -local_learning_rate,
+                biases_diff_, 1, biases_, 1
+            );
 
 #pragma omp parallel
             {
@@ -205,45 +227,67 @@ namespace Eval::NNUE {
                             const auto scale = static_cast<LearnFloatType>(
                                 effective_learning_rate / feature.get_count());
 
-                            cblas_saxpy(kHalfDimensions, -scale,
-                                        &gradients_[output_offset], 1,
-                                        &weights_[weights_offset], 1);
+                            cblas_saxpy(
+                                kHalfDimensions, -scale,
+                                &gradients_[output_offset], 1,
+                                &weights_[weights_offset], 1
+                            );
                         }
                     }
                 }
             }
 
 #else
-            for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                biases_diff_[i] *= momentum_;
-            }
+
+            Blas::sscal(
+                thread_pool,
+                kHalfDimensions, momentum_, biases_diff_, 1
+            );
 
             for (IndexType b = 0; b < batch_->size(); ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType c = 0; c < 2; ++c) {
                     const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                        biases_diff_[i] += gradients_[output_offset + i];
-                    }
+                    Blas::saxpy(
+                        thread_pool,
+                        kHalfDimensions, 1.0,
+                        &gradients_[output_offset], 1, biases_diff_, 1
+                    );
                 }
             }
 
-            for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                biases_[i] -= local_learning_rate * biases_diff_[i];
-            }
+            Blas::saxpy(
+                thread_pool,
+                kHalfDimensions, -local_learning_rate,
+                biases_diff_, 1, biases_, 1
+            );
 
-            for (IndexType b = 0; b < batch_->size(); ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                for (IndexType c = 0; c < 2; ++c) {
-                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                    for (const auto& feature : (*batch_)[b].training_features[c]) {
-                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                        const auto scale = static_cast<LearnFloatType>(
-                            effective_learning_rate / feature.get_count());
+#pragma omp parallel
+            {
+#if defined(_OPENMP)
+                const IndexType num_threads = omp_get_num_threads();
+                const IndexType thread_index = omp_get_thread_num();
+#endif
+                for (IndexType b = 0; b < batch_->size(); ++b) {
+                    const IndexType batch_offset = kOutputDimensions * b;
+                    for (IndexType c = 0; c < 2; ++c) {
+                        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                        for (const auto& feature : (*batch_)[b].training_features[c]) {
+#if defined(_OPENMP)
+                            if (feature.get_index() % num_threads != thread_index)
+                                continue;
+#endif
+                            const IndexType weights_offset =
+                                kHalfDimensions * feature.get_index();
+                            const auto scale = static_cast<LearnFloatType>(
+                                effective_learning_rate / feature.get_count());
 
-                        for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                            weights_[weights_offset + i] -=
-                                scale * gradients_[output_offset + i];
+                            Blas::saxpy(
+                                thread_pool,
+                                kHalfDimensions, -scale,
+                                &gradients_[output_offset], 1,
+                                &weights_[weights_offset], 1
+                            );
                         }
                     }
                 }
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index 4bb38104..03e9fec0 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -3,6 +3,8 @@
 
 #include "trainer.h"
 
+#include "extra/stockfish_blas.h"
+
 #include "learn/learn.h"
 
 #include "nnue/layers/input_slice.h"
@@ -208,13 +210,21 @@ namespace Eval::NNUE {
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType input_offset = kInputDimensions * b;
                 const IndexType output_offset = kOutputDimensions * b;
+
 #if defined(USE_BLAS)
-                cblas_scopy(kOutputDimensions, &input[input_offset + Offset], 1,
-                            &output_[output_offset], 1);
+
+                cblas_scopy(
+                    kOutputDimensions, &input[input_offset + Offset], 1,
+                    &output_[output_offset], 1
+                );
 #else
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    output_[output_offset + i] = input[input_offset + Offset + i];
-                }
+
+                Blas::scopy(
+                    thread_pool,
+                    kOutputDimensions, &input[input_offset + Offset], 1,
+                    &output_[output_offset], 1
+                );
+
 #endif
             }
 
diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h
index 6defb95f..88ff302c 100644
--- a/src/nnue/trainer/trainer_sum.h
+++ b/src/nnue/trainer/trainer_sum.h
@@ -3,6 +3,8 @@
 
 #include "trainer.h"
 
+#include "extra/stockfish_blas.h"
+
 #include "learn/learn.h"
 
 #include "nnue/layers/sum.h"
@@ -53,15 +55,19 @@ namespace Eval::NNUE {
             const auto head_output = previous_layer_trainer_->propagate(thread_pool, batch);
 
 #if defined(USE_BLAS)
-            cblas_saxpy(kOutputDimensions * batch_size_, 1.0,
-                        head_output, 1, output, 1);
+
+            cblas_saxpy(
+                kOutputDimensions * batch_size_, 1.0,
+                head_output, 1, output, 1
+            );
+
 #else
-            for (IndexType b = 0; b < batch_size_; ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    output[batch_offset + i] += head_output[batch_offset + i];
-                }
-            }
+
+            Blas::saxpy(
+                thread_pool,
+                kOutputDimensions * batch_size_, 1.0,
+                head_output, 1, output, 1
+            );
 
 #endif
             return output;

From 8c81bbd3db5b6f4d9927a220acc5ca0e063cdf7b Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 30 Oct 2020 10:36:39 +0100
Subject: [PATCH 293/398] Fix the counter in for_each_index_with_workers going
 out of scope before workers finish.

---
 src/thread.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/thread.h b/src/thread.h
index 3bc00729..1f0ec6a2 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -135,10 +135,15 @@ struct ThreadPool : public std::vector<Thread*> {
     typename Detail::TypeIdentity<IndexT>::Type end,
     FuncT func)
   {
-    std::atomic<IndexT> i_atomic = begin;
+    // This value must outlive the function call.
+    // It's fairly safe if we make it static
+    // because for_each_index_with_workers
+    // is not reentrant nor thread safe.
+    static std::atomic<IndexT> i_atomic;
+    i_atomic.store(begin);
 
     execute_with_workers(
-      [&i_atomic, end, func](Thread& th) mutable {
+      [end, func](Thread& th) mutable {
         for(;;) {
           const auto i = i_atomic.fetch_add(1);
           if (i >= end)

From 7bedf6c5aba05ea7e42623cd5a31eb3e1be8bf66 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 30 Oct 2020 10:37:03 +0100
Subject: [PATCH 294/398] Specify the whole evalsave message because otherwise
 the first evalsave/0 triggers it.

---
 tests/instrumented_learn.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
index 07f5f98b..9109e78b 100755
--- a/tests/instrumented_learn.sh
+++ b/tests/instrumented_learn.sh
@@ -129,7 +129,7 @@ cat << EOF > learn01.exp
  send "isready\n"
  send "learn targetdir training_data epochs 1 sfen_read_size 100 thread_buffer_size 10 batchsize 100 use_draw_in_training 1 use_draw_in_validation 1 lr 1 eval_limit 32000 nn_batch_size 30 newbob_decay 0.5 eval_save_interval 30 loss_output_interval 10 validation_set_file_name validation_data/validation_data.bin\n"
 
- expect "INFO (save_eval): Finished saving evaluation file in"
+ expect "INFO (save_eval): Finished saving evaluation file in evalsave/final"
 
  send "quit\n"
  expect eof

From 2c10b1babcf4c6917f83f473a1123296032274a8 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 27 Oct 2020 17:36:50 +0100
Subject: [PATCH 295/398] Optimize feature transformer clipped relu.

---
 .../trainer/trainer_feature_transformer.h     | 115 ++++++++++++++++++
 1 file changed, 115 insertions(+)

diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 8be584e8..c883b594 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -143,6 +143,119 @@ namespace Eval::NNUE {
                 }
             }
 
+#if defined (USE_SSE2)
+
+            {
+                static_assert(kHalfDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
+
+                auto m128_hmin_ps = [](__m128 x3210) {
+                    __m128 x0032 = _mm_shuffle_ps(x3210, x3210, _MM_SHUFFLE(0, 0, 3, 2));
+                    __m128 min_x_x_13_20 = _mm_min_ps(x3210, x0032);
+                    // a = [ # , # , min(x[1], x[3]) , min(x[2], x[0]) ]
+                    __m128 min_x_x_20_13 = _mm_shuffle_ps(min_x_x_13_20, min_x_x_13_20, _MM_SHUFFLE(0, 0, 0, 1));
+                    return _mm_cvtss_f32(_mm_min_ps(min_x_x_13_20, min_x_x_20_13));
+                };
+
+                auto m128_hmax_ps = [](__m128 x3210) {
+                    __m128 x0032 = _mm_shuffle_ps(x3210, x3210, _MM_SHUFFLE(0, 0, 3, 2));
+                    __m128 max_x_x_13_20 = _mm_max_ps(x3210, x0032);
+                    // a = [ # , # , max(x[1], x[3]) , max(x[2], x[0]) ]
+                    __m128 max_x_x_20_13 = _mm_shuffle_ps(max_x_x_13_20, max_x_x_13_20, _MM_SHUFFLE(0, 0, 0, 1));
+                    return _mm_cvtss_f32(_mm_max_ps(max_x_x_13_20, max_x_x_20_13));
+                };
+
+                const int total_size = batch.size() * kOutputDimensions;
+
+                const __m128 kZero4 = _mm_set1_ps(+kZero);
+                const __m128 kOne4 = _mm_set1_ps(+kOne);
+
+                __m128 min_pre_activation0 = _mm_set1_ps(min_pre_activation_);
+                __m128 min_pre_activation1 = _mm_set1_ps(min_pre_activation_);
+                __m128 max_pre_activation0 = _mm_set1_ps(max_pre_activation_);
+                __m128 max_pre_activation1 = _mm_set1_ps(max_pre_activation_);
+
+                for (int i = 0; i < total_size; i += 16)
+                {
+                    __m128 out0 = _mm_loadu_ps(&output_[i +  0]);
+                    __m128 out1 = _mm_loadu_ps(&output_[i +  4]);
+                    __m128 out2 = _mm_loadu_ps(&output_[i +  8]);
+                    __m128 out3 = _mm_loadu_ps(&output_[i + 12]);
+
+                    __m128 min01 = _mm_min_ps(out0, out1);
+                    __m128 min23 = _mm_min_ps(out2, out3);
+
+                    __m128 max01 = _mm_max_ps(out0, out1);
+                    __m128 max23 = _mm_max_ps(out2, out3);
+
+                    min_pre_activation0 = _mm_min_ps(min_pre_activation0, min01);
+                    min_pre_activation1 = _mm_min_ps(min_pre_activation1, min23);
+                    max_pre_activation0 = _mm_max_ps(max_pre_activation0, max01);
+                    max_pre_activation1 = _mm_max_ps(max_pre_activation1, max23);
+
+                    out0 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out0));
+                    out1 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out1));
+                    out2 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out2));
+                    out3 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out3));
+
+                    _mm_storeu_ps(&output_[i +  0], out0);
+                    _mm_storeu_ps(&output_[i +  4], out1);
+                    _mm_storeu_ps(&output_[i +  8], out2);
+                    _mm_storeu_ps(&output_[i + 12], out3);
+                }
+
+                min_pre_activation_ = m128_hmin_ps(_mm_min_ps(min_pre_activation0, min_pre_activation1));
+                max_pre_activation_ = m128_hmax_ps(_mm_max_ps(max_pre_activation0, max_pre_activation1));
+
+                for (IndexType b = 0; b < batch.size(); ++b) 
+                {
+                    const IndexType batch_offset = kOutputDimensions * b;
+
+                    for (IndexType half = 0; half < 2; ++half)
+                    {
+                        const IndexType half_offset = batch_offset + half * kHalfDimensions;
+                        for (IndexType i = 0; i < kHalfDimensions; i += 16)
+                        {
+                            const __m128 out0 = _mm_loadu_ps(&output_[i +  0 + half_offset]);
+                            const __m128 out1 = _mm_loadu_ps(&output_[i +  4 + half_offset]);
+                            const __m128 out2 = _mm_loadu_ps(&output_[i +  8 + half_offset]);
+                            const __m128 out3 = _mm_loadu_ps(&output_[i + 12 + half_offset]);
+
+                            __m128 minact0 = _mm_loadu_ps(&min_activations_[i +  0]);
+                            __m128 minact1 = _mm_loadu_ps(&min_activations_[i +  4]);
+                            __m128 minact2 = _mm_loadu_ps(&min_activations_[i +  8]);
+                            __m128 minact3 = _mm_loadu_ps(&min_activations_[i + 12]);
+
+                            __m128 maxact0 = _mm_loadu_ps(&max_activations_[i +  0]);
+                            __m128 maxact1 = _mm_loadu_ps(&max_activations_[i +  4]);
+                            __m128 maxact2 = _mm_loadu_ps(&max_activations_[i +  8]);
+                            __m128 maxact3 = _mm_loadu_ps(&max_activations_[i + 12]);
+
+                            minact0 = _mm_min_ps(out0, minact0);
+                            minact1 = _mm_min_ps(out1, minact1);
+                            minact2 = _mm_min_ps(out2, minact2);
+                            minact3 = _mm_min_ps(out3, minact3);
+
+                            maxact0 = _mm_max_ps(out0, maxact0);
+                            maxact1 = _mm_max_ps(out1, maxact1);
+                            maxact2 = _mm_max_ps(out2, maxact2);
+                            maxact3 = _mm_max_ps(out3, maxact3);
+
+                            _mm_storeu_ps(&min_activations_[i +  0], minact0);
+                            _mm_storeu_ps(&min_activations_[i +  4], minact1);
+                            _mm_storeu_ps(&min_activations_[i +  8], minact2);
+                            _mm_storeu_ps(&min_activations_[i + 12], minact3);
+
+                            _mm_storeu_ps(&max_activations_[i +  0], maxact0);
+                            _mm_storeu_ps(&max_activations_[i +  4], maxact1);
+                            _mm_storeu_ps(&max_activations_[i +  8], maxact2);
+                            _mm_storeu_ps(&max_activations_[i + 12], maxact3);
+                        }
+                    }
+                }
+            }
+
+#else
+
             // clipped ReLU
             for (IndexType b = 0; b < batch.size(); ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
@@ -157,6 +270,8 @@ namespace Eval::NNUE {
                 }
             }
 
+#endif
+
             return output_.data();
         }
 

From c96743c5bd17173be2e08f00fac89e9f50746238 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 28 Oct 2020 14:59:18 +0100
Subject: [PATCH 296/398] Optimize feature transformer backpropagation stats.

---
 .../trainer/trainer_feature_transformer.h     | 52 +++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index c883b594..77edfbde 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -285,6 +285,55 @@ namespace Eval::NNUE {
             const LearnFloatType local_learning_rate =
                 learning_rate * learning_rate_scale_;
 
+#if defined (USE_SSE2)
+            
+            {
+                static_assert(kHalfDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
+
+                const __m128 kZero4 = _mm_set1_ps(+kZero);
+                const __m128 kOne4 = _mm_set1_ps(+kOne);
+
+                const IndexType total_size = batch_->size() * kOutputDimensions;
+
+                for (IndexType i = 0; i < total_size; i += 16)
+                {
+                    __m128 out0 = _mm_loadu_ps(&output_[i + 0]);
+                    __m128 out1 = _mm_loadu_ps(&output_[i + 4]);
+                    __m128 out2 = _mm_loadu_ps(&output_[i + 8]);
+                    __m128 out3 = _mm_loadu_ps(&output_[i + 12]);
+
+                    __m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4));
+                    __m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4));
+                    __m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4));
+                    __m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), _mm_cmpge_ps(out3, kOne4));
+
+                    __m128 grad0 = _mm_loadu_ps(&gradients[i + 0]);
+                    __m128 grad1 = _mm_loadu_ps(&gradients[i + 4]);
+                    __m128 grad2 = _mm_loadu_ps(&gradients[i + 8]);
+                    __m128 grad3 = _mm_loadu_ps(&gradients[i + 12]);
+
+                    grad0 = _mm_andnot_ps(clipped0, grad0);
+                    grad1 = _mm_andnot_ps(clipped1, grad1);
+                    grad2 = _mm_andnot_ps(clipped2, grad2);
+                    grad3 = _mm_andnot_ps(clipped3, grad3);
+
+                    _mm_storeu_ps(&gradients_[i + 0], grad0);
+                    _mm_storeu_ps(&gradients_[i + 4], grad1);
+                    _mm_storeu_ps(&gradients_[i + 8], grad2);
+                    _mm_storeu_ps(&gradients_[i + 12], grad3);
+
+                    const int clipped_mask =
+                        (_mm_movemask_ps(clipped0) << 0)
+                        | (_mm_movemask_ps(clipped1) << 4)
+                        | (_mm_movemask_ps(clipped2) << 8)
+                        | (_mm_movemask_ps(clipped3) << 12);
+
+                    num_clipped_ += popcount(clipped_mask);
+                }
+            }
+
+#else
+
             for (IndexType b = 0; b < batch_->size(); ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType i = 0; i < kOutputDimensions; ++i) {
@@ -294,6 +343,9 @@ namespace Eval::NNUE {
                     num_clipped_ += clipped;
                 }
             }
+
+#endif
+
             num_total_ += batch_->size() * kOutputDimensions;
 
             // Since the weight matrix updates only the columns corresponding to the features that appeared in the input,

From 941897ff2c24c66afa0e42e59c004839968d05d6 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 28 Oct 2020 15:03:09 +0100
Subject: [PATCH 297/398] Optimize trainer clipped relu backpropagate.

---
 src/nnue/trainer/trainer_clipped_relu.h | 52 +++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index 8e29e4a1..dd6fc701 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -69,6 +69,55 @@ namespace Eval::NNUE {
                            const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
+#if defined (USE_SSE2)
+
+            {
+                static_assert(kOutputDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
+
+                const __m128 kZero4 = _mm_set1_ps(+kZero);
+                const __m128 kOne4 = _mm_set1_ps(+kOne);
+
+                const IndexType total_size = batch_size_ * kOutputDimensions;
+
+                for (IndexType i = 0; i < total_size; i += 16)
+                {
+                    __m128 out0 = _mm_loadu_ps(&output_[i + 0]);
+                    __m128 out1 = _mm_loadu_ps(&output_[i + 4]);
+                    __m128 out2 = _mm_loadu_ps(&output_[i + 8]);
+                    __m128 out3 = _mm_loadu_ps(&output_[i + 12]);
+
+                    __m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4));
+                    __m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4));
+                    __m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4));
+                    __m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), _mm_cmpge_ps(out3, kOne4));
+
+                    __m128 grad0 = _mm_loadu_ps(&gradients[i + 0]);
+                    __m128 grad1 = _mm_loadu_ps(&gradients[i + 4]);
+                    __m128 grad2 = _mm_loadu_ps(&gradients[i + 8]);
+                    __m128 grad3 = _mm_loadu_ps(&gradients[i + 12]);
+
+                    grad0 = _mm_andnot_ps(clipped0, grad0);
+                    grad1 = _mm_andnot_ps(clipped1, grad1);
+                    grad2 = _mm_andnot_ps(clipped2, grad2);
+                    grad3 = _mm_andnot_ps(clipped3, grad3);
+
+                    _mm_storeu_ps(&gradients_[i + 0], grad0);
+                    _mm_storeu_ps(&gradients_[i + 4], grad1);
+                    _mm_storeu_ps(&gradients_[i + 8], grad2);
+                    _mm_storeu_ps(&gradients_[i + 12], grad3);
+
+                    const int clipped_mask =
+                        (_mm_movemask_ps(clipped0) << 0)
+                        | (_mm_movemask_ps(clipped1) << 4)
+                        | (_mm_movemask_ps(clipped2) << 8)
+                        | (_mm_movemask_ps(clipped3) << 12);
+
+                    num_clipped_ += popcount(clipped_mask);
+                }
+            }
+
+#else
+
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType i = 0; i < kOutputDimensions; ++i) {
@@ -78,6 +127,9 @@ namespace Eval::NNUE {
                     num_clipped_ += clipped;
                 }
             }
+
+#endif
+
             num_total_ += batch_size_ * kOutputDimensions;
 
             previous_layer_trainer_->backpropagate(thread_pool, gradients_.data(), learning_rate);

From b5714c4084719cd089c2d70266404e4e36f0a129 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 27 Oct 2020 18:41:17 +0100
Subject: [PATCH 298/398] Parallelize input slice trainer backprop.

---
 src/nnue/trainer/trainer_input_slice.h | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index 03e9fec0..a93a3ea0 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -236,17 +236,29 @@ namespace Eval::NNUE {
                            const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
-            for (IndexType b = 0; b < batch_size_; ++b) {
-                const IndexType input_offset = kInputDimensions * b;
-                const IndexType output_offset = kOutputDimensions * b;
-                for (IndexType i = 0; i < kInputDimensions; ++i) {
-                    if ((int)i < (int)Offset || i >= Offset + kOutputDimensions) {
+            thread_pool.for_each_index_with_workers(
+                0, batch_size_,
+                [&](Thread&, int b) {
+                    const IndexType input_offset = kInputDimensions * b;
+                    const IndexType output_offset = kOutputDimensions * b;
+
+                    IndexType i = 0;
+                    for (; i < Offset; ++i) {
                         gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
-                    } else {
+                    }
+
+                    for (; i < Offset + kOutputDimensions; ++i) {
                         gradients_[input_offset + i] = gradients[output_offset + i - Offset];
                     }
+
+                    for (; i < kInputDimensions; ++i)
+                    {
+                        gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
+                    }
                 }
-            }
+            );
+            thread_pool.wait_for_workers_finished();
+
             shared_input_trainer_->backpropagate(thread_pool, gradients_.data(), learning_rate);
         }
 

From db1b33d4acfe02d4eb05eac5f810729da1d1ebf4 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 27 Oct 2020 18:57:47 +0100
Subject: [PATCH 299/398] Optimize trainer clipped relu propagate

---
 src/nnue/trainer/trainer_clipped_relu.h | 68 +++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index dd6fc701..124671ed 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -50,7 +50,73 @@ namespace Eval::NNUE {
             }
 
             const auto input = previous_layer_trainer_->propagate(thread_pool, batch);
+
             batch_size_ = static_cast<IndexType>(batch.size());
+
+#if defined (USE_SSE2)
+
+            {
+                static_assert(kOutputDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
+
+                const __m128 kZero4 = _mm_set1_ps(+kZero);
+                const __m128 kOne4 = _mm_set1_ps(+kOne);
+
+                for (IndexType b = 0; b < batch.size(); ++b)
+                {
+                    const IndexType batch_offset = kOutputDimensions * b;
+
+                    for (IndexType i = 0; i < kOutputDimensions; i += 16)
+                    {
+                        __m128 out0 = _mm_loadu_ps(&input[i + 0 + batch_offset]);
+                        __m128 out1 = _mm_loadu_ps(&input[i + 4 + batch_offset]);
+                        __m128 out2 = _mm_loadu_ps(&input[i + 8 + batch_offset]);
+                        __m128 out3 = _mm_loadu_ps(&input[i + 12 + batch_offset]);
+
+                        out0 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out0));
+                        out1 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out1));
+                        out2 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out2));
+                        out3 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out3));
+
+                        _mm_storeu_ps(&output_[i + 0 + batch_offset], out0);
+                        _mm_storeu_ps(&output_[i + 4 + batch_offset], out1);
+                        _mm_storeu_ps(&output_[i + 8 + batch_offset], out2);
+                        _mm_storeu_ps(&output_[i + 12 + batch_offset], out3);
+
+                        __m128 minact0 = _mm_loadu_ps(&min_activations_[i + 0]);
+                        __m128 minact1 = _mm_loadu_ps(&min_activations_[i + 4]);
+                        __m128 minact2 = _mm_loadu_ps(&min_activations_[i + 8]);
+                        __m128 minact3 = _mm_loadu_ps(&min_activations_[i + 12]);
+
+                        __m128 maxact0 = _mm_loadu_ps(&max_activations_[i + 0]);
+                        __m128 maxact1 = _mm_loadu_ps(&max_activations_[i + 4]);
+                        __m128 maxact2 = _mm_loadu_ps(&max_activations_[i + 8]);
+                        __m128 maxact3 = _mm_loadu_ps(&max_activations_[i + 12]);
+
+                        minact0 = _mm_min_ps(out0, minact0);
+                        minact1 = _mm_min_ps(out1, minact1);
+                        minact2 = _mm_min_ps(out2, minact2);
+                        minact3 = _mm_min_ps(out3, minact3);
+
+                        maxact0 = _mm_max_ps(out0, maxact0);
+                        maxact1 = _mm_max_ps(out1, maxact1);
+                        maxact2 = _mm_max_ps(out2, maxact2);
+                        maxact3 = _mm_max_ps(out3, maxact3);
+
+                        _mm_storeu_ps(&min_activations_[i + 0], minact0);
+                        _mm_storeu_ps(&min_activations_[i + 4], minact1);
+                        _mm_storeu_ps(&min_activations_[i + 8], minact2);
+                        _mm_storeu_ps(&min_activations_[i + 12], minact3);
+
+                        _mm_storeu_ps(&max_activations_[i + 0], maxact0);
+                        _mm_storeu_ps(&max_activations_[i + 4], maxact1);
+                        _mm_storeu_ps(&max_activations_[i + 8], maxact2);
+                        _mm_storeu_ps(&max_activations_[i + 12], maxact3);
+                    }
+                }
+            }
+
+#else
+
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType i = 0; i < kOutputDimensions; ++i) {
@@ -61,6 +127,8 @@ namespace Eval::NNUE {
                 }
             }
 
+#endif
+
             return output_.data();
         }
 

From e8907bcfc456cd9c5966dbc238018b0bc961eece Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 27 Oct 2020 11:57:45 +0100
Subject: [PATCH 300/398] Replace omp in trainer_feature_transformer

---
 .../trainer/trainer_feature_transformer.h     | 182 ++++++++----------
 1 file changed, 82 insertions(+), 100 deletions(-)

diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 77edfbde..3062e432 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -19,10 +19,6 @@
 #include <random>
 #include <set>
 
-#if defined(_OPENMP)
-#include <omp.h>
-#endif
-
 // Specialization for feature transformer of learning class template of NNUE evaluation function
 namespace Eval::NNUE {
 
@@ -104,44 +100,45 @@ namespace Eval::NNUE {
 
             batch_ = &batch;
             // affine transform
-#pragma omp parallel for
-            for (IndexType b = 0; b < batch.size(); ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                for (IndexType c = 0; c < 2; ++c) {
-                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
+            thread_pool.for_each_index_with_workers(
+                0, batch.size(),
+                [&](Thread&, int b) {
+                    const IndexType batch_offset = kOutputDimensions * b;
+                    for (IndexType c = 0; c < 2; ++c) {
+                        const IndexType output_offset = batch_offset + kHalfDimensions * c;
 
 #if defined(USE_BLAS)
 
-                    cblas_scopy(
-                        kHalfDimensions, biases_, 1, &output_[output_offset], 1
-                    );
-
-                    for (const auto& feature : batch[b].training_features[c]) {
-                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                        cblas_saxpy(
-                            kHalfDimensions, (float)feature.get_count(),
-                            &weights_[weights_offset], 1, &output_[output_offset], 1
+                        cblas_scopy(
+                            kHalfDimensions, biases_, 1, &output_[output_offset], 1
                         );
-                    }
+
+                        for (const auto& feature : batch[b].training_features[c]) {
+                            const IndexType weights_offset = kHalfDimensions * feature.get_index();
+                            cblas_saxpy(
+                                kHalfDimensions, (float)feature.get_count(),
+                                &weights_[weights_offset], 1, &output_[output_offset], 1
+                            );
+                        }
 
 #else
 
-                    Blas::scopy(
-                        thread_pool,
-                        kHalfDimensions, biases_, 1, &output_[output_offset], 1
-                    );
-                    for (const auto& feature : batch[b].training_features[c]) {
-                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                        Blas::saxpy(
-                            thread_pool,
-                            kHalfDimensions, (float)feature.get_count(),
-                            &weights_[weights_offset], 1, &output_[output_offset], 1
+                        Blas::scopy(
+                            kHalfDimensions, biases_, 1, &output_[output_offset], 1
                         );
-                    }
+                        for (const auto& feature : batch[b].training_features[c]) {
+                            const IndexType weights_offset = kHalfDimensions * feature.get_index();
+                            Blas::saxpy(
+                                kHalfDimensions, (float)feature.get_count(),
+                                &weights_[weights_offset], 1, &output_[output_offset], 1
+                            );
+                        }
 
 #endif
+                    }
                 }
-            }
+            );
+            thread_pool.wait_for_workers_finished();
 
 #if defined (USE_SSE2)
 
@@ -358,6 +355,7 @@ namespace Eval::NNUE {
             cblas_sscal(
                 kHalfDimensions, momentum_, biases_diff_, 1
             );
+
             for (IndexType b = 0; b < batch_->size(); ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType c = 0; c < 2; ++c) {
@@ -374,36 +372,6 @@ namespace Eval::NNUE {
                 biases_diff_, 1, biases_, 1
             );
 
-#pragma omp parallel
-            {
-#if defined(_OPENMP)
-                const IndexType num_threads = omp_get_num_threads();
-                const IndexType thread_index = omp_get_thread_num();
-#endif
-                for (IndexType b = 0; b < batch_->size(); ++b) {
-                    const IndexType batch_offset = kOutputDimensions * b;
-                    for (IndexType c = 0; c < 2; ++c) {
-                        const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                        for (const auto& feature : (*batch_)[b].training_features[c]) {
-#if defined(_OPENMP)
-                            if (feature.get_index() % num_threads != thread_index)
-                                continue;
-#endif
-                            const IndexType weights_offset =
-                                kHalfDimensions * feature.get_index();
-                            const auto scale = static_cast<LearnFloatType>(
-                                effective_learning_rate / feature.get_count());
-
-                            cblas_saxpy(
-                                kHalfDimensions, -scale,
-                                &gradients_[output_offset], 1,
-                                &weights_[weights_offset], 1
-                            );
-                        }
-                    }
-                }
-            }
-
 #else
 
             Blas::sscal(
@@ -429,38 +397,47 @@ namespace Eval::NNUE {
                 biases_diff_, 1, biases_, 1
             );
 
-#pragma omp parallel
-            {
-#if defined(_OPENMP)
-                const IndexType num_threads = omp_get_num_threads();
-                const IndexType thread_index = omp_get_thread_num();
 #endif
-                for (IndexType b = 0; b < batch_->size(); ++b) {
-                    const IndexType batch_offset = kOutputDimensions * b;
-                    for (IndexType c = 0; c < 2; ++c) {
-                        const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                        for (const auto& feature : (*batch_)[b].training_features[c]) {
-#if defined(_OPENMP)
-                            if (feature.get_index() % num_threads != thread_index)
-                                continue;
-#endif
-                            const IndexType weights_offset =
-                                kHalfDimensions * feature.get_index();
-                            const auto scale = static_cast<LearnFloatType>(
-                                effective_learning_rate / feature.get_count());
 
-                            Blas::saxpy(
-                                thread_pool,
-                                kHalfDimensions, -scale,
-                                &gradients_[output_offset], 1,
-                                &weights_[weights_offset], 1
-                            );
+            thread_pool.execute_with_workers(
+                [&, num_threads = thread_pool.size()](Thread& th) {
+                    const auto thread_index = th.thread_idx();
+
+                    for (IndexType b = 0; b < batch_->size(); ++b) {
+                        const IndexType batch_offset = kOutputDimensions * b;
+                        for (IndexType c = 0; c < 2; ++c) {
+                            const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                            for (const auto& feature : (*batch_)[b].training_features[c]) {
+                                if (feature.get_index() % num_threads != thread_index)
+                                    continue;
+                                const IndexType weights_offset =
+                                    kHalfDimensions * feature.get_index();
+                                const auto scale = static_cast<LearnFloatType>(
+                                    effective_learning_rate / feature.get_count());
+
+#if defined (USE_BLAS)
+
+                                cblas_saxpy(
+                                    kHalfDimensions, -scale,
+                                    &gradients_[output_offset], 1,
+                                    &weights_[weights_offset], 1
+                                );
+
+#else
+
+                                Blas::saxpy(
+                                    kHalfDimensions, -scale,
+                                    &gradients_[output_offset], 1,
+                                    &weights_[weights_offset], 1
+                                );
+
+#endif
+                            }
                         }
                     }
                 }
-            }
+            );
 
-#endif
             for (IndexType b = 0; b < batch_->size(); ++b) {
                 for (IndexType c = 0; c < 2; ++c) {
                     for (const auto& feature : (*batch_)[b].training_features[c]) {
@@ -468,6 +445,8 @@ namespace Eval::NNUE {
                     }
                 }
             }
+
+            thread_pool.wait_for_workers_finished();
         }
 
     private:
@@ -493,22 +472,25 @@ namespace Eval::NNUE {
 
             std::vector<TrainingFeature> training_features;
 
-#pragma omp parallel for private(training_features)
-            for (IndexType j = 0; j < RawFeatures::kDimensions; ++j) {
-                training_features.clear();
-                Features::Factorizer<RawFeatures>::append_training_features(
-                    j, &training_features);
+            Threads.for_each_index_with_workers(
+                0, RawFeatures::kDimensions,
+                [this, training_features](Thread&, int j) mutable {
+                    training_features.clear();
+                    Features::Factorizer<RawFeatures>::append_training_features(
+                        j, &training_features);
 
-                for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                    double sum = 0.0;
-                    for (const auto& feature : training_features) {
-                        sum += weights_[kHalfDimensions * feature.get_index() + i];
+                    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                        double sum = 0.0;
+                        for (const auto& feature : training_features) {
+                            sum += weights_[kHalfDimensions * feature.get_index() + i];
+                        }
+
+                        target_layer_->weights_[kHalfDimensions * j + i] =
+                            round<typename LayerType::WeightType>(sum * kWeightScale);
                     }
-
-                    target_layer_->weights_[kHalfDimensions * j + i] =
-                        round<typename LayerType::WeightType>(sum * kWeightScale);
                 }
-            }
+            );
+            Threads.wait_for_workers_finished();
         }
 
         void reset_stats() {

From c53be1b23f48fef9c5f27203eefc1443ea107e5a Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 27 Oct 2020 19:19:23 +0100
Subject: [PATCH 301/398] Add specialized bitset for use in the trainer for
 observed features tracking.

---
 src/misc.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/src/misc.h b/src/misc.h
index be9b4c38..e564311f 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -397,6 +397,69 @@ inline uint64_t mul_hi64(uint64_t a, uint64_t b) {
 #endif
 }
 
+// This bitset can be accessed concurrently, provided
+// the concurrent accesses are performed on distinct
+// instances of underlying type. That means the cuncurrent
+// accesses need to be spaced by at least 
+// bits_per_bucket bits.
+// But at least best_concurrent_access_stride bits
+// is recommended to prevent false sharing.
+template <uint64_t N>
+struct LargeBitset
+{
+private:
+    constexpr static uint64_t cache_line_size = 64;
+
+public:
+    using UnderlyingType = uint64_t;
+
+    constexpr static uint64_t num_bits = N;
+    constexpr static uint64_t bits_per_bucket = 8 * sizeof(uint64_t);
+    constexpr static uint64_t num_buckets = (num_bits + bits_per_bucket - 1) / bits_per_bucket;
+    constexpr static uint64_t best_concurrent_access_stride = 8 * cache_line_size;
+
+    void set(uint64_t idx)
+    {
+        const uint64_t bucket = idx / bits_per_bucket;
+        const uint64_t bit = uint64_t(1) << (idx % bits_per_bucket);
+        bits[bucket] |= bit;
+    }
+
+    bool test(uint64_t idx) const
+    {
+        const uint64_t bucket = idx / bits_per_bucket;
+        const uint64_t bit = uint64_t(1) << (idx % bits_per_bucket);
+        return bits[bucket] & bit;
+    }
+
+    uint64_t count() const
+    {
+        uint64_t c = 0;
+        uint64_t i = 0;
+
+        for (; i < num_buckets - 3; i += 4)
+        {
+            uint64_t c0 = popcount(bits[i+0]);
+            uint64_t c1 = popcount(bits[i+1]);
+            uint64_t c2 = popcount(bits[i+2]);
+            uint64_t c3 = popcount(bits[i+3]);
+            c0 += c1;
+            c2 += c3;
+            c += c0 + c2;
+        }
+
+        for (; i < num_buckets; ++i)
+        {
+            c += popcount(bits[i]);
+        }
+
+        return c;
+    }
+
+private:
+    alignas(cache_line_size) UnderlyingType bits[num_buckets];
+};
+
 /// Under Windows it is not possible for a process to run on more than one
 /// logical processor group. This usually means to be limited to use max 64
 /// cores. To overcome this, some special platform specific API should be

From 987b6c98d4ddf2875d6b8bbe1e8be07de7233aea Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 27 Oct 2020 19:24:07 +0100
Subject: [PATCH 302/398] Move the observed feature collection to the threaded
 part now that it can be done safely.

---
 src/misc.h                                    |  7 +++-
 .../trainer/trainer_feature_transformer.h     | 35 ++++++++++++-------
 2 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/src/misc.h b/src/misc.h
index e564311f..020fa9b5 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -400,7 +400,7 @@ inline uint64_t mul_hi64(uint64_t a, uint64_t b) {
 // This bitset can be accessed concurrently, provided
 // the concurrent accesses are performed on distinct
 // instances of underlying type. That means the cuncurrent
-// accesses need to be spaced by at least 
+// accesses need to be spaced by at least
 // bits_per_bucket bits.
 // But at least best_concurrent_access_stride bits
 // is recommended to prevent false sharing.
@@ -418,6 +418,11 @@ public:
     constexpr static uint64_t num_buckets = (num_bits + bits_per_bucket - 1) / bits_per_bucket;
     constexpr static uint64_t best_concurrent_access_stride = 8 * cache_line_size;
 
+    LargeBitset()
+    {
+        std::fill(std::begin(bits), std::end(bits), 0);
+    }
+
     void set(uint64_t idx)
     {
         const uint64_t bucket = idx / bits_per_bucket;
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 3062e432..419cdf5e 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -203,7 +203,7 @@ namespace Eval::NNUE {
                 min_pre_activation_ = m128_hmin_ps(_mm_min_ps(min_pre_activation0, min_pre_activation1));
                 max_pre_activation_ = m128_hmax_ps(_mm_max_ps(max_pre_activation0, max_pre_activation1));
 
-                for (IndexType b = 0; b < batch.size(); ++b) 
+                for (IndexType b = 0; b < batch.size(); ++b)
                 {
                     const IndexType batch_offset = kOutputDimensions * b;
 
@@ -283,7 +283,7 @@ namespace Eval::NNUE {
                 learning_rate * learning_rate_scale_;
 
 #if defined (USE_SSE2)
-            
+
             {
                 static_assert(kHalfDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
 
@@ -408,10 +408,26 @@ namespace Eval::NNUE {
                         for (IndexType c = 0; c < 2; ++c) {
                             const IndexType output_offset = batch_offset + kHalfDimensions * c;
                             for (const auto& feature : (*batch_)[b].training_features[c]) {
-                                if (feature.get_index() % num_threads != thread_index)
+                                const IndexType feature_index = feature.get_index();
+
+                                // We assign each bucket a continuous range of bits at least
+                                // of cache line size to prevent false sharing.
+                                // For HalfKP this is enough to saturate about 80 threads.
+                                const IndexType thread_bucket =
+                                    (feature_index / BitsetType::best_concurrent_access_stride)
+                                    % num_threads;
+
+                                if (thread_bucket != thread_index)
                                     continue;
+
+                                // This operation can be performed safely because
+                                // each thread accesses a different memory location
+                                // (even a different cache line)
+                                observed_features.set(feature_index);
+
                                 const IndexType weights_offset =
-                                    kHalfDimensions * feature.get_index();
+                                    kHalfDimensions * feature_index;
+
                                 const auto scale = static_cast<LearnFloatType>(
                                     effective_learning_rate / feature.get_count());
 
@@ -438,14 +454,6 @@ namespace Eval::NNUE {
                 }
             );
 
-            for (IndexType b = 0; b < batch_->size(); ++b) {
-                for (IndexType c = 0; c < 2; ++c) {
-                    for (const auto& feature : (*batch_)[b].training_features[c]) {
-                        observed_features.set(feature.get_index());
-                    }
-                }
-            }
-
             thread_pool.wait_for_workers_finished();
         }
 
@@ -628,7 +636,8 @@ namespace Eval::NNUE {
         std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
 
         // Features that appeared in the training data
-        std::bitset<kInputDimensions> observed_features;
+        using BitsetType = LargeBitset<kInputDimensions>;
+        BitsetType observed_features;
 
         // hyper parameter
         LearnFloatType momentum_;

From dfc7f88650bf8bda4a381d36e209209cf63a9bcc Mon Sep 17 00:00:00 2001
From: mstembera <MissingEmail@email>
Date: Fri, 30 Oct 2020 13:45:40 -0700
Subject: [PATCH 303/398] Update default net to nn-cb26f10b1fd9.nnue

Result of https://tests.stockfishchess.org/tests/view/5f9a06796a2c112b60691c0f tuning.

STC
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 53712 W: 5776 L: 5561 D: 42375
Ptnml(0-2): 253, 4282, 17604, 4431, 286
https://tests.stockfishchess.org/tests/view/5f9c7bbc6a2c112b60691d4d

LTC
LLR: 2.97 (-2.94,2.94) {0.25,1.25}
Total: 80184 W: 4007 L: 3739 D: 72438
Ptnml(0-2): 58, 3302, 33130, 3518, 84
https://tests.stockfishchess.org/tests/view/5f9d01f06a2c112b60691d87

closes https://github.com/official-stockfish/Stockfish/pull/3209

bench: 3517795
---
 src/evaluate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/evaluate.h b/src/evaluate.h
index 6e5db6a3..6bec27db 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -36,7 +36,7 @@ namespace Eval {
   // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work. Do not change the
   // name of the macro, as it is used in the Makefile.
-  #define EvalFileDefaultName   "nn-2eb2e0707c2b.nnue"
+  #define EvalFileDefaultName   "nn-cb26f10b1fd9.nnue"
 
   namespace NNUE {
 

From 75e06a1c89ebac9c9ec4247bc82ec728a2bffe1e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 29 Oct 2020 00:14:53 +0100
Subject: [PATCH 304/398] Optimize affine transform for SSSE3 and higher
 targets.

A non-functional speedup. Unroll the loops going over
the output dimensions in the affine transform layers by
a factor of 4 and perform 4 horizontal additions at a time.
Instead of doing naive horizontal additions on each vector
separately use hadd and shuffling between vectors to reduce
the number of instructions by using all lanes for all stages
of the horizontal adds.

passed STC of the initial version:
LLR: 2.95 (-2.94,2.94) {-0.25,1.25}
Total: 17808 W: 1914 L: 1756 D: 14138
Ptnml(0-2): 76, 1330, 5948, 1460, 90
https://tests.stockfishchess.org/tests/view/5f9d516f6a2c112b60691da3

passed STC of the final version after cleanup:
LLR: 2.95 (-2.94,2.94) {-0.25,1.25}
Total: 16296 W: 1750 L: 1595 D: 12951
Ptnml(0-2): 72, 1192, 5479, 1319, 86
https://tests.stockfishchess.org/tests/view/5f9df5776a2c112b60691de3

closes https://github.com/official-stockfish/Stockfish/pull/3203

No functional change
---
 src/nnue/layers/affine_transform.h | 478 +++++++++++++++++++++++------
 1 file changed, 384 insertions(+), 94 deletions(-)

diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index 94d0b5a9..f0292e45 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -74,113 +74,400 @@ namespace Eval::NNUE::Layers {
         const TransformedFeatureType* transformed_features, char* buffer) const {
       const auto input = previous_layer_.Propagate(
           transformed_features, buffer + kSelfBufferSize);
+
+#if defined (USE_AVX512)
+
+      [[maybe_unused]] const __m512i kOnes512 = _mm512_set1_epi16(1);
+
+      [[maybe_unused]] auto m512_hadd = [](__m512i sum, int bias) -> int {
+        return _mm512_reduce_add_epi32(sum) + bias;
+      };
+
+      [[maybe_unused]] auto m512_haddx4 = [](__m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, __m128i bias) -> __m128i {
+        __m512i sum01a = _mm512_unpacklo_epi32(sum0, sum1);
+        __m512i sum01b = _mm512_unpackhi_epi32(sum0, sum1);
+
+        __m512i sum23a = _mm512_unpacklo_epi32(sum2, sum3);
+        __m512i sum23b = _mm512_unpackhi_epi32(sum2, sum3);
+
+        __m512i sum01 = _mm512_add_epi32(sum01a, sum01b);
+        __m512i sum23 = _mm512_add_epi32(sum23a, sum23b);
+
+        __m512i sum0123a = _mm512_unpacklo_epi64(sum01, sum23);
+        __m512i sum0123b = _mm512_unpackhi_epi64(sum01, sum23);
+
+        __m512i sum = _mm512_add_epi32(sum0123a, sum0123b);
+
+        __m256i sum256lo = _mm512_castsi512_si256(sum);
+        __m256i sum256hi = _mm512_extracti64x4_epi64(sum, 1);
+
+        sum256lo = _mm256_add_epi32(sum256lo, sum256hi);
+
+        __m128i sum128lo = _mm256_castsi256_si128(sum256lo);
+        __m128i sum128hi = _mm256_extracti128_si256(sum256lo, 1);
+
+        return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias);
+      };
+
+      [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) {
+#if defined (USE_VNNI)
+        acc = _mm512_dpbusd_epi32(acc, a, b);
+#else
+        __m512i product0 = _mm512_maddubs_epi16(a, b);
+        product0 = _mm512_madd_epi16(product0, kOnes512);
+        acc = _mm512_add_epi32(acc, product0);
+#endif
+      };
+
+#endif
+#if defined (USE_AVX2)
+
+      [[maybe_unused]] const __m256i kOnes256 = _mm256_set1_epi16(1);
+
+      [[maybe_unused]] auto m256_hadd = [](__m256i sum, int bias) -> int {
+        __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
+        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
+        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
+        return _mm_cvtsi128_si32(sum128) + bias;
+      };
+
+      [[maybe_unused]] auto m256_haddx4 = [](__m256i sum0, __m256i sum1, __m256i sum2, __m256i sum3, __m128i bias) -> __m128i {
+        sum0 = _mm256_hadd_epi32(sum0, sum1);
+        sum2 = _mm256_hadd_epi32(sum2, sum3);
+
+        sum0 = _mm256_hadd_epi32(sum0, sum2);
+
+        __m128i sum128lo = _mm256_castsi256_si128(sum0);
+        __m128i sum128hi = _mm256_extracti128_si256(sum0, 1);
+
+        return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias);
+      };
+
+      [[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) {
+#if defined (USE_VNNI)
+        acc = _mm256_dpbusd_epi32(acc, a, b);
+#else
+        __m256i product0 = _mm256_maddubs_epi16(a, b);
+        product0 = _mm256_madd_epi16(product0, kOnes256);
+        acc = _mm256_add_epi32(acc, product0);
+#endif
+      };
+
+#endif
+
+#if defined (USE_SSSE3)
+
+      [[maybe_unused]] const __m128i kOnes128 = _mm_set1_epi16(1);
+
+      [[maybe_unused]] auto m128_hadd = [](__m128i sum, int bias) -> int {
+        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
+        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
+        return _mm_cvtsi128_si32(sum) + bias;
+      };
+
+      [[maybe_unused]] auto m128_haddx4 = [](__m128i sum0, __m128i sum1, __m128i sum2, __m128i sum3, __m128i bias) -> __m128i {
+        sum0 = _mm_hadd_epi32(sum0, sum1);
+        sum2 = _mm_hadd_epi32(sum2, sum3);
+
+        sum0 = _mm_hadd_epi32(sum0, sum2);
+
+        return _mm_add_epi32(sum0, bias);
+      };
+
+      [[maybe_unused]] auto m128_add_dpbusd_epi32 = [=](__m128i& acc, __m128i a, __m128i b) {
+        __m128i product0 = _mm_maddubs_epi16(a, b);
+        product0 = _mm_madd_epi16(product0, kOnes128);
+        acc = _mm_add_epi32(acc, product0);
+      };
+
+#endif
+
+#if defined (USE_AVX512)
+
+      constexpr IndexType kNumChunks512 = kPaddedInputDimensions / (kSimdWidth * 2);
+      constexpr IndexType kNumChunks256 = kPaddedInputDimensions / kSimdWidth;
+
       const auto output = reinterpret_cast<OutputType*>(buffer);
 
-  #if defined(USE_AVX512)
-      constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2);
-      const auto input_vector = reinterpret_cast<const __m512i*>(input);
-  #if !defined(USE_VNNI)
-      const __m512i kOnes = _mm512_set1_epi16(1);
-  #endif
+      // Since to saturate a zmm register it takes 64 bytes we
+      // cannot use AVX512 for the smaller affine transforms.
+      // Instead we fallback to a AVX2 implementation if the
+      // kInputDimensions isn't a multiple of 64.
+      // Note that this means that for example for
+      // kInputDimensions of 96 we fallback to AVX2 even though
+      // the first 64 elements could be processed with AVX512.
+      // This is caused by mixing the __m256 and __m512 variables
+      // required to better handle that case and it would
+      // require handling more cases statically not to lose performance.
+      // This should be revisited if such input dimensions are to be considered.
+      [[maybe_unused]] const auto input_vector512 = reinterpret_cast<const __m512i*>(input);
+      [[maybe_unused]] const auto input_vector256 = reinterpret_cast<const __m256i*>(input);
+
+      // kOutputDimensions is either 1 or a multiple of kSimdWidth
+      // because then it is also an input dimension.
+      if constexpr (kOutputDimensions % 4 == 0)
+      {
+        for (IndexType i = 0; i < kOutputDimensions; i += 4)
+        {
+          const IndexType offset0 = (i + 0) * kPaddedInputDimensions;
+          const IndexType offset1 = (i + 1) * kPaddedInputDimensions;
+          const IndexType offset2 = (i + 2) * kPaddedInputDimensions;
+          const IndexType offset3 = (i + 3) * kPaddedInputDimensions;
+
+          const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
+          __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
+
+          if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0)
+          {
+            __m512i sum0 = _mm512_setzero_si512();
+            __m512i sum1 = _mm512_setzero_si512();
+            __m512i sum2 = _mm512_setzero_si512();
+            __m512i sum3 = _mm512_setzero_si512();
+
+            const auto row0 = reinterpret_cast<const __m512i*>(&weights_[offset0]);
+            const auto row1 = reinterpret_cast<const __m512i*>(&weights_[offset1]);
+            const auto row2 = reinterpret_cast<const __m512i*>(&weights_[offset2]);
+            const auto row3 = reinterpret_cast<const __m512i*>(&weights_[offset3]);
+
+            for (IndexType j = 0; j < kNumChunks512; ++j)
+            {
+              const __m512i in = input_vector512[j];
+
+              m512_add_dpbusd_epi32(sum0, in, row0[j]);
+              m512_add_dpbusd_epi32(sum1, in, row1[j]);
+              m512_add_dpbusd_epi32(sum2, in, row2[j]);
+              m512_add_dpbusd_epi32(sum3, in, row3[j]);
+            }
+
+            *outptr = m512_haddx4(sum0, sum1, sum2, sum3, bias);
+          }
+          else
+          {
+            __m256i sum0 = _mm256_setzero_si256();
+            __m256i sum1 = _mm256_setzero_si256();
+            __m256i sum2 = _mm256_setzero_si256();
+            __m256i sum3 = _mm256_setzero_si256();
+
+            const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]);
+            const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]);
+            const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]);
+            const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]);
+
+            for (IndexType j = 0; j < kNumChunks256; ++j)
+            {
+              const __m256i in = input_vector256[j];
+
+              m256_add_dpbusd_epi32(sum0, in, row0[j]);
+              m256_add_dpbusd_epi32(sum1, in, row1[j]);
+              m256_add_dpbusd_epi32(sum2, in, row2[j]);
+              m256_add_dpbusd_epi32(sum3, in, row3[j]);
+            }
+
+            *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias);
+          }
+        }
+      }
+      else if constexpr (kOutputDimensions == 1)
+      {
+        if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0)
+        {
+          __m512i sum0 = _mm512_setzero_si512();
+
+          const auto row0 = reinterpret_cast<const __m512i*>(&weights_[0]);
+
+          for (IndexType j = 0; j < kNumChunks512; ++j)
+          {
+            const __m512i in = input_vector512[j];
+
+            m512_add_dpbusd_epi32(sum0, in, row0[j]);
+          }
+
+          output[0] = m512_hadd(sum0, biases_[0]);
+        }
+        else
+        {
+          __m256i sum0 = _mm256_setzero_si256();
+
+          const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]);
+
+          for (IndexType j = 0; j < kNumChunks256; ++j)
+          {
+            const __m256i in = input_vector256[j];
+
+            m256_add_dpbusd_epi32(sum0, in, row0[j]);
+          }
+
+          output[0] = m256_hadd(sum0, biases_[0]);
+        }
+      }
+      else
+      {
+        // This case can never happen because kOutputDimensions
+        // is always 1 or a multiple of kSimdWidth.
+        assert(false);
+      }
+
+#elif defined (USE_AVX2)
 
-  #elif defined(USE_AVX2)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+
+      const auto output = reinterpret_cast<OutputType*>(buffer);
       const auto input_vector = reinterpret_cast<const __m256i*>(input);
-  #if !defined(USE_VNNI)
-      const __m256i kOnes = _mm256_set1_epi16(1);
-  #endif
 
-  #elif defined(USE_SSE2)
+      // kOutputDimensions is either 1 or a multiple of kSimdWidth
+      // because then it is also an input dimension.
+      if constexpr (kOutputDimensions % 4 == 0)
+      {
+        for (IndexType i = 0; i < kOutputDimensions; i += 4)
+        {
+          const IndexType offset0 = (i + 0) * kPaddedInputDimensions;
+          const IndexType offset1 = (i + 1) * kPaddedInputDimensions;
+          const IndexType offset2 = (i + 2) * kPaddedInputDimensions;
+          const IndexType offset3 = (i + 3) * kPaddedInputDimensions;
+
+          const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
+          __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
+
+          __m256i sum0 = _mm256_setzero_si256();
+          __m256i sum1 = _mm256_setzero_si256();
+          __m256i sum2 = _mm256_setzero_si256();
+          __m256i sum3 = _mm256_setzero_si256();
+
+          const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]);
+          const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]);
+          const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]);
+          const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]);
+
+          for (IndexType j = 0; j < kNumChunks; ++j)
+          {
+            const __m256i in = input_vector[j];
+
+            m256_add_dpbusd_epi32(sum0, in, row0[j]);
+            m256_add_dpbusd_epi32(sum1, in, row1[j]);
+            m256_add_dpbusd_epi32(sum2, in, row2[j]);
+            m256_add_dpbusd_epi32(sum3, in, row3[j]);
+          }
+
+          *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias);
+        }
+      }
+      else if constexpr (kOutputDimensions == 1)
+      {
+        __m256i sum0 = _mm256_setzero_si256();
+
+        const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]);
+
+        for (IndexType j = 0; j < kNumChunks; ++j)
+        {
+          const __m256i in = input_vector[j];
+
+            m256_add_dpbusd_epi32(sum0, in, row0[j]);
+        }
+
+        output[0] = m256_hadd(sum0, biases_[0]);
+      }
+      else
+      {
+        // This case can never happen because kOutputDimensions
+        // is always 1 or a multiple of kSimdWidth.
+        assert(false);
+      }
+
+#elif defined (USE_SSSE3)
+
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
-  #ifndef USE_SSSE3
-      const __m128i kZeros = _mm_setzero_si128();
-  #else
-      const __m128i kOnes = _mm_set1_epi16(1);
-  #endif
+
+      auto output = reinterpret_cast<OutputType*>(buffer);
       const auto input_vector = reinterpret_cast<const __m128i*>(input);
 
-  #elif defined(USE_MMX)
+      // kOutputDimensions is either 1 or a multiple of kSimdWidth
+      // because then it is also an input dimension.
+      if constexpr (kOutputDimensions % 4 == 0)
+      {
+        for (IndexType i = 0; i < kOutputDimensions; i += 4)
+        {
+          const IndexType offset0 = (i + 0) * kPaddedInputDimensions;
+          const IndexType offset1 = (i + 1) * kPaddedInputDimensions;
+          const IndexType offset2 = (i + 2) * kPaddedInputDimensions;
+          const IndexType offset3 = (i + 3) * kPaddedInputDimensions;
+
+          const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
+          __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
+
+          __m128i sum0 = _mm_setzero_si128();
+          __m128i sum1 = _mm_setzero_si128();
+          __m128i sum2 = _mm_setzero_si128();
+          __m128i sum3 = _mm_setzero_si128();
+
+          const auto row0 = reinterpret_cast<const __m128i*>(&weights_[offset0]);
+          const auto row1 = reinterpret_cast<const __m128i*>(&weights_[offset1]);
+          const auto row2 = reinterpret_cast<const __m128i*>(&weights_[offset2]);
+          const auto row3 = reinterpret_cast<const __m128i*>(&weights_[offset3]);
+
+          for (int j = 0; j < (int)kNumChunks; j += 1)
+          {
+            const __m128i in = input_vector[j];
+
+            m128_add_dpbusd_epi32(sum0, in, row0[j]);
+            m128_add_dpbusd_epi32(sum1, in, row1[j]);
+            m128_add_dpbusd_epi32(sum2, in, row2[j]);
+            m128_add_dpbusd_epi32(sum3, in, row3[j]);
+          }
+
+          *outptr = m128_haddx4(sum0, sum1, sum2, sum3, bias);
+        }
+      }
+      else if constexpr (kOutputDimensions == 1)
+      {
+        __m128i sum0 = _mm_setzero_si128();
+
+        const auto row0 = reinterpret_cast<const __m128i*>(&weights_[0]);
+
+        for (int j = 0; j < (int)kNumChunks; j += 1)
+        {
+          const __m128i in = input_vector[j];
+
+          m128_add_dpbusd_epi32(sum0, in, row0[j]);
+        }
+
+        output[0] = m128_hadd(sum0, biases_[0]);
+      }
+      else
+      {
+        // This case can never happen because kOutputDimensions
+        // is always 1 or a multiple of kSimdWidth.
+        assert(false);
+      }
+
+#else
+
+// Use old implementation for the other architectures.
+
+      auto output = reinterpret_cast<OutputType*>(buffer);
+
+#if defined(USE_SSE2)
+      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+#ifndef USE_SSSE3
+      const __m128i kZeros = _mm_setzero_si128();
+#else
+      const __m128i kOnes = _mm_set1_epi16(1);
+#endif
+      const auto input_vector = reinterpret_cast<const __m128i*>(input);
+
+#elif defined(USE_MMX)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
       const __m64 kZeros = _mm_setzero_si64();
       const auto input_vector = reinterpret_cast<const __m64*>(input);
 
-  #elif defined(USE_NEON)
+#elif defined(USE_NEON)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
       const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
-  #endif
+#endif
 
       for (IndexType i = 0; i < kOutputDimensions; ++i) {
         const IndexType offset = i * kPaddedInputDimensions;
 
-  #if defined(USE_AVX512)
-        __m512i sum = _mm512_setzero_si512();
-        const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-  #if defined(USE_VNNI)
-            sum = _mm512_dpbusd_epi32(sum, _mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
-  #else
-            __m512i product = _mm512_maddubs_epi16(_mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
-            product = _mm512_madd_epi16(product, kOnes);
-            sum = _mm512_add_epi32(sum, product);
-  #endif
-        }
-
-        // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks.
-        // As a result kPaddedInputDimensions may not be an even multiple of 64(512bit)
-        // and we have to do one more 256bit chunk.
-        if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2)
-        {
-            const auto iv256  = reinterpret_cast<const __m256i*>(&input_vector[kNumChunks]);
-            const auto row256 = reinterpret_cast<const __m256i*>(&row[kNumChunks]);
-  #if defined(USE_VNNI)
-            __m256i product256 = _mm256_dpbusd_epi32(
-                _mm512_castsi512_si256(sum), _mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
-            sum = _mm512_inserti32x8(sum, product256, 0);
-  #else
-            __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
-            sum = _mm512_add_epi32(sum, _mm512_cvtepi16_epi32(product256));
-  #endif
-        }
-        output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
-
-  #elif defined(USE_AVX2)
-        __m256i sum = _mm256_setzero_si256();
-        const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-  #if defined(USE_VNNI)
-          sum = _mm256_dpbusd_epi32(sum, _mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
-  #else
-          __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
-          product = _mm256_madd_epi16(product, kOnes);
-          sum = _mm256_add_epi32(sum, product);
-  #endif
-        }
-        __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
-        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
-        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
-        output[i] = _mm_cvtsi128_si32(sum128) + biases_[i];
-
-  #elif defined(USE_SSSE3)
-        __m128i sum = _mm_setzero_si128();
-        const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
-        for (int j = 0; j < (int)kNumChunks - 1; j += 2) {
-          __m128i product0 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
-          product0 = _mm_madd_epi16(product0, kOnes);
-          sum = _mm_add_epi32(sum, product0);
-          __m128i product1 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j+1]), _mm_load_si128(&row[j+1]));
-          product1 = _mm_madd_epi16(product1, kOnes);
-          sum = _mm_add_epi32(sum, product1);
-        }
-        if (kNumChunks & 0x1) {
-          __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[kNumChunks-1]), _mm_load_si128(&row[kNumChunks-1]));
-          product = _mm_madd_epi16(product, kOnes);
-          sum = _mm_add_epi32(sum, product);
-        }
-        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
-        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
-        output[i] = _mm_cvtsi128_si32(sum) + biases_[i];
-
-  #elif defined(USE_SSE2)
+#if defined(USE_SSE2)
         __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]);
         __m128i sum_hi = kZeros;
         const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
@@ -204,7 +491,7 @@ namespace Eval::NNUE::Layers {
         sum = _mm_add_epi32(sum, sum_second_32);
         output[i] = _mm_cvtsi128_si32(sum);
 
-  #elif defined(USE_MMX)
+#elif defined(USE_MMX)
         __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]);
         __m64 sum_hi = kZeros;
         const auto row = reinterpret_cast<const __m64*>(&weights_[offset]);
@@ -225,7 +512,7 @@ namespace Eval::NNUE::Layers {
         sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
         output[i] = _mm_cvtsi64_si32(sum);
 
-  #elif defined(USE_NEON)
+#elif defined(USE_NEON)
         int32x4_t sum = {biases_[i]};
         const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
@@ -235,18 +522,21 @@ namespace Eval::NNUE::Layers {
         }
         output[i] = sum[0] + sum[1] + sum[2] + sum[3];
 
-  #else
+#else
         OutputType sum = biases_[i];
         for (IndexType j = 0; j < kInputDimensions; ++j) {
           sum += weights_[offset + j] * input[j];
         }
         output[i] = sum;
-  #endif
+#endif
 
       }
-  #if defined(USE_MMX)
+#if defined(USE_MMX)
       _mm_empty();
-  #endif
+#endif
+
+#endif
+
       return output;
     }
 

From 931070b65ac0332469a24765a60eb27e038f73bc Mon Sep 17 00:00:00 2001
From: FauziAkram <fauzi.dabat@hotmail.com>
Date: Thu, 29 Oct 2020 17:33:18 +0300
Subject: [PATCH 305/398] Elo Worth in King Danger

Adding the EloWorth for each term in King Danger.
Should be useful for simplifications, tuning patches, and new ideas.

closes https://github.com/official-stockfish/Stockfish/pull/3204

non-functional change
---
 src/evaluate.cpp | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 030d1017..4ade46fa 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -582,18 +582,18 @@ namespace {
     int kingFlankAttack  = popcount(b1) + popcount(b2);
     int kingFlankDefense = popcount(b3);
 
-    kingDanger +=        kingAttackersCount[Them] * kingAttackersWeight[Them]
-                 + 185 * popcount(kingRing[Us] & weak)
-                 + 148 * popcount(unsafeChecks)
-                 +  98 * popcount(pos.blockers_for_king(Us))
-                 +  69 * kingAttacksCount[Them]
-                 +   3 * kingFlankAttack * kingFlankAttack / 8
-                 +       mg_value(mobility[Them] - mobility[Us])
-                 - 873 * !pos.count<QUEEN>(Them)
-                 - 100 * bool(attackedBy[Us][KNIGHT] & attackedBy[Us][KING])
-                 -   6 * mg_value(score) / 8
-                 -   4 * kingFlankDefense
-                 +  37;
+    kingDanger +=        kingAttackersCount[Them] * kingAttackersWeight[Them] // (~10 Elo)
+                 + 185 * popcount(kingRing[Us] & weak)                        // (~15 Elo)
+                 + 148 * popcount(unsafeChecks)                               // (~4 Elo)
+                 +  98 * popcount(pos.blockers_for_king(Us))                  // (~2 Elo)
+                 +  69 * kingAttacksCount[Them]                               // (~0.5 Elo)
+                 +   3 * kingFlankAttack * kingFlankAttack / 8                // (~0.5 Elo)
+                 +       mg_value(mobility[Them] - mobility[Us])              // (~0.5 Elo)
+                 - 873 * !pos.count<QUEEN>(Them)                              // (~24 Elo)
+                 - 100 * bool(attackedBy[Us][KNIGHT] & attackedBy[Us][KING])  // (~5 Elo)
+                 -   6 * mg_value(score) / 8                                  // (~8 Elo)
+                 -   4 * kingFlankDefense                                     // (~5 Elo)
+                 +  37;                                                       // (~0.5 Elo)
 
     // Transform the kingDanger units into a Score, and subtract it from the evaluation
     if (kingDanger > 100)

From a260c9a8a24a2630a900efc3821000c3481b0c5d Mon Sep 17 00:00:00 2001
From: "J. Oster" <osterj165@googlemail.com>
Date: Sun, 1 Nov 2020 18:33:17 +0100
Subject: [PATCH 306/398] Fix incorrect pruning in qsearch

Only do countermove based pruning in qsearch if we already have a move with a better score than a TB loss.

This patch fixes a bug (started as 843a961) that incorrectly prunes moves if in check,
and adds an assert to make sure no wrong mate scores are given in the future.
It replaces a no-op moveCount check with a check for bestValue.

Initially discussed in #3171 and later in #3199, #3198 and #3210.
This PR effectively closes #3171
It also likely fixes #3196 where this causes user visible incorrect TB scores,
which probably result from these incorrect mate scores.

Passed STC and LTC non-regression tests.
https://tests.stockfishchess.org/tests/view/5f9ef8dabca9bf35bae7f648
LLR: 2.93 (-2.94,2.94) {-1.25,0.25}
Total: 21672 W: 2339 L: 2230 D: 17103
Ptnml(0-2): 126, 1689, 7083, 1826, 112

https://tests.stockfishchess.org/tests/view/5f9f0caebca9bf35bae7f666
LLR: 2.97 (-2.94,2.94) {-0.75,0.25}
Total: 33152 W: 1551 L: 1485 D: 30116
Ptnml(0-2): 27, 1308, 13832, 1390, 19

closes https://github.com/official-stockfish/Stockfish/pull/3214

Bench: 3625915
---
 src/search.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/search.cpp b/src/search.cpp
index 65ed9b73..743449fa 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1565,7 +1565,7 @@ moves_loop: // When in check, search starts from here
 
       // CounterMove based pruning
       if (  !captureOrPromotion
-          && moveCount
+          && bestValue > VALUE_TB_LOSS_IN_MAX_PLY
           && (*contHist[0])[pos.moved_piece(move)][to_sq(move)] < CounterMovePruneThreshold
           && (*contHist[1])[pos.moved_piece(move)][to_sq(move)] < CounterMovePruneThreshold)
           continue;
@@ -1600,7 +1600,11 @@ moves_loop: // When in check, search starts from here
     // All legal moves have been searched. A special case: if we're in check
     // and no legal moves were found, it is checkmate.
     if (ss->inCheck && bestValue == -VALUE_INFINITE)
+    {
+        assert(!MoveList<LEGAL>(pos).size());
+
         return mated_in(ss->ply); // Plies to mate from the root
+    }
 
     tte->save(posKey, value_to_tt(bestValue, ss->ply), pvHit,
               bestValue >= beta ? BOUND_LOWER :

From 3f6451eff7c62e8d4a33c5b11f055a81b3da8387 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 3 Nov 2020 11:23:35 +0100
Subject: [PATCH 307/398] Manually align arrays on the stack

as a workaround to issues with overaligned alignas() on stack variables in gcc < 9.3 on windows.

closes https://github.com/official-stockfish/Stockfish/pull/3217

fixes #3216

No functional change
---
 src/misc.h                          | 12 ++++++++++++
 src/nnue/evaluate_nnue.cpp          | 25 ++++++++++++++++++++++---
 src/nnue/layers/clipped_relu.h      | 10 +++++-----
 src/nnue/nnue_common.h              | 23 -----------------------
 src/nnue/nnue_feature_transformer.h | 14 +++++++-------
 src/position.cpp                    |  4 ++++
 src/search.cpp                      |  8 ++++++++
 src/types.h                         |  6 ++++++
 8 files changed, 64 insertions(+), 38 deletions(-)

diff --git a/src/misc.h b/src/misc.h
index bc48f303..682ef816 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -24,6 +24,7 @@
 #include <ostream>
 #include <string>
 #include <vector>
+#include <cstdint>
 
 #include "types.h"
 
@@ -63,6 +64,17 @@ std::ostream& operator<<(std::ostream&, SyncCout);
 #define sync_cout std::cout << IO_LOCK
 #define sync_endl std::endl << IO_UNLOCK
 
+// `ptr` must point to an array of size at least
+// `sizeof(T) * N + alignment` bytes, where `N` is the
+// number of elements in the array.
+template <uintptr_t Alignment, typename T>
+T* align_ptr_up(T* ptr)
+{
+  static_assert(alignof(T) < Alignment);
+
+  const uintptr_t ptrint = reinterpret_cast<uintptr_t>(reinterpret_cast<char*>(ptr));
+  return reinterpret_cast<T*>(reinterpret_cast<char*>((ptrint + (Alignment - 1)) / Alignment * Alignment));
+}
 
 /// xorshift64star Pseudo-Random Number Generator
 /// This class is based on original code written and dedicated
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index b5dcd992..b0ed7d2f 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -25,6 +25,7 @@
 #include "../position.h"
 #include "../misc.h"
 #include "../uci.h"
+#include "../types.h"
 
 #include "evaluate_nnue.h"
 
@@ -126,10 +127,28 @@ namespace Eval::NNUE {
   // Evaluation function. Perform differential calculation.
   Value evaluate(const Position& pos) {
 
-    alignas(kCacheLineSize) TransformedFeatureType
-        transformed_features[FeatureTransformer::kBufferSize];
+    // We manually align the arrays on the stack because with gcc < 9.3
+    // overaligning stack variables with alignas() doesn't work correctly.
+
+    constexpr uint64_t alignment = kCacheLineSize;
+
+#if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN)
+    TransformedFeatureType transformed_features_unaligned[
+      FeatureTransformer::kBufferSize + alignment / sizeof(TransformedFeatureType)];
+    char buffer_unaligned[Network::kBufferSize + alignment];
+
+    auto* transformed_features = align_ptr_up<alignment>(&transformed_features_unaligned[0]);
+    auto* buffer = align_ptr_up<alignment>(&buffer_unaligned[0]);
+#else
+    alignas(alignment)
+      TransformedFeatureType transformed_features[FeatureTransformer::kBufferSize];
+    alignas(alignment) char buffer[Network::kBufferSize];
+#endif
+
+    ASSERT_ALIGNED(transformed_features, alignment);
+    ASSERT_ALIGNED(buffer, alignment);
+
     feature_transformer->Transform(pos, transformed_features);
-    alignas(kCacheLineSize) char buffer[Network::kBufferSize];
     const auto output = network->Propagate(transformed_features, buffer);
 
     return static_cast<Value>(output[0] / FV_SCALE);
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index 44d8a7de..7f6d67bf 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -74,12 +74,12 @@ namespace Eval::NNUE::Layers {
       const auto out = reinterpret_cast<__m256i*>(output);
       for (IndexType i = 0; i < kNumChunks; ++i) {
         const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
-            _mm256_loadA_si256(&in[i * 4 + 0]),
-            _mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits);
+            _mm256_load_si256(&in[i * 4 + 0]),
+            _mm256_load_si256(&in[i * 4 + 1])), kWeightScaleBits);
         const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
-            _mm256_loadA_si256(&in[i * 4 + 2]),
-            _mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits);
-        _mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+            _mm256_load_si256(&in[i * 4 + 2]),
+            _mm256_load_si256(&in[i * 4 + 3])), kWeightScaleBits);
+        _mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
             _mm256_packs_epi16(words0, words1), kZero), kOffsets));
       }
       constexpr IndexType kStart = kNumChunks * kSimdWidth;
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index 8afea186..a9664262 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -43,29 +43,6 @@
 #include <arm_neon.h>
 #endif
 
-// HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Otherwise a binary
-//       compiled with older g++ crashes because the output memory is not aligned
-//       even though alignas is specified.
-#if defined(USE_AVX2)
-#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__)
-#define _mm256_loadA_si256  _mm256_loadu_si256
-#define _mm256_storeA_si256 _mm256_storeu_si256
-#else
-#define _mm256_loadA_si256  _mm256_load_si256
-#define _mm256_storeA_si256 _mm256_store_si256
-#endif
-#endif
-
-#if defined(USE_AVX512)
-#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__)
-#define _mm512_loadA_si512   _mm512_loadu_si512
-#define _mm512_storeA_si512  _mm512_storeu_si512
-#else
-#define _mm512_loadA_si512   _mm512_load_si512
-#define _mm512_storeA_si512  _mm512_store_si512
-#endif
-#endif
-
 namespace Eval::NNUE {
 
   // Version of the evaluation file
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index f145c848..c3f012e4 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -36,16 +36,16 @@ namespace Eval::NNUE {
 
   #ifdef USE_AVX512
   typedef __m512i vec_t;
-  #define vec_load(a) _mm512_loadA_si512(a)
-  #define vec_store(a,b) _mm512_storeA_si512(a,b)
+  #define vec_load(a) _mm512_load_si512(a)
+  #define vec_store(a,b) _mm512_store_si512(a,b)
   #define vec_add_16(a,b) _mm512_add_epi16(a,b)
   #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
   static constexpr IndexType kNumRegs = 8; // only 8 are needed
 
   #elif USE_AVX2
   typedef __m256i vec_t;
-  #define vec_load(a) _mm256_loadA_si256(a)
-  #define vec_store(a,b) _mm256_storeA_si256(a,b)
+  #define vec_load(a) _mm256_load_si256(a)
+  #define vec_store(a,b) _mm256_store_si256(a,b)
   #define vec_add_16(a,b) _mm256_add_epi16(a,b)
   #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
   static constexpr IndexType kNumRegs = 16;
@@ -157,11 +157,11 @@ namespace Eval::NNUE {
   #if defined(USE_AVX2)
         auto out = reinterpret_cast<__m256i*>(&output[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m256i sum0 = _mm256_loadA_si256(
+          __m256i sum0 = _mm256_load_si256(
               &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
-          __m256i sum1 = _mm256_loadA_si256(
+          __m256i sum1 = _mm256_load_si256(
             &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
-          _mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
+          _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
               _mm256_packs_epi16(sum0, sum1), kZero), kControl));
         }
 
diff --git a/src/position.cpp b/src/position.cpp
index b707293d..5ce7da22 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -77,6 +77,8 @@ std::ostream& operator<<(std::ostream& os, const Position& pos) {
       && !pos.can_castle(ANY_CASTLING))
   {
       StateInfo st;
+      ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize);
+
       Position p;
       p.set(pos.fen(), pos.is_chess960(), &st, pos.this_thread());
       Tablebases::ProbeState s1, s2;
@@ -1318,6 +1320,8 @@ bool Position::pos_is_ok() const {
               assert(0 && "pos_is_ok: Bitboards");
 
   StateInfo si = *st;
+  ASSERT_ALIGNED(&si, Eval::NNUE::kCacheLineSize);
+
   set_state(&si);
   if (std::memcmp(&si, st, sizeof(StateInfo)))
       assert(0 && "pos_is_ok: State");
diff --git a/src/search.cpp b/src/search.cpp
index 743449fa..12c32194 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -164,6 +164,8 @@ namespace {
   uint64_t perft(Position& pos, Depth depth) {
 
     StateInfo st;
+    ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize);
+
     uint64_t cnt, nodes = 0;
     const bool leaf = (depth == 2);
 
@@ -590,6 +592,8 @@ namespace {
 
     Move pv[MAX_PLY+1], capturesSearched[32], quietsSearched[64];
     StateInfo st;
+    ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize);
+
     TTEntry* tte;
     Key posKey;
     Move ttMove, move, excludedMove, bestMove;
@@ -1403,6 +1407,8 @@ moves_loop: // When in check, search starts from here
 
     Move pv[MAX_PLY+1];
     StateInfo st;
+    ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize);
+
     TTEntry* tte;
     Key posKey;
     Move ttMove, move, bestMove;
@@ -1898,6 +1904,8 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) {
 bool RootMove::extract_ponder_from_tt(Position& pos) {
 
     StateInfo st;
+    ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize);
+
     bool ttHit;
 
     assert(pv.size() == 1);
diff --git a/src/types.h b/src/types.h
index 5873c698..bf692f7e 100644
--- a/src/types.h
+++ b/src/types.h
@@ -57,6 +57,12 @@
 /// _WIN32             Building on Windows (any)
 /// _WIN64             Building on Windows 64 bit
 
+#if defined(__GNUC__ ) && (__GNUC__ < 9 || (__GNUC__ == 9 && __GNUC_MINOR__ <= 2)) && defined(_WIN32) && !defined(__clang__)
+#define ALIGNAS_ON_STACK_VARIABLES_BROKEN
+#endif
+
+#define ASSERT_ALIGNED(ptr, alignment) assert(reinterpret_cast<uintptr_t>(ptr) % alignment == 0)
+
 #if defined(_WIN64) && defined(_MSC_VER) // No Makefile used
 #  include <intrin.h> // Microsoft header for _BitScanForward64()
 #  define IS_64BIT

From 04a320666efce725ef66d1a84aaef493a880153d Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Fri, 23 Oct 2020 07:39:35 +0200
Subject: [PATCH 308/398] Change handling the special case of a single legal
 move.

Using no searching time in case of a single legal move is not beneficial from
a strength point of view, and this special case can be easily removed:

STC:
LLR: 2.93 (-2.94,2.94) {-1.25,0.25}
Total: 22472 W: 2458 L: 2357 D: 17657
Ptnml(0-2): 106, 1733, 7453, 1842, 102
https://tests.stockfishchess.org/tests/view/5f926cbc81eda81bd78cb6df

LTC:
LLR: 2.94 (-2.94,2.94) {-0.75,0.25}
Total: 37880 W: 1736 L: 1682 D: 34462
Ptnml(0-2): 22, 1392, 16057, 1448, 21
https://tests.stockfishchess.org/tests/view/5f92a26081eda81bd78cb6fe

The advantage of using the normal time management for a single legal move is that scores
reported for that move are reasonable, not searching leads to artifacts during games
(see e.g. https://tcec-chess.com/#div=sf&game=96&season=19)

The disadvantage of using normal time management of a single legal move is that thinking
times can be unnaturally long, making it 'painful to watch' in online tournaments.

This patch uses normal time management, but caps the used time to 500ms.
This should lead to reasonable scores, and be hardly perceptible.

closes https://github.com/official-stockfish/Stockfish/pull/3195
closes https://github.com/official-stockfish/Stockfish/pull/3183

variant of a patch suggested by SFisGOD

No functional change.
---
 src/search.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 12c32194..6e37fba1 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -521,10 +521,14 @@ void Thread::search() {
           }
           double bestMoveInstability = 1 + 2 * totBestMoveChanges / Threads.size();
 
-          double totalTime = rootMoves.size() == 1 ? 0 :
-                             Time.optimum() * fallingEval * reduction * bestMoveInstability;
+          double totalTime = Time.optimum() * fallingEval * reduction * bestMoveInstability;
 
-          // Stop the search if we have exceeded the totalTime, at least 1ms search
+          // Cap used time in case of a single legal move for a better viewer experience in tournaments
+          // yielding correct scores and sufficiently fast moves.
+          if (rootMoves.size() == 1)
+              totalTime = std::min(500.0, totalTime);
+
+          // Stop the search if we have exceeded the totalTime
           if (Time.elapsed() > totalTime)
           {
               // If we are allowed to ponder do not stop the search now but

From 7fc47eeb6f6b5f3c5ff697e974093ff14413e42c Mon Sep 17 00:00:00 2001
From: FauziAkram <fauzi.dabat@hotmail.com>
Date: Thu, 5 Nov 2020 01:54:53 +0200
Subject: [PATCH 309/398] Introducing King On File

this new concept calculates bonuses/penalties for the king when the king is in a semiopen or open file.

Passed STC:
LLR: 2.95 (-2.94,2.94) {-0.25,1.25}
Total: 44904 W: 9365 L: 9028 D: 26511
Ptnml(0-2): 857, 5309, 9841, 5530, 915
https://tests.stockfishchess.org/tests/view/5fa343625d72639a7acef72b

Passed LTC:
LLR: 2.94 (-2.94,2.94) {0.25,1.25}
Total: 60552 W: 8449 L: 8051 D: 44052
Ptnml(0-2): 466, 5772, 17481, 6012, 545
https://tests.stockfishchess.org/tests/view/5fa40e365d72639a7acef79e

closes https://github.com/official-stockfish/Stockfish/pull/3219

Bench: 3689484
---
 src/pawns.cpp | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/pawns.cpp b/src/pawns.cpp
index a5102db8..fde70ba5 100644
--- a/src/pawns.cpp
+++ b/src/pawns.cpp
@@ -49,10 +49,10 @@ namespace {
   // Strength of pawn shelter for our king by [distance from edge][rank].
   // RANK_1 = 0 is used for files where we have no pawn, or pawn is behind our king.
   constexpr Value ShelterStrength[int(FILE_NB) / 2][RANK_NB] = {
-    { V( -6), V( 81), V( 93), V( 58), V( 39), V( 18), V(  25) },
-    { V(-43), V( 61), V( 35), V(-49), V(-29), V(-11), V( -63) },
-    { V(-10), V( 75), V( 23), V( -2), V( 32), V(  3), V( -45) },
-    { V(-39), V(-13), V(-29), V(-52), V(-48), V(-67), V(-166) }
+    { V( -5), V( 82), V( 92), V( 54), V( 36), V( 22), V(  28) },
+    { V(-44), V( 63), V( 33), V(-50), V(-30), V(-12), V( -62) },
+    { V(-11), V( 77), V( 22), V( -6), V( 31), V(  8), V( -45) },
+    { V(-39), V(-12), V(-29), V(-50), V(-43), V(-68), V(-164) }
   };
 
   // Danger of enemy pawns moving toward our king by [distance from edge][rank].
@@ -60,12 +60,17 @@ namespace {
   // is behind our king. Note that UnblockedStorm[0][1-2] accommodate opponent pawn
   // on edge, likely blocked by our king.
   constexpr Value UnblockedStorm[int(FILE_NB) / 2][RANK_NB] = {
-    { V( 85), V(-289), V(-166), V(97), V(50), V( 45), V( 50) },
-    { V( 46), V( -25), V( 122), V(45), V(37), V(-10), V( 20) },
-    { V( -6), V(  51), V( 168), V(34), V(-2), V(-22), V(-14) },
-    { V(-15), V( -11), V( 101), V( 4), V(11), V(-15), V(-29) }
+    { V( 87), V(-288), V(-168), V( 96), V( 47), V( 44), V( 46) },
+    { V( 42), V( -25), V( 120), V( 45), V( 34), V( -9), V( 24) },
+    { V( -8), V(  51), V( 167), V( 35), V( -4), V(-16), V(-12) },
+    { V(-17), V( -13), V( 100), V(  4), V(  9), V(-16), V(-31) }
   };
 
+  // KingOnFile[semi-open Us][semi-open Them] contains bonuses/penalties
+  // for king when the king is on a semi-open or open file.
+  constexpr Score KingOnFile[2][2] = {{ S(-19,12), S(-6, 7)  },
+                                     {  S(  0, 2), S( 6,-5) }};
+
   #undef S
   #undef V
 
@@ -237,6 +242,9 @@ Score Entry::evaluate_shelter(const Position& pos, Square ksq) const {
           bonus -= make_score(UnblockedStorm[d][theirRank], 0);
   }
 
+  // King On File
+  bonus -= KingOnFile[pos.is_on_semiopen_file(Us, ksq)][pos.is_on_semiopen_file(Them, ksq)];
+
   return bonus;
 }
 

From ba35c88ab84b959d41a67b3d8fcb40adc6537ec8 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 3 Nov 2020 22:49:10 +0100
Subject: [PATCH 310/398] AVX-512 for smaller affine and feature transforms.

For the feature transformer the code is analogical to AVX2 since there was room for easy adaptation of wider simd registers.

For the smaller affine transforms that have 32 byte stride we keep 2 columns in one zmm register. We also unroll more aggressively so that in the end we have to do 16 parallel horizontal additions on ymm slices each consisting of 4 32-bit integers. The slices are embedded in 8 zmm registers.

These changes provide about 1.5% speedup for AVX-512 builds.

Closes https://github.com/official-stockfish/Stockfish/pull/3218

No functional change.
---
 src/nnue/layers/affine_transform.h  | 129 +++++++++++++++++++++++++++-
 src/nnue/nnue_feature_transformer.h |  27 ++++--
 2 files changed, 148 insertions(+), 8 deletions(-)

diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index f0292e45..47c9c488 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -83,7 +83,21 @@ namespace Eval::NNUE::Layers {
         return _mm512_reduce_add_epi32(sum) + bias;
       };
 
-      [[maybe_unused]] auto m512_haddx4 = [](__m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, __m128i bias) -> __m128i {
+      // This function takes
+      //   sum0 = [xmm0a, xmm0b, xmm0c, xmm0d]
+      //   sum1 = [xmm1a, xmm1b, xmm1c, xmm1d]
+      //   sum2 = [xmm2a, xmm2b, xmm2c, xmm2d]
+      //   sum3 = [xmm3a, xmm3b, xmm3c, xmm3d]
+      // and returns
+      //   ret = [
+      //     reduce_add_epi32(xmm0a), reduce_add_epi32(xmm1a), reduce_add_epi32(xmm2a), reduce_add_epi32(xmm3a),
+      //     reduce_add_epi32(xmm0b), reduce_add_epi32(xmm1b), reduce_add_epi32(xmm2b), reduce_add_epi32(xmm3b),
+      //     reduce_add_epi32(xmm0c), reduce_add_epi32(xmm1c), reduce_add_epi32(xmm2c), reduce_add_epi32(xmm3c),
+      //     reduce_add_epi32(xmm0d), reduce_add_epi32(xmm1d), reduce_add_epi32(xmm2d), reduce_add_epi32(xmm3d)
+      //   ]
+      [[maybe_unused]] auto m512_hadd128x16_interleave = [](
+        __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3) -> __m512i {
+
         __m512i sum01a = _mm512_unpacklo_epi32(sum0, sum1);
         __m512i sum01b = _mm512_unpackhi_epi32(sum0, sum1);
 
@@ -96,7 +110,13 @@ namespace Eval::NNUE::Layers {
         __m512i sum0123a = _mm512_unpacklo_epi64(sum01, sum23);
         __m512i sum0123b = _mm512_unpackhi_epi64(sum01, sum23);
 
-        __m512i sum = _mm512_add_epi32(sum0123a, sum0123b);
+        return _mm512_add_epi32(sum0123a, sum0123b);
+      };
+
+      [[maybe_unused]] auto m512_haddx4 = [m512_hadd128x16_interleave](
+        __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, __m128i bias) -> __m128i {
+
+        __m512i sum = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3);
 
         __m256i sum256lo = _mm512_castsi512_si256(sum);
         __m256i sum256hi = _mm512_extracti64x4_epi64(sum, 1);
@@ -109,6 +129,58 @@ namespace Eval::NNUE::Layers {
         return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias);
       };
 
+      [[maybe_unused]] auto m512_haddx8 = [m512_hadd128x16_interleave](
+        __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3,
+        __m512i sum4, __m512i sum5, __m512i sum6, __m512i sum7, __m256i bias) -> __m256i {
+
+        __m512i suma = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3);
+        __m512i sumb = m512_hadd128x16_interleave(sum4, sum5, sum6, sum7);
+
+        __m512i indices0 = _mm512_setr_epi64(0, 1, 8, 9, 4, 5, 12, 13);
+        __m512i indices1 = _mm512_setr_epi64(2, 3, 10, 11, 6, 7, 14, 15);
+        __m512i x = _mm512_add_epi32(
+          _mm512_permutex2var_epi64(suma, indices0, sumb),
+          _mm512_permutex2var_epi64(suma, indices1, sumb));
+
+        __m256i sum256lo = _mm512_castsi512_si256(x);
+        __m256i sum256hi = _mm512_extracti64x4_epi64(x, 1);
+
+        return _mm256_add_epi32(_mm256_add_epi32(sum256lo, sum256hi), bias);
+      };
+
+      [[maybe_unused]] auto m512_hadd256x8 =[m512_hadd128x16_interleave](
+        __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, __m256i bias) -> __m256i {
+
+        __m512i sum = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3);
+
+        __m512i indices = _mm512_setr_epi32(
+          0, 4, 8, 12, 2, 6, 10, 14,
+          1, 5, 9, 13, 3, 7, 11, 15);
+        sum = _mm512_permutexvar_epi32(indices, sum);
+
+        __m256i sum256lo = _mm512_castsi512_si256(sum);
+        __m256i sum256hi = _mm512_extracti64x4_epi64(sum, 1);
+
+        return _mm256_add_epi32(_mm256_hadd_epi32(sum256lo, sum256hi), bias);
+      };
+
+      [[maybe_unused]] auto m512_hadd256x16 = [m512_hadd128x16_interleave](
+        __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3,
+        __m512i sum4, __m512i sum5, __m512i sum6, __m512i sum7, __m512i bias) -> __m512i {
+
+        __m512i suma = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3);
+        __m512i sumb = m512_hadd128x16_interleave(sum4, sum5, sum6, sum7);
+
+        __m512i indices0 = _mm512_setr_epi64(0, 1, 8, 9, 4, 5, 12, 13);
+        __m512i indices1 = _mm512_setr_epi64(2, 3, 10, 11, 6, 7, 14, 15);
+        __m512i x = _mm512_add_epi32(
+          _mm512_permutex2var_epi64(suma, indices0, sumb),
+          _mm512_permutex2var_epi64(suma, indices1, sumb));
+
+        __m512i indices = _mm512_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15);
+        return _mm512_add_epi32(_mm512_permutexvar_epi32(indices, x), bias);
+      };
+
       [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) {
 #if defined (USE_VNNI)
         acc = _mm512_dpbusd_epi32(acc, a, b);
@@ -205,7 +277,58 @@ namespace Eval::NNUE::Layers {
 
       // kOutputDimensions is either 1 or a multiple of kSimdWidth
       // because then it is also an input dimension.
-      if constexpr (kOutputDimensions % 4 == 0)
+      if constexpr (kOutputDimensions % 16 == 0 && kNumChunks256 == 1)
+      {
+        for (IndexType i = 0; i < kOutputDimensions; i += 16)
+        {
+          const IndexType offset01a = (i + 0) * kPaddedInputDimensions;
+          const IndexType offset23a = (i + 2) * kPaddedInputDimensions;
+          const IndexType offset45a = (i + 4) * kPaddedInputDimensions;
+          const IndexType offset67a = (i + 6) * kPaddedInputDimensions;
+          const IndexType offset01b = (i + 8) * kPaddedInputDimensions;
+          const IndexType offset23b = (i + 10) * kPaddedInputDimensions;
+          const IndexType offset45b = (i + 12) * kPaddedInputDimensions;
+          const IndexType offset67b = (i + 14) * kPaddedInputDimensions;
+
+          const __m512i bias = *reinterpret_cast<const __m512i*>(&biases_[i]);
+          __m512i* outptr = reinterpret_cast<__m512i*>(&output[i]);
+
+          __m512i sum01a = _mm512_setzero_si512();
+          __m512i sum23a = _mm512_setzero_si512();
+          __m512i sum45a = _mm512_setzero_si512();
+          __m512i sum67a = _mm512_setzero_si512();
+          __m512i sum01b = _mm512_setzero_si512();
+          __m512i sum23b = _mm512_setzero_si512();
+          __m512i sum45b = _mm512_setzero_si512();
+          __m512i sum67b = _mm512_setzero_si512();
+
+          const auto row01a = *reinterpret_cast<const __m512i*>(&weights_[offset01a]);
+          const auto row23a = *reinterpret_cast<const __m512i*>(&weights_[offset23a]);
+          const auto row45a = *reinterpret_cast<const __m512i*>(&weights_[offset45a]);
+          const auto row67a = *reinterpret_cast<const __m512i*>(&weights_[offset67a]);
+          const auto row01b = *reinterpret_cast<const __m512i*>(&weights_[offset01b]);
+          const auto row23b = *reinterpret_cast<const __m512i*>(&weights_[offset23b]);
+          const auto row45b = *reinterpret_cast<const __m512i*>(&weights_[offset45b]);
+          const auto row67b = *reinterpret_cast<const __m512i*>(&weights_[offset67b]);
+
+          const __m256i in256 = input_vector256[0];
+          const __m512i in = _mm512_inserti64x4(_mm512_castsi256_si512(in256), in256, 1);
+
+          m512_add_dpbusd_epi32(sum01a, in, row01a);
+          m512_add_dpbusd_epi32(sum23a, in, row23a);
+          m512_add_dpbusd_epi32(sum45a, in, row45a);
+          m512_add_dpbusd_epi32(sum67a, in, row67a);
+          m512_add_dpbusd_epi32(sum01b, in, row01b);
+          m512_add_dpbusd_epi32(sum23b, in, row23b);
+          m512_add_dpbusd_epi32(sum45b, in, row45b);
+          m512_add_dpbusd_epi32(sum67b, in, row67b);
+
+          *outptr = m512_hadd256x16(
+            sum01a, sum23a, sum45a, sum67a,
+            sum01b, sum23b, sum45b, sum67b, bias);
+        }
+      }
+      else if constexpr (kOutputDimensions % 4 == 0)
       {
         for (IndexType i = 0; i < kOutputDimensions; i += 4)
         {
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index c3f012e4..f49777b5 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -127,7 +127,13 @@ namespace Eval::NNUE {
 
       const auto& accumulation = pos.state()->accumulator.accumulation;
 
-  #if defined(USE_AVX2)
+  #if defined(USE_AVX512)
+      constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth * 2);
+      static_assert(kHalfDimensions % (kSimdWidth * 2) == 0);
+      const __m512i kControl = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
+      const __m512i kZero = _mm512_setzero_si512();
+
+  #elif defined(USE_AVX2)
       constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
       constexpr int kControl = 0b11011000;
       const __m256i kZero = _mm256_setzero_si256();
@@ -154,13 +160,24 @@ namespace Eval::NNUE {
       for (IndexType p = 0; p < 2; ++p) {
         const IndexType offset = kHalfDimensions * p;
 
-  #if defined(USE_AVX2)
+  #if defined(USE_AVX512)
+        auto out = reinterpret_cast<__m512i*>(&output[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m512i sum0 = _mm512_load_si512(
+              &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
+          __m512i sum1 = _mm512_load_si512(
+              &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
+          _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(kControl,
+              _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), kZero)));
+        }
+
+  #elif defined(USE_AVX2)
         auto out = reinterpret_cast<__m256i*>(&output[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
           __m256i sum0 = _mm256_load_si256(
               &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
           __m256i sum1 = _mm256_load_si256(
-            &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
+              &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
           _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
               _mm256_packs_epi16(sum0, sum1), kZero), kControl));
         }
@@ -177,9 +194,9 @@ namespace Eval::NNUE {
           _mm_store_si128(&out[j],
 
   #ifdef USE_SSE41
-            _mm_max_epi8(packedbytes, kZero)
+              _mm_max_epi8(packedbytes, kZero)
   #else
-            _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
+              _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
   #endif
 
           );

From 32edb1d009e09a9442cb7393920e072ffd08005d Mon Sep 17 00:00:00 2001
From: SFisGOD <jonathandumale@gmail.com>
Date: Sat, 7 Nov 2020 08:50:02 +0800
Subject: [PATCH 311/398] Update default net to nn-c3ca321c51c9.nnue

Optimization of the net biases of the 32 x 32 layer and the output layer.

Tuning of 32 x 32 layer (200k games, 5 seconds TC)
https://tests.stockfishchess.org/tests/view/5f9aaf266a2c112b60691c68

STC:
LLR: 2.95 (-2.94,2.94) {-0.25,1.25}
Total: 41848 W: 4665 L: 4461 D: 32722
Ptnml(0-2): 239, 3308, 13659, 3446, 272
https://tests.stockfishchess.org/tests/view/5fa5ef5a936c54e11ec9954f

LTC:
LLR: 2.94 (-2.94,2.94) {0.25,1.25}
Total: 88008 W: 4045 L: 3768 D: 80195
Ptnml(0-2): 69, 3339, 36908, 3622, 66
https://tests.stockfishchess.org/tests/view/5fa62a78936c54e11ec99577

closes https://github.com/official-stockfish/Stockfish/pull/3220

Bench: 3649288
---
 src/evaluate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/evaluate.h b/src/evaluate.h
index 6bec27db..06c66f71 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -36,7 +36,7 @@ namespace Eval {
   // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work. Do not change the
   // name of the macro, as it is used in the Makefile.
-  #define EvalFileDefaultName   "nn-cb26f10b1fd9.nnue"
+  #define EvalFileDefaultName   "nn-c3ca321c51c9.nnue"
 
   namespace NNUE {
 

From 392b529c3f52103ad47ad096b86103c17758cb4f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ste=CC=81phane=20Nicolet?= <cassio@free.fr>
Date: Fri, 6 Nov 2020 19:20:27 +0100
Subject: [PATCH 312/398] Qsearch pruning: follow-up

This is a follow-up of the recent qsearch pruning patch in
https://github.com/official-stockfish/Stockfish/commit/a260c9a8a24a2630a900efc3821000c3481b0c5d

We now use the same guard condition (testing that we already have a defense with
a score better  score than a TB loss) for all pruning heuristics in qsearch().
This allows some pruning when in check, but  in a controlled way to ensure that
no wrong mate scores appear.

Tested with Elo-gaining bounds:

STC:
LLR: 2.97 (-2.94,2.94) {-0.25,1.25}
Total: 22632 W: 2433 L: 2264 D: 17935
Ptnml(0-2): 98, 1744, 7487, 1865, 122
https://tests.stockfishchess.org/tests/view/5fa59405936c54e11ec99515

LTC:
LLR: 2.94 (-2.94,2.94) {0.25,1.25}
Total: 105432 W: 4965 L: 4648 D: 95819
Ptnml(0-2): 85, 4110, 44011, 4423, 87
https://tests.stockfishchess.org/tests/view/5fa5b609936c54e11ec9952a

closes https://github.com/official-stockfish/Stockfish/pull/3221

Bench: 3578092
---
 src/search.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 6e37fba1..b5b93bf0 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1525,7 +1525,7 @@ moves_loop: // When in check, search starts from here
       moveCount++;
 
       // Futility pruning
-      if (   !ss->inCheck
+      if (    bestValue > VALUE_TB_LOSS_IN_MAX_PLY
           && !givesCheck
           &&  futilityBase > -VALUE_KNOWN_WIN
           && !pos.advanced_pawn_push(move))
@@ -1552,7 +1552,7 @@ moves_loop: // When in check, search starts from here
       }
 
       // Do not search moves with negative SEE values
-      if (   !ss->inCheck
+      if (    bestValue > VALUE_TB_LOSS_IN_MAX_PLY
           && !(givesCheck && pos.is_discovery_check_on_king(~pos.side_to_move(), move))
           && !pos.see_ge(move))
           continue;

From 5d88e7bce8aa2478db3c5c3ea9a0651b2339d34c Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 4 Nov 2020 20:17:58 +0100
Subject: [PATCH 313/398] Add optional move validation to training data
 conversion. No longer rely on static initialization order for magics
 initialization.

---
 src/extra/nnue_data_binpack_format.h | 770 ++++++++++++++++++++++++---
 src/learn/convert.cpp                |  16 +-
 2 files changed, 709 insertions(+), 77 deletions(-)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index b9e45c3e..ceb5c415 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -3132,7 +3132,11 @@ namespace chess
                 };
             }
 
-            static const EnumArray2<PieceType, Square, Bitboard> pseudoAttacks = generatePseudoAttacks();
+            static const EnumArray2<PieceType, Square, Bitboard>& pseudoAttacks()
+            {
+                static const EnumArray2<PieceType, Square, Bitboard> s_pseudoAttacks = generatePseudoAttacks();
+                return s_pseudoAttacks;
+            }
 
             [[nodiscard]] static Bitboard generatePositiveRayAttacks(Direction dir, Square fromSq)
             {
@@ -3187,24 +3191,29 @@ namespace chess
                 return bbs;
             }
 
-            static const std::array<EnumArray<Square, Bitboard>, 8> positiveRayAttacks = generatePositiveRayAttacks();
+
+            static const std::array<EnumArray<Square, Bitboard>, 8>& positiveRayAttacks()
+            {
+                static const std::array<EnumArray<Square, Bitboard>, 8> s_positiveRayAttacks = generatePositiveRayAttacks();
+                return s_positiveRayAttacks;
+            }
 
             template <Direction DirV>
             [[nodiscard]] static Bitboard slidingAttacks(Square sq, Bitboard occupied)
             {
                 assert(sq.isOk());
 
-                Bitboard attacks = positiveRayAttacks[DirV][sq];
+                Bitboard attacks = positiveRayAttacks()[DirV][sq];
 
                 if constexpr (DirV == NorthWest || DirV == North || DirV == NorthEast || DirV == East)
                 {
                     Bitboard blocker = (attacks & occupied) | h8; // set highest bit (H8) so msb never fails
-                    return attacks ^ positiveRayAttacks[DirV][blocker.first()];
+                    return attacks ^ positiveRayAttacks()[DirV][blocker.first()];
                 }
                 else
                 {
                     Bitboard blocker = (attacks & occupied) | a1;
-                    return attacks ^ positiveRayAttacks[DirV][blocker.last()];
+                    return attacks ^ positiveRayAttacks()[DirV][blocker.last()];
                 }
             }
 
@@ -3290,10 +3299,10 @@ namespace chess
             {
                 for (PieceType pt : { PieceType::Bishop, PieceType::Rook })
                 {
-                    const Bitboard s1Attacks = pseudoAttacks[pt][s1];
+                    const Bitboard s1Attacks = pseudoAttacks()[pt][s1];
                     if (s1Attacks.isSet(s2))
                     {
-                        const Bitboard s2Attacks = pseudoAttacks[pt][s2];
+                        const Bitboard s2Attacks = pseudoAttacks()[pt][s2];
                         return (s1Attacks & s2Attacks) | s1 | s2;
                     }
                 }
@@ -3420,14 +3429,14 @@ namespace chess
 
             assert(sq.isOk());
 
-            return detail::pseudoAttacks[PieceTypeV][sq];
+            return detail::pseudoAttacks()[PieceTypeV][sq];
         }
 
         [[nodiscard]] inline Bitboard pseudoAttacks(PieceType pt, Square sq)
         {
             assert(sq.isOk());
 
-            return detail::pseudoAttacks[pt][sq];
+            return detail::pseudoAttacks()[pt][sq];
         }
 
         [[nodiscard]] inline Bitboard pawnAttacks(Bitboard pawns, Color color)
@@ -4373,6 +4382,22 @@ namespace chess
         std::uint64_t low;
     };
 
+    struct Position;
+
+    struct MoveLegalityChecker
+    {
+        MoveLegalityChecker(const Position& position);
+
+        [[nodiscard]] bool isPseudoLegalMoveLegal(const Move& move) const;
+
+    private:
+        const Position* m_position;
+        Bitboard m_checkers;
+        Bitboard m_ourBlockersForKing;
+        Bitboard m_potentialCheckRemovals;
+        Square m_ksq;
+    };
+
     struct Position : public Board
     {
         using BaseType = Board;
@@ -4412,6 +4437,11 @@ namespace chess
 
         [[nodiscard]] inline std::string fen() const;
 
+        [[nodiscard]] MoveLegalityChecker moveLegalityChecker() const
+        {
+            return { *this };
+        }
+
         constexpr void setEpSquareUnchecked(Square sq)
         {
             m_epSquare = sq;
@@ -4498,6 +4528,8 @@ namespace chess
 
         [[nodiscard]] inline bool isCheckAfterMove(Move move) const;
 
+        [[nodiscard]] inline bool isMoveLegal(Move move) const;
+
         [[nodiscard]] inline bool isPseudoLegalMoveLegal(Move move) const;
 
         [[nodiscard]] inline bool isMovePseudoLegal(Move move) const;
@@ -4665,6 +4697,592 @@ namespace chess
         std::uint8_t m_packedState[16];
     };
 
+    namespace movegen
+    {
+        // For a pseudo-legal move the following are true:
+        //  - the moving piece has the pos.sideToMove() color
+        //  - the destination square is either empty or has a piece of the opposite color
+        //  - if it is a pawn move it is valid (but may be illegal due to discovered checks)
+        //  - if it is not a pawn move then the destination square is contained in attacks()
+        //  - if it is a castling it is legal
+        //  - a move other than castling may create a discovered attack on the king
+        //  - a king may walk into a check
+
+        template <typename FuncT>
+        inline void forEachPseudoLegalPawnMove(const Position& pos, Square from, FuncT&& f)
+        {
+            const Color sideToMove = pos.sideToMove();
+            const Square epSquare = pos.epSquare();
+            const Bitboard ourPieces = pos.piecesBB(sideToMove);
+            const Bitboard theirPieces = pos.piecesBB(!sideToMove);
+            const Bitboard occupied = ourPieces | theirPieces;
+
+            Bitboard attackTargets = theirPieces;
+            if (epSquare != Square::none())
+            {
+                attackTargets |= epSquare;
+            }
+
+            const Bitboard attacks = bb::pawnAttacks(Bitboard::square(from), sideToMove) & attackTargets;
+
+            const Rank secondToLastRank = sideToMove == Color::White ? rank7 : rank2;
+            const auto forward = sideToMove == Color::White ? FlatSquareOffset(0, 1) : FlatSquareOffset(0, -1);
+
+            // promotions
+            if (from.rank() == secondToLastRank)
+            {
+                // capture promotions
+                for (Square toSq : attacks)
+                {
+                    for (PieceType pt : { PieceType::Knight, PieceType::Bishop, PieceType::Rook, PieceType::Queen })
+                    {
+                        Move move{ from, toSq, MoveType::Promotion, Piece(pt, sideToMove) };
+                        f(move);
+                    }
+                }
+
+                // push promotions
+                const Square toSq = from + forward;
+                if (!occupied.isSet(toSq))
+                {
+                    for (PieceType pt : { PieceType::Knight, PieceType::Bishop, PieceType::Rook, PieceType::Queen })
+                    {
+                        Move move{ from, toSq, MoveType::Promotion, Piece(pt, sideToMove) };
+                        f(move);
+                    }
+                }
+            }
+            else
+            {
+                // captures
+                for (Square toSq : attacks)
+                {
+                    Move move{ from, toSq, (toSq == epSquare) ? MoveType::EnPassant : MoveType::Normal };
+                    f(move);
+                }
+
+                const Square toSq = from + forward;
+
+                // single push
+                if (!occupied.isSet(toSq))
+                {
+                    const Rank startRank = sideToMove == Color::White ? rank2 : rank7;
+                    if (from.rank() == startRank)
+                    {
+                        // double push
+                        const Square toSq2 = toSq + forward;
+                        if (!occupied.isSet(toSq2))
+                        {
+                            Move move{ from, toSq2 };
+                            f(move);
+                        }
+                    }
+
+                    Move move{ from, toSq };
+                    f(move);
+                }
+            }
+        }
+
+        template <Color SideToMoveV, typename FuncT>
+        inline void forEachPseudoLegalPawnMove(const Position& pos, FuncT&& f)
+        {
+            const Square epSquare = pos.epSquare();
+            const Bitboard ourPieces = pos.piecesBB(SideToMoveV);
+            const Bitboard theirPieces = pos.piecesBB(!SideToMoveV);
+            const Bitboard occupied = ourPieces | theirPieces;
+            const Bitboard pawns = pos.piecesBB(Piece(PieceType::Pawn, SideToMoveV));
+
+            const Bitboard secondToLastRank = SideToMoveV == Color::White ? bb::rank7 : bb::rank2;
+            const Bitboard secondRank = SideToMoveV == Color::White ? bb::rank2 : bb::rank7;
+
+            const auto singlePawnMoveDestinationOffset = SideToMoveV == Color::White ? FlatSquareOffset(0, 1) : FlatSquareOffset(0, -1);
+            const auto doublePawnMoveDestinationOffset = SideToMoveV == Color::White ? FlatSquareOffset(0, 2) : FlatSquareOffset(0, -2);
+
+            {
+                const int backward = SideToMoveV == Color::White ? -1 : 1;
+                const int backward2 = backward * 2;
+
+                const Bitboard doublePawnMoveStarts =
+                    pawns
+                    & secondRank
+                    & ~(occupied.shiftedVertically(backward) | occupied.shiftedVertically(backward2));
+
+                const Bitboard singlePawnMoveStarts =
+                    pawns
+                    & ~secondToLastRank
+                    & ~occupied.shiftedVertically(backward);
+
+                for (Square from : doublePawnMoveStarts)
+                {
+                    const Square to = from + doublePawnMoveDestinationOffset;
+                    f(Move::normal(from, to));
+                }
+
+                for (Square from : singlePawnMoveStarts)
+                {
+                    const Square to = from + singlePawnMoveDestinationOffset;
+                    f(Move::normal(from, to));
+                }
+            }
+
+            {
+                const Bitboard lastRank = SideToMoveV == Color::White ? bb::rank8 : bb::rank1;
+                const FlatSquareOffset westCaptureOffset = SideToMoveV == Color::White ? FlatSquareOffset(-1, 1) : FlatSquareOffset(-1, -1);
+                const FlatSquareOffset eastCaptureOffset = SideToMoveV == Color::White ? FlatSquareOffset(1, 1) : FlatSquareOffset(1, -1);
+
+                const Bitboard pawnsWithWestCapture = bb::eastPawnAttacks(theirPieces & ~lastRank, !SideToMoveV) & pawns;
+                const Bitboard pawnsWithEastCapture = bb::westPawnAttacks(theirPieces & ~lastRank, !SideToMoveV) & pawns;
+
+                for (Square from : pawnsWithWestCapture)
+                {
+                    f(Move::normal(from, from + westCaptureOffset));
+                }
+
+                for (Square from : pawnsWithEastCapture)
+                {
+                    f(Move::normal(from, from + eastCaptureOffset));
+                }
+            }
+
+            if (epSquare != Square::none())
+            {
+                const Bitboard pawnsThatCanCapture = bb::pawnAttacks(Bitboard::square(epSquare), !SideToMoveV) & pawns;
+                for (Square from : pawnsThatCanCapture)
+                {
+                    f(Move::enPassant(from, epSquare));
+                }
+            }
+
+            for (Square from : pawns & secondToLastRank)
+            {
+                const Bitboard attacks = bb::pawnAttacks(Bitboard::square(from), SideToMoveV) & theirPieces;
+
+                // capture promotions
+                for (Square to : attacks)
+                {
+                    for (PieceType pt : { PieceType::Knight, PieceType::Bishop, PieceType::Rook, PieceType::Queen })
+                    {
+                        Move move{ from, to, MoveType::Promotion, Piece(pt, SideToMoveV) };
+                        f(move);
+                    }
+                }
+
+                // push promotions
+                const Square to = from + singlePawnMoveDestinationOffset;
+                if (!occupied.isSet(to))
+                {
+                    for (PieceType pt : { PieceType::Knight, PieceType::Bishop, PieceType::Rook, PieceType::Queen })
+                    {
+                        Move move{ from, to, MoveType::Promotion, Piece(pt, SideToMoveV) };
+                        f(move);
+                    }
+                }
+            }
+        }
+
+        template <typename FuncT>
+        inline void forEachPseudoLegalPawnMove(const Position& pos, FuncT&& f)
+        {
+            if (pos.sideToMove() == Color::White)
+            {
+                forEachPseudoLegalPawnMove<Color::White>(pos, std::forward<FuncT>(f));
+            }
+            else
+            {
+                forEachPseudoLegalPawnMove<Color::Black>(pos, std::forward<FuncT>(f));
+            }
+        }
+
+        template <PieceType PieceTypeV, typename FuncT>
+        inline void forEachPseudoLegalPieceMove(const Position& pos, Square from, FuncT&& f)
+        {
+            static_assert(PieceTypeV != PieceType::None);
+
+            if constexpr (PieceTypeV == PieceType::Pawn)
+            {
+                forEachPseudoLegalPawnMove(pos, from, f);
+            }
+            else
+            {
+                const Color sideToMove = pos.sideToMove();
+                const Bitboard ourPieces = pos.piecesBB(sideToMove);
+                const Bitboard theirPieces = pos.piecesBB(!sideToMove);
+                const Bitboard occupied = ourPieces | theirPieces;
+                const Bitboard attacks = bb::attacks<PieceTypeV>(from, occupied) & ~ourPieces;
+
+                for (Square toSq : attacks)
+                {
+                    Move move{ from, toSq };
+                    f(move);
+                }
+            }
+        }
+
+        template <PieceType PieceTypeV, typename FuncT>
+        inline void forEachPseudoLegalPieceMove(const Position& pos, FuncT&& f)
+        {
+            static_assert(PieceTypeV != PieceType::None);
+
+            if constexpr (PieceTypeV == PieceType::Pawn)
+            {
+                forEachPseudoLegalPawnMove(pos, f);
+            }
+            else
+            {
+                const Color sideToMove = pos.sideToMove();
+                const Bitboard ourPieces = pos.piecesBB(sideToMove);
+                const Bitboard theirPieces = pos.piecesBB(!sideToMove);
+                const Bitboard occupied = ourPieces | theirPieces;
+                const Bitboard pieces = pos.piecesBB(Piece(PieceTypeV, sideToMove));
+                for (Square fromSq : pieces)
+                {
+                    const Bitboard attacks = bb::attacks<PieceTypeV>(fromSq, occupied) & ~ourPieces;
+                    for (Square toSq : attacks)
+                    {
+                        Move move{ fromSq, toSq };
+                        f(move);
+                    }
+                }
+            }
+        }
+
+        template <typename FuncT>
+        inline void forEachCastlingMove(const Position& pos, FuncT&& f)
+        {
+            CastlingRights rights = pos.castlingRights();
+            if (rights == CastlingRights::None)
+            {
+                return;
+            }
+
+            const Color sideToMove = pos.sideToMove();
+            const Bitboard ourPieces = pos.piecesBB(sideToMove);
+            const Bitboard theirPieces = pos.piecesBB(!sideToMove);
+            const Bitboard occupied = ourPieces | theirPieces;
+
+            // we first reduce the set of legal castlings by checking the paths for pieces
+            if (sideToMove == Color::White)
+            {
+                if ((CastlingTraits::castlingPath[Color::White][CastleType::Short] & occupied).any()) rights &= ~CastlingRights::WhiteKingSide;
+                if ((CastlingTraits::castlingPath[Color::White][CastleType::Long] & occupied).any()) rights &= ~CastlingRights::WhiteQueenSide;
+                rights &= ~CastlingRights::Black;
+            }
+            else
+            {
+                if ((CastlingTraits::castlingPath[Color::Black][CastleType::Short] & occupied).any()) rights &= ~CastlingRights::BlackKingSide;
+                if ((CastlingTraits::castlingPath[Color::Black][CastleType::Long] & occupied).any()) rights &= ~CastlingRights::BlackQueenSide;
+                rights &= ~CastlingRights::White;
+            }
+
+            if (rights == CastlingRights::None)
+            {
+                return;
+            }
+
+            // King must not be in check. Done here because it is quite expensive.
+            const Square ksq = pos.kingSquare(sideToMove);
+            if (pos.isSquareAttacked(ksq, !sideToMove))
+            {
+                return;
+            }
+
+            // Loop through all possible castlings.
+            for (CastleType castlingType : values<CastleType>())
+            {
+                const CastlingRights right = CastlingTraits::castlingRights[sideToMove][castlingType];
+
+                if (!contains(rights, right))
+                {
+                    continue;
+                }
+
+                // If we have this castling right
+                // we check whether the king passes an attacked square.
+                const Square passedSquare = CastlingTraits::squarePassedByKing[sideToMove][castlingType];
+                if (pos.isSquareAttacked(passedSquare, !sideToMove))
+                {
+                    continue;
+                }
+
+                // If it's a castling move then the change in square occupation
+                // cannot have an effect because otherwise there would be
+                // a slider attacker attacking the castling king.
+                if (pos.isSquareAttacked(CastlingTraits::kingDestination[sideToMove][castlingType], !sideToMove))
+                {
+                    continue;
+                }
+
+                // If not we can castle.
+                Move move = Move::castle(castlingType, sideToMove);
+                f(move);
+            }
+        }
+
+        // Calls a given function for all pseudo legal moves for the position.
+        // `pos` must be a legal chess position
+        template <typename FuncT>
+        inline void forEachPseudoLegalMove(const Position& pos, FuncT&& func)
+        {
+            forEachPseudoLegalPieceMove<PieceType::Pawn>(pos, func);
+            forEachPseudoLegalPieceMove<PieceType::Knight>(pos, func);
+            forEachPseudoLegalPieceMove<PieceType::Bishop>(pos, func);
+            forEachPseudoLegalPieceMove<PieceType::Rook>(pos, func);
+            forEachPseudoLegalPieceMove<PieceType::Queen>(pos, func);
+            forEachPseudoLegalPieceMove<PieceType::King>(pos, func);
+            forEachCastlingMove(pos, func);
+        }
+
+        // Calls a given function for all legal moves for the position.
+        // `pos` must be a legal chess position
+        template <typename FuncT>
+        inline void forEachLegalMove(const Position& pos, FuncT&& func)
+        {
+            auto funcIfLegal = [&func, checker = pos.moveLegalityChecker()](Move move) {
+                if (checker.isPseudoLegalMoveLegal(move))
+                {
+                    func(move);
+                }
+            };
+
+            forEachPseudoLegalPieceMove<PieceType::Pawn>(pos, funcIfLegal);
+            forEachPseudoLegalPieceMove<PieceType::Knight>(pos, funcIfLegal);
+            forEachPseudoLegalPieceMove<PieceType::Bishop>(pos, funcIfLegal);
+            forEachPseudoLegalPieceMove<PieceType::Rook>(pos, funcIfLegal);
+            forEachPseudoLegalPieceMove<PieceType::Queen>(pos, funcIfLegal);
+            forEachPseudoLegalPieceMove<PieceType::King>(pos, funcIfLegal);
+            forEachCastlingMove(pos, func);
+        }
+
+        // Generates all pseudo legal moves for the position.
+        // `pos` must be a legal chess position
+        [[nodiscard]] std::vector<Move> generatePseudoLegalMoves(const Position& pos);
+
+        // Generates all legal moves for the position.
+        // `pos` must be a legal chess position
+        [[nodiscard]] std::vector<Move> generateLegalMoves(const Position& pos);
+    }
+
+    [[nodiscard]] inline bool Position::isCheck() const
+    {
+        return BaseType::isSquareAttacked(kingSquare(m_sideToMove), !m_sideToMove);
+    }
+
+    [[nodiscard]] inline Bitboard Position::checkers() const
+    {
+        return BaseType::attackers(kingSquare(m_sideToMove), !m_sideToMove);
+    }
+
+    [[nodiscard]] inline bool Position::isCheckAfterMove(Move move) const
+    {
+        return BaseType::isSquareAttackedAfterMove(move, kingSquare(!m_sideToMove), m_sideToMove);
+    }
+
+    [[nodiscard]] inline bool Position::isMoveLegal(Move move) const
+    {
+        return
+            isMovePseudoLegal(move)
+            && isPseudoLegalMoveLegal(move);
+    }
+
+    [[nodiscard]] inline bool Position::isPseudoLegalMoveLegal(Move move) const
+    {
+        return
+            (move.type == MoveType::Castle)
+            || !isOwnKingAttackedAfterMove(move);
+    }
+
+    [[nodiscard]] inline bool Position::isMovePseudoLegal(Move move) const
+    {
+        if (!move.from.isOk() || !move.to.isOk())
+        {
+            return false;
+        }
+
+        if (move.from == move.to)
+        {
+            return false;
+        }
+
+        if (move.type != MoveType::Promotion && move.promotedPiece != Piece::none())
+        {
+            return false;
+        }
+
+        const Piece movedPiece = pieceAt(move.from);
+        if (movedPiece == Piece::none())
+        {
+            return false;
+        }
+
+        if (movedPiece.color() != m_sideToMove)
+        {
+            return false;
+        }
+
+        const Bitboard occupied = piecesBB();
+        const Bitboard ourPieces = piecesBB(m_sideToMove);
+        const bool isNormal = move.type == MoveType::Normal;
+
+        switch (movedPiece.type())
+        {
+        case PieceType::Pawn:
+        {
+            bool isValid = false;
+            // TODO: use iterators so we don't loop over all moves
+            //       when we can avoid it.
+            movegen::forEachPseudoLegalPawnMove(*this, move.from, [&isValid, &move](const Move& genMove) {
+                if (move == genMove)
+                {
+                    isValid = true;
+                }
+                });
+            return isValid;
+        }
+
+        case PieceType::Bishop:
+            return isNormal && (bb::attacks<PieceType::Bishop>(move.from, occupied) & ~ourPieces).isSet(move.to);
+
+        case PieceType::Knight:
+            return isNormal && (bb::pseudoAttacks<PieceType::Knight>(move.from) & ~ourPieces).isSet(move.to);
+
+        case PieceType::Rook:
+            return isNormal && (bb::attacks<PieceType::Rook>(move.from, occupied) & ~ourPieces).isSet(move.to);
+
+        case PieceType::Queen:
+            return isNormal && (bb::attacks<PieceType::Queen>(move.from, occupied) & ~ourPieces).isSet(move.to);
+
+        case PieceType::King:
+        {
+            if (move.type == MoveType::Castle)
+            {
+                bool isValid = false;
+                movegen::forEachCastlingMove(*this, [&isValid, &move](const Move& genMove) {
+                    if (move == genMove)
+                    {
+                        isValid = true;
+                    }
+                    });
+                return isValid;
+            }
+            else
+            {
+                return isNormal && (bb::pseudoAttacks<PieceType::King>(move.from) & ~ourPieces).isSet(move.to);
+            }
+        }
+
+        default:
+            return false;
+        }
+    }
+
+    [[nodiscard]] inline Bitboard Position::blockersForKing(Color color) const
+    {
+        const Color attackerColor = !color;
+
+        const Bitboard occupied = piecesBB();
+
+        const Bitboard bishops = piecesBB(Piece(PieceType::Bishop, attackerColor));
+        const Bitboard rooks = piecesBB(Piece(PieceType::Rook, attackerColor));
+        const Bitboard queens = piecesBB(Piece(PieceType::Queen, attackerColor));
+
+        const Square ksq = kingSquare(color);
+
+        const Bitboard opponentBishopLikePieces = (bishops | queens);
+        const Bitboard bishopPseudoAttacks = bb::pseudoAttacks<PieceType::Bishop>(ksq);
+
+        const Bitboard opponentRookLikePieces = (rooks | queens);
+        const Bitboard rookPseudoAttacks = bb::pseudoAttacks<PieceType::Rook>(ksq);
+
+        const Bitboard xrayers =
+            (bishopPseudoAttacks & opponentBishopLikePieces)
+            | (rookPseudoAttacks & opponentRookLikePieces);
+
+        Bitboard allBlockers = Bitboard::none();
+
+        for (Square xrayer : xrayers)
+        {
+            const Bitboard blockers = bb::between(xrayer, ksq) & occupied;
+            if (blockers.exactlyOne())
+            {
+                allBlockers |= blockers;
+            }
+        }
+
+        return allBlockers;
+    }
+
+    inline MoveLegalityChecker::MoveLegalityChecker(const Position& position) :
+        m_position(&position),
+        m_checkers(position.checkers()),
+        m_ourBlockersForKing(
+            position.blockersForKing(position.sideToMove())
+            & position.piecesBB(position.sideToMove())
+        ),
+        m_ksq(position.kingSquare(position.sideToMove()))
+    {
+        if (m_checkers.exactlyOne())
+        {
+            const Bitboard knightCheckers = m_checkers & bb::pseudoAttacks<PieceType::Knight>(m_ksq);
+            if (knightCheckers.any())
+            {
+                // We're checked by a knight, we have to remove it or move the king.
+                m_potentialCheckRemovals = knightCheckers;
+            }
+            else
+            {
+                // If we're not checked by a knight we can block it.
+                m_potentialCheckRemovals = bb::between(m_ksq, m_checkers.first()) | m_checkers;
+            }
+        }
+        else
+        {
+            // Double check, king has to move.
+            m_potentialCheckRemovals = Bitboard::none();
+        }
+    }
+
+    [[nodiscard]] inline bool MoveLegalityChecker::isPseudoLegalMoveLegal(const Move& move) const
+    {
+        if (m_checkers.any())
+        {
+            if (move.from == m_ksq || move.type == MoveType::EnPassant)
+            {
+                return m_position->isPseudoLegalMoveLegal(move);
+            }
+            else
+            {
+                // This means there's only one check and we either
+                // blocked it or removed the piece that attacked
+                // our king. So the only threat is if it's a discovered check.
+                return
+                    m_potentialCheckRemovals.isSet(move.to)
+                    && !m_ourBlockersForKing.isSet(move.from);
+            }
+        }
+        else
+        {
+            if (move.from == m_ksq)
+            {
+                return m_position->isPseudoLegalMoveLegal(move);
+            }
+            else if (move.type == MoveType::EnPassant)
+            {
+                return !m_position->createsDiscoveredAttackOnOwnKing(move);
+            }
+            else if (m_ourBlockersForKing.isSet(move.from))
+            {
+                // If it was a blocker it may have only moved in line with our king.
+                // Otherwise it's a discovered check.
+                return bb::line(m_ksq, move.from).isSet(move.to);
+            }
+            else
+            {
+                return true;
+            }
+        }
+    }
+
     static_assert(sizeof(CompressedPosition) == 24);
     static_assert(std::is_trivially_copyable_v<CompressedPosition>);
 
@@ -5483,57 +6101,6 @@ namespace chess
         return { move, captured, oldEpSquare, oldCastlingRights };
     }
 
-    [[nodiscard]] inline bool Position::isCheck() const
-    {
-        return BaseType::isSquareAttacked(kingSquare(m_sideToMove), !m_sideToMove);
-    }
-
-    [[nodiscard]] inline Bitboard Position::checkers() const
-    {
-        return BaseType::attackers(kingSquare(m_sideToMove), !m_sideToMove);
-    }
-
-    [[nodiscard]] bool Position::isCheckAfterMove(Move move) const
-    {
-        return BaseType::isSquareAttackedAfterMove(move, kingSquare(!m_sideToMove), m_sideToMove);
-    }
-
-    [[nodiscard]] inline Bitboard Position::blockersForKing(Color color) const
-    {
-        const Color attackerColor = !color;
-
-        const Bitboard occupied = piecesBB();
-
-        const Bitboard bishops = piecesBB(Piece(PieceType::Bishop, attackerColor));
-        const Bitboard rooks = piecesBB(Piece(PieceType::Rook, attackerColor));
-        const Bitboard queens = piecesBB(Piece(PieceType::Queen, attackerColor));
-
-        const Square ksq = kingSquare(color);
-
-        const Bitboard opponentBishopLikePieces = (bishops | queens);
-        const Bitboard bishopPseudoAttacks = bb::pseudoAttacks<PieceType::Bishop>(ksq);
-
-        const Bitboard opponentRookLikePieces = (rooks | queens);
-        const Bitboard rookPseudoAttacks = bb::pseudoAttacks<PieceType::Rook>(ksq);
-
-        const Bitboard xrayers =
-            (bishopPseudoAttacks & opponentBishopLikePieces)
-            | (rookPseudoAttacks & opponentRookLikePieces);
-
-        Bitboard allBlockers = Bitboard::none();
-
-        for (Square xrayer : xrayers)
-        {
-            const Bitboard blockers = bb::between(xrayer, ksq) & occupied;
-            if (blockers.exactlyOne())
-            {
-                allBlockers |= blockers;
-            }
-        }
-
-        return allBlockers;
-    }
-
     [[nodiscard]] inline Position Position::afterMove(Move move) const
     {
         Position cpy(*this);
@@ -5756,6 +6323,25 @@ namespace binpack
                 return chess::Move{from, to, type};
             }
 
+            [[nodiscard]] std::string toString() const
+            {
+                const chess::Square to = static_cast<chess::Square>((m_raw & (0b111111 << 0) >> 0));
+                const chess::Square from = static_cast<chess::Square>((m_raw & (0b111111 << 6)) >> 6);
+
+                const unsigned promotionIndex = (m_raw & (0b11 << 12)) >> 12;
+                const chess::PieceType promotionType = static_cast<chess::PieceType>(static_cast<int>(chess::PieceType::Knight) + promotionIndex);
+
+                std::string r;
+                chess::parser_bits::appendSquareToString(from, r);
+                chess::parser_bits::appendSquareToString(to, r);
+                if (promotionType != chess::PieceType::None)
+                {
+                    r += chess::EnumTraits<chess::PieceType>::toChar(promotionType, chess::Color::Black);
+                }
+
+                return r;
+            }
+
         private:
             std::uint16_t m_raw;
         };
@@ -6233,6 +6819,11 @@ namespace binpack
         std::int16_t score;
         std::uint16_t ply;
         std::int16_t result;
+
+        [[nodiscard]] bool isValid() const
+        {
+            return pos.isMoveLegal(move);
+        }
     };
 
     [[nodiscard]] inline TrainingDataEntry packedSfenValueToTrainingDataEntry(const nodchip::PackedSfenValue& psv)
@@ -6921,7 +7512,7 @@ namespace binpack
         buffer.insert(buffer.end(), data, data+sizeof(psv));
     }
 
-    inline void convertPlainToBinpack(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
+    inline void convertPlainToBinpack(std::string inputPath, std::string outputPath, std::ios_base::openmode om, bool validate)
     {
         constexpr std::size_t reportEveryNPositions = 100'000;
 
@@ -6949,6 +7540,11 @@ namespace binpack
             if (key == "e"sv)
             {
                 e.move = chess::uci::uciToMove(e.pos, move);
+                if (validate && !e.isValid())
+                {
+                    std::cerr << "Illegal move " << chess::uci::moveToUci(e.pos, e.move) << " for position " << e.pos.fen() << '\n';
+                    return;
+                }
 
                 writer.addTrainingDataEntry(e);
 
@@ -6975,7 +7571,7 @@ namespace binpack
         std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
     }
 
-    inline void convertBinpackToPlain(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
+    inline void convertBinpackToPlain(std::string inputPath, std::string outputPath, std::ios_base::openmode om, bool validate)
     {
         constexpr std::size_t bufferSize = MiB;
 
@@ -6990,7 +7586,14 @@ namespace binpack
 
         while(reader.hasNext())
         {
-            emitPlainEntry(buffer, reader.next());
+            auto e = reader.next();
+            if (validate && !e.isValid())
+            {
+                std::cerr << "Illegal move " << chess::uci::moveToUci(e.pos, e.move) << " for position " << e.pos.fen() << '\n';
+                return;
+            }
+
+            emitPlainEntry(buffer, e);
 
             ++numProcessedPositions;
 
@@ -7016,7 +7619,7 @@ namespace binpack
     }
 
 
-    inline void convertBinToBinpack(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
+    inline void convertBinToBinpack(std::string inputPath, std::string outputPath, std::ios_base::openmode om, bool validate)
     {
         constexpr std::size_t reportEveryNPositions = 100'000;
 
@@ -7037,7 +7640,15 @@ namespace binpack
                 break;
             }
 
-            writer.addTrainingDataEntry(packedSfenValueToTrainingDataEntry(psv));
+            auto e = packedSfenValueToTrainingDataEntry(psv);
+            if (validate && !e.isValid())
+            {
+                std::cerr << "Illegal move " << chess::uci::moveToUci(e.pos, e.move) << " for position " << e.pos.fen() << '\n';
+                std::cerr << static_cast<int>(e.move.type) << '\n';
+                return;
+            }
+
+            writer.addTrainingDataEntry(e);
 
             ++numProcessedPositions;
             const auto cur = inputFile.tellg();
@@ -7050,7 +7661,7 @@ namespace binpack
         std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
     }
 
-    inline void convertBinpackToBin(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
+    inline void convertBinpackToBin(std::string inputPath, std::string outputPath, std::ios_base::openmode om, bool validate)
     {
         constexpr std::size_t bufferSize = MiB;
 
@@ -7065,7 +7676,14 @@ namespace binpack
 
         while(reader.hasNext())
         {
-            emitBinEntry(buffer, reader.next());
+            auto e = reader.next();
+            if (validate && !e.isValid())
+            {
+                std::cerr << "Illegal move " << chess::uci::moveToUci(e.pos, e.move) << " for position " << e.pos.fen() << '\n';
+                return;
+            }
+
+            emitBinEntry(buffer, e);
 
             ++numProcessedPositions;
 
@@ -7090,7 +7708,7 @@ namespace binpack
         std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
     }
 
-    inline void convertBinToPlain(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
+    inline void convertBinToPlain(std::string inputPath, std::string outputPath, std::ios_base::openmode om, bool validate)
     {
         constexpr std::size_t bufferSize = MiB;
 
@@ -7113,7 +7731,14 @@ namespace binpack
                 break;
             }
 
-            emitPlainEntry(buffer, packedSfenValueToTrainingDataEntry(psv));
+            auto e = packedSfenValueToTrainingDataEntry(psv);
+            if (validate && !e.isValid())
+            {
+                std::cerr << "Illegal move " << chess::uci::moveToUci(e.pos, e.move) << " for position " << e.pos.fen() << '\n';
+                return;
+            }
+
+            emitPlainEntry(buffer, e);
 
             ++numProcessedPositions;
 
@@ -7138,7 +7763,7 @@ namespace binpack
         std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
     }
 
-    inline void convertPlainToBin(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
+    inline void convertPlainToBin(std::string inputPath, std::string outputPath, std::ios_base::openmode om, bool validate)
     {
         constexpr std::size_t bufferSize = MiB;
 
@@ -7169,6 +7794,11 @@ namespace binpack
             if (key == "e"sv)
             {
                 e.move = chess::uci::uciToMove(e.pos, move);
+                if (validate && !e.isValid())
+                {
+                    std::cerr << "Illegal move " << chess::uci::moveToUci(e.pos, e.move) << " for position " << e.pos.fen() << '\n';
+                    return;
+                }
 
                 emitBinEntry(buffer, e);
 
diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
index dfd30509..5fe7ea1d 100644
--- a/src/learn/convert.cpp
+++ b/src/learn/convert.cpp
@@ -525,7 +525,7 @@ namespace Learner
             && ends_with(output_path, expected_output_extension);
     }
 
-    using ConvertFunctionType = void(std::string inputPath, std::string outputPath, std::ios_base::openmode om);
+    using ConvertFunctionType = void(std::string inputPath, std::string outputPath, std::ios_base::openmode om, bool validate);
 
     static ConvertFunctionType* get_convert_function(const std::string& input_path, const std::string& output_path)
     {
@@ -547,7 +547,7 @@ namespace Learner
         return nullptr;
     }
 
-    static void convert(const std::string& input_path, const std::string& output_path, std::ios_base::openmode om)
+    static void convert(const std::string& input_path, const std::string& output_path, std::ios_base::openmode om, bool validate)
     {
         if(!file_exists(input_path))
         {
@@ -558,7 +558,7 @@ namespace Learner
         auto func = get_convert_function(input_path, output_path);
         if (func != nullptr)
         {
-            func(input_path, output_path, om);
+            func(input_path, output_path, om, validate);
         }
         else
         {
@@ -568,20 +568,22 @@ namespace Learner
 
     static void convert(const std::vector<std::string>& args)
     {
-        if (args.size() < 2 || args.size() > 3)
+        if (args.size() < 2 || args.size() > 4)
         {
             std::cerr << "Invalid arguments.\n";
-            std::cerr << "Usage: convert from_path to_path [append]\n";
+            std::cerr << "Usage: convert from_path to_path [append] [validate]\n";
             return;
         }
 
-        const bool append = (args.size() == 3) && (args[2] == "append");
+        const bool append = std::find(args.begin() + 2, args.end(), "append") != args.end();
+        const bool validate = std::find(args.begin() + 2, args.end(), "validate") != args.end();
+
         const std::ios_base::openmode openmode =
             append
             ? std::ios_base::app
             : std::ios_base::trunc;
 
-        convert(args[0], args[1], openmode);
+        convert(args[0], args[1], openmode, validate);
     }
 
     void convert(istringstream& is)

From 8069963c56df0c0bb9fc785fe9d688f19f11c706 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 4 Nov 2020 20:23:36 +0100
Subject: [PATCH 314/398] Update convert docs.

---
 docs/convert.md | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/docs/convert.md b/docs/convert.md
index 2e07ec52..132f66e0 100644
--- a/docs/convert.md
+++ b/docs/convert.md
@@ -6,10 +6,13 @@ As all commands in stockfish `convert` can be invoked either from command line (
 
 The syntax of this command is as follows:
 ```
-convert from_path to_path [append]
+convert from_path to_path [append] [validate]
 ```
 
 `from_path` is the path to the file to convert from. The type of the data is deduced based on its extension (one of `.plain`, `.bin`, `.binpack`).
 `to_path` is the path to an output file. The type of the data is deduced from its extension. If the file does not exist it is created.
 
-The last argument is optional. If not specified then the output file will be truncated prior to any writes. If the last argument is `append` then the converted training data will be appended to the end of the output file.
\ No newline at end of file
+`append` and `validate` can come in any order and are optional.
+If `append` not specified then the output file will be truncated prior to any writes. If `append` is specified then the converted training data will be appended to the end of the output file.
+
+If `validate` is specified then the conversion will stop on the first illegal move found and a diagnostic will be shown.
\ No newline at end of file

From 2a8576b80445afa60faef6f16d024d50a49ffd05 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 9 Nov 2020 19:08:28 +0100
Subject: [PATCH 315/398] Fix compilation issues.

---
 src/extra/nnue_data_binpack_format.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index ceb5c415..440ae885 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -5629,13 +5629,13 @@ namespace chess
                     king ^= CastlingTraits::kingDestination[attackerColor][castleType];
                     rooks ^= move.to;
                     rooks ^= CastlingTraits::rookDestination[attackerColor][castleType];
-
-                    break;
                 }
                 else
                 {
                     king ^= occupiedChange;
                 }
+
+                break;
             }
             case PieceType::None:
                 assert(false);

From b5781150ea8557e2030f8bc8b4eadede0ecec6bd Mon Sep 17 00:00:00 2001
From: lonfom169 <50217346+lonfom169@users.noreply.github.com>
Date: Sun, 8 Nov 2020 23:43:32 -0300
Subject: [PATCH 316/398] Increase reduction based on the number of best move
 changes.

Thanks to Vizvezdenec for the PvNode idea and also to vondele the !PvNode idea.

Passed STC:
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 19120 W: 1998 L: 1839 D: 15283
Ptnml(0-2): 76, 1445, 6375, 1572, 92
https://tests.stockfishchess.org/tests/view/5fa8af3e67cbf42301d6a6c9

Passed LTC:
LLR: 2.94 (-2.94,2.94) {0.25,1.25}
Total: 75584 W: 3454 L: 3205 D: 68925
Ptnml(0-2): 54, 2832, 31771, 3081, 54

closes https://github.com/official-stockfish/Stockfish/pull/3224

Bench: 3595418
---
 AUTHORS        | 1 +
 src/search.cpp | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/AUTHORS b/AUTHORS
index 198dfa5a..f0356090 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -19,6 +19,7 @@ Alain Savard (Rocky640)
 Alayan Feh (Alayan-stk-2)
 Alexander Kure
 Alexander Pagel (Lolligerhans)
+Alfredo Menezes (lonfom169)
 Ali AlZhrani (Cooffe)
 Andrew Grant (AndyGrant)
 Andrey Neporada (nepal)
diff --git a/src/search.cpp b/src/search.cpp
index b5b93bf0..56b56733 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1169,6 +1169,9 @@ moves_loop: // When in check, search starts from here
           if (ss->ttPv)
               r -= 2;
 
+          if (!PvNode && depth > 10 && thisThread->bestMoveChanges <= 2)
+              r++;
+
           if (moveCountPruning && !formerPv)
               r++;
 

From 285bf7041ad214156188823eb9118e6af7f4b2e4 Mon Sep 17 00:00:00 2001
From: SFisGOD <jonathandumale@gmail.com>
Date: Tue, 10 Nov 2020 18:28:43 +0100
Subject: [PATCH 317/398] Increase reduction at root

when the best move does not change frequently

STC:
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 51320 W: 5159 L: 4956 D: 41205
Ptnml(0-2): 215, 3897, 17242, 4082, 224
https://tests.stockfishchess.org/tests/view/5faa072367cbf42301d6a767

LTC:
LLR: 2.98 (-2.94,2.94) {0.25,1.25}
Total: 15952 W: 762 L: 642 D: 14548
Ptnml(0-2): 8, 561, 6725, 667, 15
https://tests.stockfishchess.org/tests/view/5faa4c3567cbf42301d6a794

closes https://github.com/official-stockfish/Stockfish/pull/3225

Bench: 3954692
---
 AUTHORS        | 2 +-
 src/search.cpp | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index f0356090..f30be4de 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -86,7 +86,7 @@ Jekaa
 Jerry Donald Watson (jerrydonaldwatson)
 jjoshua2
 Jonathan Calovski (Mysseno)
-Jonathan Dumale (SFisGOD)
+Jonathan Buladas Dumale (SFisGOD)
 Joost VandeVondele (vondele)
 Jörg Oster (joergoster)
 Joseph Ellis (jhellis3)
diff --git a/src/search.cpp b/src/search.cpp
index 56b56733..66ef5043 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1161,7 +1161,7 @@ moves_loop: // When in check, search starts from here
           if (thisThread->ttHitAverage > 509 * TtHitAverageResolution * TtHitAverageWindow / 1024)
               r--;
 
-          // Reduction if other threads are searching this position
+          // Increase reduction if other threads are searching this position
           if (th.marked())
               r++;
 
@@ -1169,7 +1169,8 @@ moves_loop: // When in check, search starts from here
           if (ss->ttPv)
               r -= 2;
 
-          if (!PvNode && depth > 10 && thisThread->bestMoveChanges <= 2)
+          // Increase reduction at root and non-PV nodes when the best move does not change frequently
+          if ((rootNode || !PvNode) && depth > 10 && thisThread->bestMoveChanges <= 2)
               r++;
 
           if (moveCountPruning && !formerPv)

From a71623f74c7056242ce2d152613fe90fa0aa9ff8 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 13 Nov 2020 10:56:52 +0100
Subject: [PATCH 318/398] Add explicit read head seek to the start of the
 binpack file. Otherwise on MACOS the read head is placed at the end when app
 is specified.

---
 src/extra/nnue_data_binpack_format.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index 440ae885..31c6f7bb 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -6717,6 +6717,9 @@ namespace binpack
             m_path(std::move(path)),
             m_file(m_path, std::ios_base::binary | std::ios_base::in | std::ios_base::out | om)
         {
+            // Necessary for MAC because app mode makes it put the reading
+            // head at the end.
+            m_file.seekg(0);
         }
 
         void append(const char* data, std::uint32_t size)

From 69bc3ef9be0592627908877a1c2d3b2eb2131776 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 12 Nov 2020 22:24:59 +0100
Subject: [PATCH 319/398] Output loss more often.

---
 src/learn/learn.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 66461cc5..317f6da0 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -793,6 +793,8 @@ namespace Learner
 
             out << "  - norm = " << sum_norm << endl;
             out << "  - move accuracy = " << (move_accord_count * 100.0 / psv.size()) << "%" << endl;
+            out << "  - loss (current) = " << (test_loss_sum.cross_entropy - test_loss_sum.entropy) / psv.size() << endl;
+            out << "  - loss (average) = " << latest_loss_sum / latest_loss_count << endl;
         }
         else
         {

From 4e1653d53a44affe6f5a56ae4f9cb737df861f2e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 13 Nov 2020 11:05:07 +0100
Subject: [PATCH 320/398] Fix reliance on transitive includes for factorizers
 in trainer feature transformer. Add a file that includes all factorizers.

---
 src/nnue/evaluate_nnue_learner.cpp             |  5 ++---
 src/nnue/trainer/features/all_factorizers.h    | 10 ++++++++++
 src/nnue/trainer/trainer_feature_transformer.h |  2 +-
 3 files changed, 13 insertions(+), 4 deletions(-)
 create mode 100644 src/nnue/trainer/features/all_factorizers.h

diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 6294865d..43282494 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -5,9 +5,8 @@
 #include "evaluate_nnue.h"
 #include "evaluate_nnue_learner.h"
 
-#include "trainer/features/factorizer_feature_set.h"
-#include "trainer/features/factorizer_half_kp.h"
-#include "trainer/features/factorizer_half_ka.h"
+#include "trainer/features/all_factorizers.h"
+
 #include "trainer/trainer_feature_transformer.h"
 #include "trainer/trainer_input_slice.h"
 #include "trainer/trainer_affine_transform.h"
diff --git a/src/nnue/trainer/features/all_factorizers.h b/src/nnue/trainer/features/all_factorizers.h
new file mode 100644
index 00000000..75d62ec8
--- /dev/null
+++ b/src/nnue/trainer/features/all_factorizers.h
@@ -0,0 +1,10 @@
+#ifndef _NNUE_TRAINER_FEATURES_ALL_FACTORIZERS_H_
+#define _NNUE_TRAINER_FEATURES_ALL_FACTORIZERS_H_
+
+#include "factorizer.h"
+#include "factorizer_feature_set.h"
+
+#include "factorizer_half_kp.h"
+#include "factorizer_half_ka.h"
+
+#endif
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 419cdf5e..80f914f2 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -5,7 +5,7 @@
 
 #include "extra/stockfish_blas.h"
 
-#include "features/factorizer_feature_set.h"
+#include "features/all_factorizers.h"
 
 #include "learn/learn.h"
 

From 691da3bdad9890cf7b2ae4f279a264dba7104c0a Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 13 Nov 2020 11:14:19 +0100
Subject: [PATCH 321/398] Add more information for factorizers at the start of
 training.

---
 src/nnue/trainer/features/factorizer.h             |  2 +-
 src/nnue/trainer/features/factorizer_feature_set.h |  2 +-
 src/nnue/trainer/features/factorizer_half_ka.h     | 14 +++++++-------
 src/nnue/trainer/features/factorizer_half_kp.h     |  2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/nnue/trainer/features/factorizer.h b/src/nnue/trainer/features/factorizer.h
index 15ce8022..b64b0c74 100644
--- a/src/nnue/trainer/features/factorizer.h
+++ b/src/nnue/trainer/features/factorizer.h
@@ -14,7 +14,7 @@ namespace Eval::NNUE::Features {
     class Factorizer {
     public:
         static constexpr std::string get_name() {
-            return std::string("No factorizer");
+            return "Factorizer<" + FeatureType::get_name() + "> -> " + std::string("No factorizer");
         }
 
         static constexpr std::string get_factorizers_string() {
diff --git a/src/nnue/trainer/features/factorizer_feature_set.h b/src/nnue/trainer/features/factorizer_feature_set.h
index f5ee3c5c..60f42166 100644
--- a/src/nnue/trainer/features/factorizer_feature_set.h
+++ b/src/nnue/trainer/features/factorizer_feature_set.h
@@ -82,7 +82,7 @@ namespace Eval::NNUE::Features {
         static constexpr IndexType kBaseDimensions = FeatureType::kDimensions;
 
         static constexpr std::string get_name() {
-            return FeatureType::kName;
+            return Factorizer<FeatureType>::get_name();
         }
 
         static constexpr std::string get_factorizers_string() {
diff --git a/src/nnue/trainer/features/factorizer_half_ka.h b/src/nnue/trainer/features/factorizer_half_ka.h
index 90bd9d97..36d36a2d 100644
--- a/src/nnue/trainer/features/factorizer_half_ka.h
+++ b/src/nnue/trainer/features/factorizer_half_ka.h
@@ -31,11 +31,11 @@ namespace Eval::NNUE::Features {
 
         // Learning feature information
         static constexpr FeatureProperties kProperties[] = {
-            // kFeaturesHalfKPK
+            // kFeaturesHalfA
             {true, FeatureType::kDimensions},
-            // kFeaturesPK
+            // kFeaturesA
             {true, Factorizer<A>::get_dimensions()},
-            // kFeaturesHalfRelativeKPK
+            // kFeaturesHalfRelativeKA
             {true, Factorizer<HalfRelativeKA<AssociatedKing>>::get_dimensions()},
         };
 
@@ -43,7 +43,7 @@ namespace Eval::NNUE::Features {
 
     public:
         static constexpr std::string get_name() {
-            return std::string("Factorizer<") + FeatureType::kName + ">";
+            return std::string("Factorizer<") + FeatureType::kName + "> -> " + "A, HalfRelativeKA";
         }
 
         static constexpr std::string get_factorizers_string() {
@@ -59,18 +59,18 @@ namespace Eval::NNUE::Features {
         static void append_training_features(
             IndexType base_index, std::vector<TrainingFeature>* training_features) {
 
-            // kFeaturesHalfKPK
+            // kFeaturesHalfA
             IndexType index_offset = append_base_feature<FeatureType>(
                 kProperties[kFeaturesHalfKA], base_index, training_features);
 
             const auto sq_k = static_cast<Square>(base_index / PS_END2);
             const auto a = static_cast<IndexType>(base_index % PS_END2);
 
-            // kFeaturesPK
+            // kFeaturesA
             index_offset += inherit_features_if_required<A>(
                 index_offset, kProperties[kFeaturesA], a, training_features);
 
-            // kFeaturesHalfRelativeKPK
+            // kFeaturesHalfRelativeKA
             if (a >= PS_W_PAWN) {
                 index_offset += inherit_features_if_required<HalfRelativeKA<AssociatedKing>>(
                     index_offset, kProperties[kFeaturesHalfRelativeKA],
diff --git a/src/nnue/trainer/features/factorizer_half_kp.h b/src/nnue/trainer/features/factorizer_half_kp.h
index 601ddfa5..c554f0fc 100644
--- a/src/nnue/trainer/features/factorizer_half_kp.h
+++ b/src/nnue/trainer/features/factorizer_half_kp.h
@@ -46,7 +46,7 @@ namespace Eval::NNUE::Features {
 
     public:
         static constexpr std::string get_name() {
-            return std::string("Factorizer<") + FeatureType::kName + ">";
+            return std::string("Factorizer<") + FeatureType::kName + "> -> " + "HalfK, P, HalfRelativeKP";
         }
 
         static constexpr std::string get_factorizers_string() {

From 9b930023fb42589d2169f9ef77670206c539d76e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <ts.tomeksopel@gmail.com>
Date: Sat, 14 Nov 2020 15:32:06 +0100
Subject: [PATCH 322/398] Fix default value for batchsize in learn docs.

---
 docs/learn.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/learn.md b/docs/learn.md
index 7051a173..037e149c 100644
--- a/docs/learn.md
+++ b/docs/learn.md
@@ -16,7 +16,7 @@ Currently the following options are available:
 
 `set_recommended_uci_options` - this is a modifier not a parameter, no value follows it. If specified then some UCI options are set to recommended values.
 
-`bat` - the size of a batch in multiples of 10000. This determines how many entries are read and shuffled at once during training. Default: 1000 (meaning batch size of 1000000).
+`bat` - the size of a batch in multiples of 10000. This determines how many entries are read and shuffled at once during training. Default: 100 (meaning batch size of 1000000).
 
 `targetdir` - path to the direction from which training data will be read. All files in this directory are read sequentially. If not specified then only the list of files from positional arguments will be used. If specified then files from the given directory will be used after the explicitly specified files.
 
@@ -24,7 +24,7 @@ Currently the following options are available:
 
 `basedir` - the base directory for the paths. Default: "" (current directory)
 
-`batchsize` - same as `bat` but doesn't scale by 10000
+`batchsize` - same as `bat` but doesn't scale by 10000. Default: 1000000
 
 `lr` - initial learning rate. Default: 1.
 
@@ -105,4 +105,4 @@ Currently the following options are available:
 `buffer_size`
 `shuffleq`
 `shufflem`
-`output_file_name`
\ No newline at end of file
+`output_file_name`

From f9595828eb7e5e970b0be3ee5f84ddd726845523 Mon Sep 17 00:00:00 2001
From: FauziAkram <fauzi.dabat@hotmail.com>
Date: Wed, 11 Nov 2020 20:56:29 +0200
Subject: [PATCH 323/398] Rook Mobility Tweak

Passed STC:
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 171152 W: 34715 L: 34202 D: 102235
Ptnml(0-2): 3278, 20155, 38228, 20606, 3309
https://tests.stockfishchess.org/tests/view/5fa861f467cbf42301d6a68e

Passed LTC:
LLR: 2.94 (-2.94,2.94) {0.25,1.25}
Total: 149616 W: 20471 L: 19882 D: 109263
Ptnml(0-2): 1172, 14434, 43102, 14833, 1267
https://tests.stockfishchess.org/tests/view/5fa9c8ff67cbf42301d6a74f

closes https://github.com/official-stockfish/Stockfish/pull/3226

Bench: 3597730
---
 src/evaluate.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 4ade46fa..34ebe6c3 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -212,9 +212,9 @@ namespace {
     { S(-47,-59), S(-20,-25), S( 14, -8), S( 29, 12), S( 39, 21), S( 53, 40), // Bishop
       S( 53, 56), S( 60, 58), S( 62, 65), S( 69, 72), S( 78, 78), S( 83, 87),
       S( 91, 88), S( 96, 98) },
-    { S(-61,-82), S(-20,-17), S(  2, 23) ,S(  3, 40), S(  4, 72), S( 11,100), // Rook
-      S( 22,104), S( 31,120), S( 39,134), S(40 ,138), S( 41,158), S( 47,163),
-      S( 59,168), S( 60,169), S( 64,173) },
+    { S(-60,-82), S(-24,-15), S(  0, 17) ,S(  3, 43), S(  4, 72), S( 14,100), // Rook
+      S( 20,102), S( 30,122), S( 41,133), S(41 ,139), S( 41,153), S( 45,160),
+      S( 57,165), S( 58,170), S( 67,175) },
     { S(-29,-49), S(-16,-29), S( -8, -8), S( -8, 17), S( 18, 39), S( 25, 54), // Queen
       S( 23, 59), S( 37, 73), S( 41, 76), S( 54, 95), S( 65, 95) ,S( 68,101),
       S( 69,124), S( 70,128), S( 70,132), S( 70,133) ,S( 71,136), S( 72,140),

From 00797a3d86976e4e91e7bc76509b4c305ce23e3f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 9 Nov 2020 19:21:27 +0100
Subject: [PATCH 324/398] add option `ensure_quiet` for gensfen that makes the
 generated position quiet

---
 src/learn/gensfen.cpp | 93 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 86 insertions(+), 7 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 4accb882..e1aec654 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -94,6 +94,8 @@ namespace Learner
             bool detect_draw_by_consecutive_low_score = true;
             bool detect_draw_by_insufficient_mating_material = true;
 
+            bool ensure_quiet = false;
+
             uint64_t num_threads;
 
             void enforce_constraints()
@@ -316,19 +318,86 @@ namespace Learner
                 // Discard stuff before write_minply is reached
                 // because it can harm training due to overfitting.
                 // Initial positions would be too common.
-                if (ply >= params.write_minply && !was_seen_before(pos))
+                if (ply >= params.write_minply)
                 {
                     packed_sfens.emplace_back(PackedSfenValue());
 
                     auto& psv = packed_sfens.back();
 
-                    // Here we only write the position data.
-                    // Result is added after the whole game is done.
-                    pos.sfen_pack(psv.sfen);
+                    if (params.ensure_quiet)
+                    {
+                        auto [qsearch_value, qsearch_pv] = Search::qsearch(pos);
+                        if (qsearch_pv.empty())
+                        {
+                            // Here we only write the position data.
+                            // Result is added after the whole game is done.
+                            pos.sfen_pack(psv.sfen);
 
-                    psv.score = search_value;
-                    psv.gamePly = ply;
-                    psv.move = search_pv[0];
+                            // Already a quiet position
+                            psv.score = search_value;
+                            psv.move = search_pv[0];
+                            psv.gamePly = ply;
+                        }
+                        else
+                        {
+                            // Navigate to a quiet
+                            int old_ply = ply;
+                            for (auto m : qsearch_pv)
+                            {
+                                pos.do_move(m, states[ply++]);
+                            }
+
+                            if (was_seen_before(pos))
+                            {
+                                // Just skip the move.
+                                packed_sfens.pop_back();
+                            }
+                            else
+                            {
+                                // Reevaluate
+                                auto [quiet_search_value, quiet_search_pv] = Search::search(pos, depth, 1, params.nodes);
+                                if (quiet_search_pv.empty())
+                                {
+                                    // Just skip the move.
+                                    packed_sfens.pop_back();
+                                }
+                                else
+                                {
+                                    // Here we only write the position data.
+                                    // Result is added after the whole game is done.
+                                    pos.sfen_pack(psv.sfen);
+
+                                    psv.score = quiet_search_value;
+                                    psv.move = quiet_search_pv[0];
+                                    psv.gamePly = ply;
+                                }
+                            }
+
+                            // Get back to the game
+                            for (auto it = qsearch_pv.rbegin(); it != qsearch_pv.rend(); ++it)
+                            {
+                                pos.undo_move(*it);
+                            }
+                            ply = old_ply;
+                        }
+                    }
+                    else
+                    {
+                        if (was_seen_before(pos))
+                        {
+                            packed_sfens.pop_back();
+                        }
+                        else
+                        {
+                            // Here we only write the position data.
+                            // Result is added after the whole game is done.
+                            pos.sfen_pack(psv.sfen);
+
+                            psv.score = search_value;
+                            psv.move = search_pv[0];
+                            psv.gamePly = ply;
+                        }
+                    }
                 }
 
                 // Update the next move according to best search result or random move.
@@ -777,6 +846,10 @@ namespace Learner
                 UCI::setoption("PruneAtShallowDepth", "false");
                 UCI::setoption("EnableTranspositionTable", "true");
             }
+            else if (token == "ensure_quiet")
+            {
+                params.ensure_quiet = true;
+            }
             else
                 cout << "ERROR: Ignoring unknown option " << token << endl;
         }
@@ -791,6 +864,12 @@ namespace Learner
                 cout << "WARNING: Unknown sfen format `" << sfen_format << "`. Using bin\n";
         }
 
+        if (params.ensure_quiet)
+        {
+            // Otherwise we can't ensure quiet positions...
+            UCI::setoption("EnableTranspositionTable", "false");
+        }
+
         if (random_file_name)
         {
             // Give a random number to output_file_name at this point.

From 00bc80c3c4d162ebfdc4176c9b01e9124c0a473a Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 9 Nov 2020 19:27:53 +0100
Subject: [PATCH 325/398] Add `assume_quiet` option to the learner.

---
 src/learn/learn.cpp | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 317f6da0..7f18ff28 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -397,6 +397,8 @@ namespace Learner
             bool use_draw_games_in_validation = true;
             bool skip_duplicated_positions_in_training = true;
 
+            bool assume_quiet = false;
+
             double learning_rate = 1.0;
 
             string validation_set_file_name;
@@ -676,19 +678,22 @@ namespace Learner
                 goto RETRY_READ;
             }
 
-            int ply = 0;
-            pos.do_move((Move)ps.move, state[ply++]);
-
-            // We want to position being trained on not to be terminal
-            if (MoveList<LEGAL>(pos).size() == 0)
-                goto RETRY_READ;
-
-            // Evaluation value of shallow search (qsearch)
-            const auto [_, pv] = Search::qsearch(pos);
-
-            for (auto m : pv)
+            if (!params.assume_quiet)
             {
-                pos.do_move(m, state[ply++]);
+                int ply = 0;
+                pos.do_move((Move)ps.move, state[ply++]);
+
+                // We want to position being trained on not to be terminal
+                if (MoveList<LEGAL>(pos).size() == 0)
+                    goto RETRY_READ;
+
+                // Evaluation value of shallow search (qsearch)
+                const auto [_, pv] = Search::qsearch(pos);
+
+                for (auto m : pv)
+                {
+                    pos.do_move(m, state[ply++]);
+                }
             }
 
             // Since we have reached the end phase of PV, add the slope here.
@@ -1106,6 +1111,7 @@ namespace Learner
                 UCI::setoption("EnableTranspositionTable", "false");
             }
             else if (option == "verbose") params.verbose = true;
+            else if (option == "assume_quiet") params.assume_quiet = true;
             else
             {
                 out << "INFO: Unknown option: " << option << ". Ignoring.\n";

From 50358e26c77a7c51ea1c17a948c36b09cb18239d Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 13 Nov 2020 22:28:28 +0100
Subject: [PATCH 326/398] Fix searching terminal nodes in gensfen.

---
 src/search.cpp | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 436e11fd..1aa86bf3 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1976,7 +1976,7 @@ namespace Search
 
   // Initialization for learning.
   // Called from Learner::search(),Learner::qsearch().
-  static void init_for_search(Position& pos, Stack* ss)
+  static bool init_for_search(Position& pos, Stack* ss)
   {
 
     // RootNode requires ss->ply == 0.
@@ -2026,7 +2026,10 @@ namespace Search
       for (auto m: MoveList<LEGAL>(pos))
         rootMoves.push_back(Search::RootMove(m));
 
-      assert(!rootMoves.empty());
+      // Check if we're at a terminal node. Otherwise we end up returning
+      // malformed PV later on.
+      if (rootMoves.empty())
+        return false;
 
       th->UseRule50 = bool(Options["Syzygy50MoveRule"]);
       th->ProbeDepth = int(Options["SyzygyProbeDepth"]);
@@ -2042,6 +2045,8 @@ namespace Search
 
       Tablebases::rank_root_moves(pos, rootMoves);
     }
+
+    return true;
   }
 
   // Stationary search.
@@ -2061,7 +2066,9 @@ namespace Search
     Stack stack[MAX_PLY+10], *ss = stack+7;
     Move  pv[MAX_PLY+1];
 
-    init_for_search(pos, ss);
+    if (!init_for_search(pos, ss))
+      return {};
+
     ss->pv = pv; // For the time being, it must be a dummy and somewhere with a buffer.
 
     if (pos.is_draw(0)) {
@@ -2116,7 +2123,8 @@ namespace Search
     Stack stack[MAX_PLY + 10], * ss = stack + 7;
     Move pv[MAX_PLY + 1];
 
-    init_for_search(pos, ss);
+    if (!init_for_search(pos, ss))
+      return {};
 
 	ss->pv = pv; // For the time being, it must be a dummy and somewhere with a buffer.
 

From 3dbc45bdfc232b549c28269896ccc3760f937378 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 16 Nov 2020 00:41:05 +0100
Subject: [PATCH 327/398] Add gradient clipping.

---
 src/learn/learn.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 7f18ff28..3942b606 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -93,6 +93,8 @@ namespace Learner
     static double elmo_lambda_high = 1.0;
     static double elmo_lambda_limit = 32000;
 
+    static double max_grad = 1.0;
+
     // Using stockfish's WDL with win rate model instead of sigmoid
     static bool use_wdl = false;
 
@@ -315,7 +317,7 @@ namespace Learner
             grad = lambda * (q - p) + (1.0 - lambda) * (q - t);
         }
 
-        return grad;
+        return std::clamp(grad, -max_grad, max_grad);
     }
 
     // Calculate cross entropy during learning
@@ -1072,6 +1074,7 @@ namespace Learner
             else if (option == "lambda") is >> elmo_lambda_low;
             else if (option == "lambda2") is >> elmo_lambda_high;
             else if (option == "lambda_limit") is >> elmo_lambda_limit;
+            else if (option == "max_grad") is >> max_grad;
 
             else if (option == "reduction_gameply") is >> params.reduction_gameply;
 
@@ -1175,6 +1178,7 @@ namespace Learner
         out << "  - elmo_lambda_low          : " << elmo_lambda_low << endl;
         out << "  - elmo_lambda_high         : " << elmo_lambda_high << endl;
         out << "  - elmo_lambda_limit        : " << elmo_lambda_limit << endl;
+        out << "  - max_grad                 : " << max_grad << endl;
         out << "  - eval_save_interval       : " << params.eval_save_interval << " sfens" << endl;
         out << "  - loss_output_interval     : " << params.loss_output_interval << " sfens" << endl;
 

From d793663188729272cd0b8ecb8a841f7b50ea8345 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 16 Nov 2020 00:43:43 +0100
Subject: [PATCH 328/398] Add docs for max_grad option for learn

---
 docs/learn.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/learn.md b/docs/learn.md
index 037e149c..6de81521 100644
--- a/docs/learn.md
+++ b/docs/learn.md
@@ -50,6 +50,8 @@ Currently the following options are available:
 
 `lambda_limit` - the maximum absolute score value for which `lambda` is used as opposed to `lambda2`. For positions with absolute evaluation higher than `lambda_limit` `lambda2` will be used. Default: 32000 (so always `lambda`).
 
+`max_grad` - the maximum allowed loss gradient for backpropagation. Effectively a form of gradient clipping. Useful for the first iterations with a randomly generated net as with higher lr backpropagation often overshoots and kills the net. The default value is fairly conservative, values as low as 0.25 could be used with lr of 1.0 without problems. Default: 1.0.
+
 `reduction_gameply` - the minimum ply after which positions won't be skipped. Positions at plies below this value are skipped with a probability that lessens linearly with the ply (reaching 0 at `reduction_gameply`). Default: 1.
 
 `eval_limit` - positions with absolute evaluation higher than this will be skipped. Default: 32000 (nothing is skipped).

From d4350a16f32eaedf0e5bd207a4e1293a7a4e1f2c Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 16 Nov 2020 11:32:42 +0100
Subject: [PATCH 329/398] Add representation of an opening book.

---
 src/Makefile               |  1 +
 src/learn/opening_book.cpp | 43 +++++++++++++++++++++++++++++
 src/learn/opening_book.h   | 56 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 100 insertions(+)
 create mode 100644 src/learn/opening_book.cpp
 create mode 100644 src/learn/opening_book.h

diff --git a/src/Makefile b/src/Makefile
index cba4e351..51a9654a 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -63,6 +63,7 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	learn/sfen_packer.cpp \
 	learn/learn.cpp \
 	learn/gensfen.cpp \
+	learn/opening_book.cpp \
 	learn/convert.cpp
 
 OBJS = $(notdir $(SRCS:.cpp=.o))
diff --git a/src/learn/opening_book.cpp b/src/learn/opening_book.cpp
new file mode 100644
index 00000000..fb569bda
--- /dev/null
+++ b/src/learn/opening_book.cpp
@@ -0,0 +1,43 @@
+#include "opening_book.h"
+
+#include <fstream>
+
+namespace Learner {
+
+    EpdOpeningBook::EpdOpeningBook(const std::string& file, PRNG& prng) :
+        OpeningBook(file)
+    {
+        std::ifstream in(file);
+        if (!in)
+        {
+            return;
+        }
+
+        std::string line;
+        while (std::getline(in, line))
+        {
+            if (line.empty())
+                continue;
+
+            fens.emplace_back(line);
+        }
+
+        Algo::shuffle(fens, prng);
+    }
+
+    static bool ends_with(const std::string& lhs, const std::string& end)
+    {
+        if (end.size() > lhs.size()) return false;
+
+        return std::equal(end.rbegin(), end.rend(), lhs.rbegin());
+    }
+
+    std::unique_ptr<OpeningBook> open_opening_book(const std::string& filename, PRNG& prng)
+    {
+        if (ends_with(filename, ".epd"))
+            return std::make_unique<EpdOpeningBook>(filename, prng);
+
+        return nullptr;
+    }
+
+}
diff --git a/src/learn/opening_book.h b/src/learn/opening_book.h
new file mode 100644
index 00000000..16207f13
--- /dev/null
+++ b/src/learn/opening_book.h
@@ -0,0 +1,56 @@
+#ifndef LEARN_OPENING_BOOK_H
+#define LEARN_OPENING_BOOK_H
+
+#include "misc.h"
+#include "position.h"
+#include "thread.h"
+
+#include <vector>
+#include <random>
+#include <optional>
+#include <string>
+#include <cstdint>
+#include <memory>
+
+namespace Learner {
+
+    struct OpeningBook {
+
+        const std::string& next_fen()
+        {
+            assert(fens.size() > 0);
+
+            auto& fen = fens[current_index++];
+            if (current_index >= fens.size())
+                current_index = 0;
+
+            return fen;
+        }
+
+        std::size_t size() const { return fens.size(); }
+
+        const std::string& get_filename() const { return filename; }
+
+    protected:
+        OpeningBook(const std::string& file) :
+            filename(file),
+            current_index(0)
+        {
+        }
+
+
+        std::string filename;
+        std::vector<std::string> fens;
+        std::size_t current_index;
+    };
+
+    struct EpdOpeningBook : OpeningBook {
+
+        EpdOpeningBook(const std::string& file, PRNG& prng);
+    };
+
+    std::unique_ptr<OpeningBook> open_opening_book(const std::string& filename, PRNG& prng);
+
+}
+
+#endif

From e1dbad47cef574dc39d9d768308c9bf96de95c5b Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 16 Nov 2020 14:13:34 +0100
Subject: [PATCH 330/398] Add support for opening book to gensfen.

---
 src/learn/gensfen.cpp | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index e1aec654..b265da71 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -2,6 +2,7 @@
 
 #include "sfen_writer.h"
 #include "packed_sfen.h"
+#include "opening_book.h"
 
 #include "misc.h"
 #include "position.h"
@@ -98,6 +99,8 @@ namespace Learner
 
             uint64_t num_threads;
 
+            std::string book;
+
             void enforce_constraints()
             {
                 search_depth_max = std::max(search_depth_min, search_depth_max);
@@ -130,6 +133,15 @@ namespace Learner
         {
             hash.resize(GENSFEN_HASH_SIZE);
 
+            if (!prm.book.empty())
+            {
+                opening_book = open_opening_book(prm.book, prng);
+                if (opening_book == nullptr)
+                {
+                    std::cout << "WARNING: Failed to open opening book " << prm.book << ". Falling back to startpos.\n";
+                }
+            }
+
             // Output seed to veryfy by the user if it's not identical by chance.
             std::cout << prng << std::endl;
         }
@@ -151,6 +163,8 @@ namespace Learner
 
         vector<Key> hash; // 64MB*sizeof(HASH_KEY) = 512MB
 
+        std::unique_ptr<OpeningBook> opening_book;
+
         static void set_gensfen_search_limits();
 
         void generate_worker(
@@ -248,7 +262,15 @@ namespace Learner
             // When parallelizing, Threads (since this is a vector<Thread*>,
             // Do the same for up to Threads[0]...Threads[thread_num-1].
             auto& pos = th.rootPos;
-            pos.set(StartFEN, false, &si, &th);
+            if (opening_book != nullptr)
+            {
+                auto& fen = opening_book->next_fen();
+                pos.set(fen, false, &si, &th);
+            }
+            else
+            {
+                pos.set(StartFEN, false, &si, &th);
+            }
 
             int resign_counter = 0;
             bool should_resign = prng.rand(10) > 1;
@@ -822,6 +844,8 @@ namespace Learner
                 is >> params.write_maxply;
             else if (token == "save_every")
                 is >> params.save_every;
+            else if (token == "book")
+                is >> params.book;
             else if (token == "random_file_name")
                 is >> random_file_name;
             // Accept also the old option name.
@@ -911,6 +935,7 @@ namespace Learner
             << "  - random_multi_pv_depth  = " << params.random_multi_pv_depth << endl
             << "  - write_minply           = " << params.write_minply << endl
             << "  - write_maxply           = " << params.write_maxply << endl
+            << "  - book                   = " << params.book << endl
             << "  - output_file_name       = " << params.output_file_name << endl
             << "  - save_every             = " << params.save_every << endl
             << "  - random_file_name       = " << random_file_name << endl

From 5f18c88b3d5683321d4a84d948e94053770f6291 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 16 Nov 2020 14:14:39 +0100
Subject: [PATCH 331/398] Docs for book in gensfen.

---
 docs/gensfen.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/gensfen.md b/docs/gensfen.md
index ce0f365c..16fead59 100644
--- a/docs/gensfen.md
+++ b/docs/gensfen.md
@@ -44,6 +44,8 @@ Currently the following options are available:
 
 `write_maxply` - maximum ply for which the training data entry will be emitted. Default: 400.
 
+`book` - a path to an opening book to use for the starting positions. Currently only .epd format is supported. If not specified then the starting position is always the standard chess starting position.
+
 `save_every` - the number of training data entries per file. If not specified then there will be always one file. If specified there may be more than one file generated (each having at most `save_every` training data entries) and each file will have a unique number attached.
 
 `random_file_name` - if specified then the output filename will be chosen randomly. Overrides `output_file_name`.

From d9dcdc2b73314ec7507801e1e23562cd9d49f4b2 Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:06:14 +0100
Subject: [PATCH 332/398] Delete k-p_256x2-32-32.h

---
 src/nnue/architectures/k-p_256x2-32-32.h | 35 ------------------------
 1 file changed, 35 deletions(-)
 delete mode 100644 src/nnue/architectures/k-p_256x2-32-32.h

diff --git a/src/nnue/architectures/k-p_256x2-32-32.h b/src/nnue/architectures/k-p_256x2-32-32.h
deleted file mode 100644
index 92c9efcd..00000000
--- a/src/nnue/architectures/k-p_256x2-32-32.h
+++ /dev/null
@@ -1,35 +0,0 @@
-﻿// Definition of input features and network structure used in NNUE evaluation function
-
-#ifndef K_P_256X2_32_32_H
-#define K_P_256X2_32_32_H
-
-#include "nnue/features/feature_set.h"
-#include "nnue/features/k.h"
-#include "nnue/features/p.h"
-
-#include "nnue/layers/input_slice.h"
-#include "nnue/layers/affine_transform.h"
-#include "nnue/layers/clipped_relu.h"
-
-namespace Eval::NNUE {
-
-    // Input features used in evaluation function
-    using RawFeatures = Features::FeatureSet<Features::K, Features::P>;
-
-    // Number of input feature dimensions after conversion
-    constexpr IndexType kTransformedFeatureDimensions = 256;
-
-    namespace Layers {
-
-        // define network structure
-        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
-
-    }  // namespace Layers
-
-    using Network = Layers::OutputLayer;
-
-}  // namespace Eval::NNUE
-#endif // K_P_256X2_32_32_H

From 72fee2f7a41d9c265a8d214c814dee1afb18e67c Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:06:32 +0100
Subject: [PATCH 333/398] Delete k-p-cr_256x2-32-32.h

---
 src/nnue/architectures/k-p-cr_256x2-32-32.h | 37 ---------------------
 1 file changed, 37 deletions(-)
 delete mode 100644 src/nnue/architectures/k-p-cr_256x2-32-32.h

diff --git a/src/nnue/architectures/k-p-cr_256x2-32-32.h b/src/nnue/architectures/k-p-cr_256x2-32-32.h
deleted file mode 100644
index 1db34b22..00000000
--- a/src/nnue/architectures/k-p-cr_256x2-32-32.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// Definition of input features and network structure used in NNUE evaluation function
-
-#ifndef K_P_CR_256X2_32_32_H
-#define K_P_CR_256X2_32_32_H
-
-#include "nnue/features/feature_set.h"
-#include "nnue/features/k.h"
-#include "nnue/features/p.h"
-#include "nnue/features/castling_right.h"
-
-#include "nnue/layers/input_slice.h"
-#include "nnue/layers/affine_transform.h"
-#include "nnue/layers/clipped_relu.h"
-
-namespace Eval::NNUE {
-
-    // Input features used in evaluation function
-    using RawFeatures = Features::FeatureSet<Features::K, Features::P,
-        Features::CastlingRight>;
-
-    // Number of input feature dimensions after conversion
-    constexpr IndexType kTransformedFeatureDimensions = 256;
-
-    namespace Layers {
-
-        // define network structure
-        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
-
-    }  // namespace Layers
-
-    using Network = Layers::OutputLayer;
-
-}  // namespace Eval::NNUE
-#endif // K_P_CR_256X2_32_32_H

From b27c51b5cf1d8440270db19a2d3c105c7950c91f Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:06:45 +0100
Subject: [PATCH 334/398] Delete k-p-cr-ep_256x2-32-32.h

---
 .../architectures/k-p-cr-ep_256x2-32-32.h     | 38 -------------------
 1 file changed, 38 deletions(-)
 delete mode 100644 src/nnue/architectures/k-p-cr-ep_256x2-32-32.h

diff --git a/src/nnue/architectures/k-p-cr-ep_256x2-32-32.h b/src/nnue/architectures/k-p-cr-ep_256x2-32-32.h
deleted file mode 100644
index 14eeba54..00000000
--- a/src/nnue/architectures/k-p-cr-ep_256x2-32-32.h
+++ /dev/null
@@ -1,38 +0,0 @@
-// Definition of input features and network structure used in NNUE evaluation function
-
-#ifndef K_P_CR_EP_256X2_32_32_H
-#define K_P_CR_EP_256X2_32_32_H
-
-#include "nnue/features/feature_set.h"
-#include "nnue/features/k.h"
-#include "nnue/features/p.h"
-#include "nnue/features/castling_right.h"
-#include "nnue/features/enpassant.h"
-
-#include "nnue/layers/input_slice.h"
-#include "nnue/layers/affine_transform.h"
-#include "nnue/layers/clipped_relu.h"
-
-namespace Eval::NNUE {
-
-    // Input features used in evaluation function
-    using RawFeatures = Features::FeatureSet<Features::K, Features::P,
-        Features::CastlingRight, Features::EnPassant>;
-
-    // Number of input feature dimensions after conversion
-    constexpr IndexType kTransformedFeatureDimensions = 256;
-
-    namespace Layers {
-
-        // define network structure
-        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
-
-    }  // namespace Layers
-
-    using Network = Layers::OutputLayer;
-
-}  // namespace Eval::NNUE
-#endif // K_P_CR_EP_256X2_32_32_H

From c04c5b6658b790c0cb75076517415200a32e3bba Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:11:02 +0100
Subject: [PATCH 335/398] Update nnue_common.h

---
 src/nnue/nnue_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index bd4294a3..9bce9fe9 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -71,7 +71,7 @@
 namespace Eval::NNUE {
 
     // Version of the evaluation file
-    constexpr std::uint32_t kVersion = 0x7AF32F17u;
+    constexpr std::uint32_t kVersion = 0x7AF32F16u;
 
     // Constant used in evaluation value calculation
     constexpr int FV_SCALE = 16;

From 5b3e9b0eb31b7b0ebf7031bedf7f8a11a9763483 Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:15:25 +0100
Subject: [PATCH 336/398] Update p.cpp

---
 src/nnue/features/p.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/nnue/features/p.cpp b/src/nnue/features/p.cpp
index 1621e8b2..a17e304f 100644
--- a/src/nnue/features/p.cpp
+++ b/src/nnue/features/p.cpp
@@ -4,9 +4,11 @@
 //Definition of input feature P of NNUE evaluation function
 namespace Eval::NNUE::Features {
 
-    // Orient a square according to perspective (flip rank for black)
+    // Orient a square according to perspective (rotate the board 180° for black)
+    // this has to stay until we find a better arch that works with "flip".
+    // allows us to use current master net for gensfen (primarily needed for higher quality data)
     inline Square orient(Color perspective, Square s) {
-        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+        return Square(int(s) ^ (bool(perspective) * 63));
     }
 
     // Find the index of the feature quantity from the king position and PieceSquare

From 36c801699f2b8f8243a587e3b37ba0f24ba86776 Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:15:54 +0100
Subject: [PATCH 337/398] Update k.cpp

---
 src/nnue/features/k.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/nnue/features/k.cpp b/src/nnue/features/k.cpp
index f01a6ce0..7b62a75a 100644
--- a/src/nnue/features/k.cpp
+++ b/src/nnue/features/k.cpp
@@ -4,9 +4,11 @@
 //Definition of input feature quantity K of NNUE evaluation function
 namespace Eval::NNUE::Features {
 
-    // Orient a square according to perspective (flip rank for black)
+    // Orient a square according to perspective (rotate the board 180° for black)
+    // this has to stay until we find a better arch that works with "flip".
+    // allows us to use current master net for gensfen (primarily needed for higher quality data)
     inline Square orient(Color perspective, Square s) {
-        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+        return Square(int(s) ^ (bool(perspective) * 63));
     }
 
     // Index of a feature for a given king position.

From 021f47b00eaf407854f5074ed804abdd1be620c7 Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:16:37 +0100
Subject: [PATCH 338/398] Update half_relative_kp.cpp

---
 src/nnue/features/half_relative_kp.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/nnue/features/half_relative_kp.cpp b/src/nnue/features/half_relative_kp.cpp
index 240e20c0..2ebccd59 100644
--- a/src/nnue/features/half_relative_kp.cpp
+++ b/src/nnue/features/half_relative_kp.cpp
@@ -4,9 +4,11 @@
 //Definition of input features HalfRelativeKP of NNUE evaluation function
 namespace Eval::NNUE::Features {
 
-    // Orient a square according to perspective (flip rank for black)
+    // Orient a square according to perspective (rotate the board 180° for black)
+    // this has to stay until we find a better arch that works with "flip".
+    // allows us to use current master net for gensfen (primarily needed for higher quality data)
     inline Square orient(Color perspective, Square s) {
-        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+        return Square(int(s) ^ (bool(perspective) * 63));
     }
 
     // Find the index of the feature quantity from the ball position and PieceSquare

From be4cd561467362de6d4648d8413044e13314678b Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:17:27 +0100
Subject: [PATCH 339/398] Update half_kp.cpp

---
 src/nnue/features/half_kp.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/nnue/features/half_kp.cpp b/src/nnue/features/half_kp.cpp
index 18e82004..743a6378 100644
--- a/src/nnue/features/half_kp.cpp
+++ b/src/nnue/features/half_kp.cpp
@@ -23,9 +23,11 @@
 
 namespace Eval::NNUE::Features {
 
-    // Orient a square according to perspective (flip rank for black)
+    // Orient a square according to perspective (rotate the board 180° for black)
+    // this has to stay until we find a better arch that works with "flip".
+    // allows us to use current master net for gensfen (primarily needed for higher quality data)
     inline Square orient(Color perspective, Square s) {
-        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+        return Square(int(s) ^ (bool(perspective) * 63));
     }
 
     // Find the index of the feature quantity from the king position and PieceSquare

From f832aa6b6becb9b5b88e120b78db8936ef028962 Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:19:36 +0100
Subject: [PATCH 340/398] Update evaluate.h

---
 src/evaluate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/evaluate.h b/src/evaluate.h
index fc626698..f5d3efa7 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -32,7 +32,7 @@ namespace Eval {
   // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work. Do not change the
   // name of the macro, as it is used in the Makefile.
-  #define EvalFileDefaultName   "nn-98a7585c85e9.nnue"
+  #define EvalFileDefaultName   "nn-c3ca321c51c9.nnue"
 
 } // namespace Eval
 

From 777c3a08ab5ab248958b45955053d1b735c7eb42 Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:30:17 +0100
Subject: [PATCH 341/398] Update README.md

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 5fa8179e..d894e649 100644
--- a/README.md
+++ b/README.md
@@ -113,6 +113,8 @@ There is a builting converted that support all 3 formats described above. Any of
 
 ## Resources
 
+- [Training NNUE for SF](https://docs.google.com/document/d/1os5GH8GGJbV0nKAfXD-qySBclFzKKtXKHbAnA-un8tA/edit) google document with important information and coding priorities
+- [Gensfen data (vondele)](https://drive.google.com/drive/folders/1mftuzYdl9o6tBaceR3d_VBQIrgKJsFpl) over 2b fens available
 - [Stockfish NNUE Wiki](https://www.qhapaq.org/shogi/shogiwiki/stockfish-nnue/)
 - [Training instructions](https://twitter.com/mktakizawa/status/1273042640280252416) from the creator of the Elmo shogi engine
 - [Original Talkchess thread](http://talkchess.com/forum3/viewtopic.php?t=74059) discussing Stockfish NNUE

From ea70e378cdf15ea44ba943cc0a1257eb9a31d55a Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:47:44 +0100
Subject: [PATCH 342/398] Update a.cpp

---
 src/nnue/features/a.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/nnue/features/a.cpp b/src/nnue/features/a.cpp
index 6ceb4efa..1bfb583f 100644
--- a/src/nnue/features/a.cpp
+++ b/src/nnue/features/a.cpp
@@ -4,9 +4,13 @@
 // Definition of input feature A of NNUE evaluation function
 namespace Eval::NNUE::Features {
 
-    // Orient a square according to perspective (flip rank for black)
+    // Orient a square according to perspective (rotate the board 180° for black)
+    // Important note for "halfka": this arch was designed with "flip" in mind 
+    // although it still is untested which approach is better.
+    // this has to stay until we find a better arch that works with "flip".
+    // allows us to use current master net for gensfen (primarily needed for higher quality data)
     inline Square orient(Color perspective, Square s) {
-        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+        return Square(int(s) ^ (bool(perspective) * 63));
     }
 
     // Find the index of the feature quantity from the king position and PieceSquare

From b0429237a86ea303d087e433783360f5858fb0f6 Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:48:18 +0100
Subject: [PATCH 343/398] Update half_ka.cpp

---
 src/nnue/features/half_ka.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/nnue/features/half_ka.cpp b/src/nnue/features/half_ka.cpp
index 83e59067..08124b96 100644
--- a/src/nnue/features/half_ka.cpp
+++ b/src/nnue/features/half_ka.cpp
@@ -23,9 +23,13 @@
 
 namespace Eval::NNUE::Features {
 
-    // Orient a square according to perspective (flip rank for black)
+    // Orient a square according to perspective (rotate the board 180° for black)
+    // Important note for "halfka": this arch was designed with "flip" in mind 
+    // although it still is untested which approach is better.
+    // this has to stay until we find a better arch that works with "flip".
+    // allows us to use current master net for gensfen (primarily needed for higher quality data)
     inline Square orient(Color perspective, Square s) {
-        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+        return Square(int(s) ^ (bool(perspective) * 63));
     }
 
     // Find the index of the feature quantity from the king position and PieceSquare

From 3975fc9c0dc1f896ae20339bb14c83572971f9a6 Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:49:02 +0100
Subject: [PATCH 344/398] Update half_relative_ka.cpp

---
 src/nnue/features/half_relative_ka.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/nnue/features/half_relative_ka.cpp b/src/nnue/features/half_relative_ka.cpp
index ba3edbcf..d2ad31e6 100644
--- a/src/nnue/features/half_relative_ka.cpp
+++ b/src/nnue/features/half_relative_ka.cpp
@@ -4,9 +4,13 @@
 //Definition of input features HalfRelativeKA of NNUE evaluation function
 namespace Eval::NNUE::Features {
 
-    // Orient a square according to perspective (flip rank for black)
+    // Orient a square according to perspective (rotate the board 180° for black)
+    // Important note for "halfka": this arch was designed with "flip" in mind 
+    // although it still is untested which approach is better.
+    // this has to stay until we find a better arch that works with "flip".
+    // allows us to use current master net for gensfen (primarily needed for higher quality data)
     inline Square orient(Color perspective, Square s) {
-        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+        return Square(int(s) ^ (bool(perspective) * 63));
     }
 
     // Find the index of the feature quantity from the ball position and PieceSquare

From 38d19eca143d1e0f6b2f42b8eafd27078d11f164 Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 23:58:04 +0100
Subject: [PATCH 345/398] Update instrumented.sh

---
 tests/instrumented.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/instrumented.sh b/tests/instrumented.sh
index 07ecbb9c..dffc257a 100755
--- a/tests/instrumented.sh
+++ b/tests/instrumented.sh
@@ -24,7 +24,7 @@ case $1 in
     echo "valgrind-thread testing started"
     prefix=''
     exeprefix='valgrind --fair-sched=try --error-exitcode=42'
-    postfix='1>/dev/null'
+    postfix=''
     threads="2"
     bench_depth=5
     go_depth=10

From d43cd104b6549c2372d85c95410cfb8d16cfeb33 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <ts.tomeksopel@gmail.com>
Date: Sat, 21 Nov 2020 21:14:15 +0100
Subject: [PATCH 346/398] Fix uninitialized variable when searching from a
 terminal position.

---
 src/thread.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/thread.cpp b/src/thread.cpp
index e867048d..f035186b 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -232,6 +232,9 @@ void ThreadPool::start_thinking(Position& pos, StateListPtr& states,
       th->rootMoves = rootMoves;
       th->rootPos.set(pos.fen(), pos.is_chess960(), &th->rootState, th);
       th->rootState = setupStates->back();
+      // This is also set by rank_root_moves but we need to set it
+      // also when there is no legal moves.
+      th->rootInTB = false;
       th->UseRule50 = bool(Options["Syzygy50MoveRule"]);
       th->ProbeDepth = int(Options["SyzygyProbeDepth"]);
       th->Cardinality = int(Options["SyzygyProbeLimit"]);

From 3cee6881ee1639bd22d89ef43387c83c95f5067e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 22 Nov 2020 17:18:06 +0100
Subject: [PATCH 347/398] Move the terminal position check to after qsearch,
 otherwise qsearch may end up in a terminal position.

---
 src/learn/learn.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 3942b606..cab5a9b5 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -685,10 +685,6 @@ namespace Learner
                 int ply = 0;
                 pos.do_move((Move)ps.move, state[ply++]);
 
-                // We want to position being trained on not to be terminal
-                if (MoveList<LEGAL>(pos).size() == 0)
-                    goto RETRY_READ;
-
                 // Evaluation value of shallow search (qsearch)
                 const auto [_, pv] = Search::qsearch(pos);
 
@@ -698,6 +694,10 @@ namespace Learner
                 }
             }
 
+            // We want to position being trained on not to be terminal
+            if (MoveList<LEGAL>(pos).size() == 0)
+                goto RETRY_READ;
+
             // Since we have reached the end phase of PV, add the slope here.
             pos_add_grad();
         }

From ee13cfce67222faafca4e93f8af39fad3429d4bd Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 22 Nov 2020 17:02:33 +0100
Subject: [PATCH 348/398] Fix result assigned for a psvector when the positions
 are not continuous.

---
 src/learn/gensfen.cpp | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index b265da71..5f8bbba1 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -191,7 +191,8 @@ namespace Learner
             PSVector& sfens,
             int8_t lastTurnIsWin,
             std::atomic<uint64_t>& counter,
-            uint64_t limit);
+            uint64_t limit,
+            Color result_color);
 
         void report(uint64_t done, uint64_t new_done);
 
@@ -291,7 +292,7 @@ namespace Learner
             vector<int> move_hist_scores;
 
             auto flush_psv = [&](int8_t result) {
-                quit = commit_psv(th, packed_sfens, result, counter, limit);
+                quit = commit_psv(th, packed_sfens, result, counter, limit, pos.side_to_move());
             };
 
             for (int ply = 0; ; ++ply)
@@ -717,7 +718,8 @@ namespace Learner
         PSVector& sfens,
         int8_t result,
         std::atomic<uint64_t>& counter,
-        uint64_t limit)
+        uint64_t limit,
+        Color result_color)
     {
         if (!params.write_out_draw_game_in_training_data_generation && result == 0)
         {
@@ -725,13 +727,17 @@ namespace Learner
             return false;
         }
 
+        auto side_to_move_from_sfen = [](auto& sfen){
+            return (Color)(sfen.sfen.data[0] & 1);
+        };
+
         // From the final stage (one step before) to the first stage, give information on the outcome of the game for each stage.
         // The phases stored in sfens are assumed to be continuous (in order).
         for (auto it = sfens.rbegin(); it != sfens.rend(); ++it)
         {
-            // If is_win == 0 (draw), multiply by -1 and it will remain 0 (draw)
-            result = -result;
-            it->game_result = result;
+            // The side to move is packed as the lowest bit of the first byte
+            const Color side_to_move = side_to_move_from_sfen(*it);
+            it->game_result = side_to_move == result_color ? result : -result;
         }
 
         // Write sfens in move order to make potential compression easier

From 9030020a854f81b4441c3f0157e66ab72b5c02af Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 22 Nov 2020 18:05:59 +0100
Subject: [PATCH 349/398] Add smart_fen_skipping option to learn.

---
 src/learn/learn.cpp | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index cab5a9b5..f7358f8e 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -400,6 +400,7 @@ namespace Learner
             bool skip_duplicated_positions_in_training = true;
 
             bool assume_quiet = false;
+            bool smart_fen_skipping = false;
 
             double learning_rate = 1.0;
 
@@ -680,7 +681,8 @@ namespace Learner
                 goto RETRY_READ;
             }
 
-            if (!params.assume_quiet)
+            // We don't need to qsearch when doing smart skipping
+            if (!params.assume_quiet && !params.smart_fen_skipping)
             {
                 int ply = 0;
                 pos.do_move((Move)ps.move, state[ply++]);
@@ -694,6 +696,13 @@ namespace Learner
                 }
             }
 
+            if (params.smart_fen_skipping
+                && (pos.capture_or_promotion((Move)ps.move)
+                    || pos.checkers()))
+            {
+                goto RETRY_READ;
+            }
+            
             // We want to position being trained on not to be terminal
             if (MoveList<LEGAL>(pos).size() == 0)
                 goto RETRY_READ;
@@ -1115,6 +1124,7 @@ namespace Learner
             }
             else if (option == "verbose") params.verbose = true;
             else if (option == "assume_quiet") params.assume_quiet = true;
+            else if (option == "smart_fen_skipping") params.smart_fen_skipping = true;
             else
             {
                 out << "INFO: Unknown option: " << option << ". Ignoring.\n";

From 45e3335ee843e11b838efbc507214e5bcd7313a4 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 22 Nov 2020 18:08:14 +0100
Subject: [PATCH 350/398] Add missing docs.

---
 docs/gensfen.md | 2 ++
 docs/learn.md   | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/docs/gensfen.md b/docs/gensfen.md
index 16fead59..48f7f5e7 100644
--- a/docs/gensfen.md
+++ b/docs/gensfen.md
@@ -62,4 +62,6 @@ Currently the following options are available:
 
 `sfen_format` - format of the training data to use. Either `bin` or `binpack`. Default: `binpack`.
 
+`ensure_quiet` - this is a flag option. When specified the positions will be from the qsearch leaf.
+
 `seed` - seed for the PRNG. Can be either a number or a string. If it's a string then its hash will be used. If not specified then the current time will be used.
diff --git a/docs/learn.md b/docs/learn.md
index 6de81521..30a7c951 100644
--- a/docs/learn.md
+++ b/docs/learn.md
@@ -64,6 +64,10 @@ Currently the following options are available:
 
 `newbob_decay` - learning rate will be multiplied by this factor every time a net is rejected (so in other words it controls LR drops). Default: 0.5 (no LR drops)
 
+`assume_quiet` - this is a flag option. When specified learn will not perform qsearch to reach a quiet position.
+
+`smart_fen_skipping` - this is a flag option. When specified some position that are not good candidates for teaching are skipped. This includes positions where the best move is a capture or promotion, and position where a king is in check.
+
 `newbob_num_trials` - determines after how many subsequent rejected nets the training process will be terminated. Default: 4.
 
 `auto_lr_drop` - every time this many positions are processed the learning rate is multiplied by `newbob_decay`. In other words this value specifies for how many positions a single learning rate stage lasts. If 0 then doesn't have any effect. Default: 0.

From 027626db1e449597ba2211a0819f251beda37b88 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ste=CC=81phane=20Nicolet?= <cassio@free.fr>
Date: Thu, 12 Nov 2020 14:05:28 +0100
Subject: [PATCH 351/398] Small cleanups 13

No functional change
---
 AUTHORS                             |  2 +-
 src/evaluate.cpp                    | 10 +++++-----
 src/misc.cpp                        |  3 +--
 src/nnue/nnue_feature_transformer.h |  2 +-
 src/pawns.cpp                       |  4 ++--
 src/search.cpp                      |  2 +-
 src/types.h                         |  4 ++--
 7 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index f30be4de..71b718b8 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -44,6 +44,7 @@ Daniel Dugovic (ddugovic)
 Dariusz Orzechowski (dorzechowski)
 David Zar
 Daylen Yang (daylen)
+Deshawn Mohan-Smith (GoldenRare)
 DiscanX
 Dominik Schlösser (domschl)
 double-beep
@@ -64,7 +65,6 @@ Gary Heckman (gheckman)
 George Sobala (gsobala)
 gguliash
 Gian-Carlo Pascutto (gcp)
-Deshawn Mohan-Smith (GoldenRare)
 Gontran Lemaire (gonlem)
 Goodkov Vasiliy Aleksandrovich (goodkov)
 Gregor Cramer
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 34ebe6c3..1a8cf662 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -1025,7 +1025,7 @@ Value Eval::evaluate(const Position& pos) {
   {
       // Scale and shift NNUE for compatibility with search and classical evaluation
       auto  adjusted_NNUE = [&](){
-         int mat = pos.non_pawn_material() + PieceValue[MG][PAWN] * pos.count<PAWN>();
+         int mat = pos.non_pawn_material() + PawnValueMg * pos.count<PAWN>();
          return NNUE::evaluate(pos) * (720 + mat / 32) / 1024 + Tempo;
       };
 
@@ -1041,10 +1041,10 @@ Value Eval::evaluate(const Position& pos) {
       // For the case of opposite colored bishops, switch to NNUE eval with
       // small probability if the classical eval is less than the threshold.
       if (   largePsq
-          && (abs(v) * 16 < NNUEThreshold2 * r50
-          || (   pos.opposite_bishops()
-              && abs(v) * 16 < (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50
-              && !(pos.this_thread()->nodes & 0xB))))
+          && (   abs(v) * 16 < NNUEThreshold2 * r50
+              || (   pos.opposite_bishops()
+                  && abs(v) * 16 < (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50
+                  && !(pos.this_thread()->nodes & 0xB))))
           v = adjusted_NNUE();
   }
 
diff --git a/src/misc.cpp b/src/misc.cpp
index a16a6e90..f2bce6b0 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -583,11 +583,10 @@ namespace CommandLine {
 string argv0;            // path+name of the executable binary, as given by argv[0]
 string binaryDirectory;  // path of the executable directory
 string workingDirectory; // path of the working directory
-string pathSeparator;    // Separator for our current OS
 
 void init(int argc, char* argv[]) {
     (void)argc;
-    string separator;
+    string pathSeparator;
 
     // extract the path+name of the executable binary
     argv0 = argv[0];
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index f49777b5..85bc2bc8 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -247,7 +247,7 @@ namespace Eval::NNUE {
       // Look for a usable accumulator of an earlier position. We keep track
       // of the estimated gain in terms of features to be added/subtracted.
       StateInfo *st = pos.state(), *next = nullptr;
-      int gain = popcount(pos.pieces()) - 2;
+      int gain = pos.count<ALL_PIECES>() - 2;
       while (st->accumulator.state[c] == EMPTY)
       {
         auto& dp = st->dirtyPiece;
diff --git a/src/pawns.cpp b/src/pawns.cpp
index fde70ba5..68aaf331 100644
--- a/src/pawns.cpp
+++ b/src/pawns.cpp
@@ -176,8 +176,8 @@ namespace {
             score -=  Doubled * doubled
                     + WeakLever * more_than_one(lever);
 
-        if (blocked && r > RANK_4)
-            score += BlockedPawn[r-4];
+        if (blocked && r >= RANK_5)
+            score += BlockedPawn[r - RANK_5];
     }
 
     return score;
diff --git a/src/search.cpp b/src/search.cpp
index 66ef5043..78a1f7b6 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1058,7 +1058,7 @@ moves_loop: // When in check, search starts from here
                   && captureHistory[movedPiece][to_sq(move)][type_of(pos.piece_on(to_sq(move)))] < 0)
                   continue;
 
-              // See based pruning
+              // SEE based pruning
               if (!pos.see_ge(move, Value(-221) * depth)) // (~25 Elo)
                   continue;
           }
diff --git a/src/types.h b/src/types.h
index bf692f7e..8506b06e 100644
--- a/src/types.h
+++ b/src/types.h
@@ -202,8 +202,8 @@ enum PieceType {
 
 enum Piece {
   NO_PIECE,
-  W_PAWN = 1, W_KNIGHT, W_BISHOP, W_ROOK, W_QUEEN, W_KING,
-  B_PAWN = 9, B_KNIGHT, B_BISHOP, B_ROOK, B_QUEEN, B_KING,
+  W_PAWN = PAWN,     W_KNIGHT, W_BISHOP, W_ROOK, W_QUEEN, W_KING,
+  B_PAWN = PAWN + 8, B_KNIGHT, B_BISHOP, B_ROOK, B_QUEEN, B_KING,
   PIECE_NB = 16
 };
 

From 9fb6383ed804d0bc86d52b07def14352f44eb5b4 Mon Sep 17 00:00:00 2001
From: Unai Corzo <corzounai@gmail.com>
Date: Tue, 24 Nov 2020 17:06:30 +0100
Subject: [PATCH 352/398] Assorted search and eval parameter tune

Search and eval parameter tune.

STC https://tests.stockfishchess.org/tests/view/5fba850a67cbf42301d6b07d
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 24312 W: 2388 L: 2228 D: 19696
Ptnml(0-2): 85, 1800, 8241, 1930, 100

LTC https://tests.stockfishchess.org/tests/view/5fbad5ea67cbf42301d6b0fa
LLR: 2.95 (-2.94,2.94) {0.25,1.25}
Total: 88376 W: 3619 L: 3351 D: 81406
Ptnml(0-2): 56, 2977, 37849, 3255, 51

closes https://github.com/official-stockfish/Stockfish/pull/3232

bench: 3600361
---
 src/evaluate.cpp | 10 +++++-----
 src/search.cpp   | 34 +++++++++++++++++-----------------
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 1a8cf662..3d887119 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -187,11 +187,11 @@ using namespace Trace;
 namespace {
 
   // Threshold for lazy and space evaluation
-  constexpr Value LazyThreshold1 =  Value(1400);
-  constexpr Value LazyThreshold2 =  Value(1300);
-  constexpr Value SpaceThreshold = Value(12222);
-  constexpr Value NNUEThreshold1 =   Value(550);
-  constexpr Value NNUEThreshold2 =   Value(150);
+  constexpr Value LazyThreshold1 =  Value(1565);
+  constexpr Value LazyThreshold2 =  Value(1102);
+  constexpr Value SpaceThreshold = Value(11551);
+  constexpr Value NNUEThreshold1 =   Value(682);
+  constexpr Value NNUEThreshold2 =   Value(176);
 
   // KingAttackWeights[PieceType] contains king attack weights by piece type
   constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 };
diff --git a/src/search.cpp b/src/search.cpp
index 78a1f7b6..7c797bef 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -65,7 +65,7 @@ namespace {
   // Razor and futility margins
   constexpr int RazorMargin = 510;
   Value futility_margin(Depth d, bool improving) {
-    return Value(223 * (d - improving));
+    return Value(234 * (d - improving));
   }
 
   // Reductions lookup table, initialized at startup
@@ -73,7 +73,7 @@ namespace {
 
   Depth reduction(bool i, Depth d, int mn) {
     int r = Reductions[d] * Reductions[mn];
-    return (r + 509) / 1024 + (!i && r > 894);
+    return (r + 503) / 1024 + (!i && r > 915);
   }
 
   constexpr int futility_move_count(bool improving, Depth depth) {
@@ -194,7 +194,7 @@ namespace {
 void Search::init() {
 
   for (int i = 1; i < MAX_MOVES; ++i)
-      Reductions[i] = int((22.0 + 2 * std::log(Threads.size())) * std::log(i + 0.25 * std::log(i)));
+      Reductions[i] = int((21.3 + 2 * std::log(Threads.size())) * std::log(i + 0.25 * std::log(i)));
 }
 
 
@@ -410,7 +410,7 @@ void Thread::search() {
               beta  = std::min(prev + delta, VALUE_INFINITE);
 
               // Adjust contempt based on root move's previousScore (dynamic contempt)
-              int dct = ct + (105 - ct / 2) * prev / (abs(prev) + 149);
+              int dct = ct + (113 - ct / 2) * prev / (abs(prev) + 147);
 
               contempt = (us == WHITE ?  make_score(dct, dct / 2)
                                       : -make_score(dct, dct / 2));
@@ -830,7 +830,7 @@ namespace {
         && (ss-1)->statScore < 22977
         &&  eval >= beta
         &&  eval >= ss->staticEval
-        &&  ss->staticEval >= beta - 30 * depth - 28 * improving + 84 * ss->ttPv + 182
+        &&  ss->staticEval >= beta - 30 * depth - 28 * improving + 84 * ss->ttPv + 168
         && !excludedMove
         &&  pos.non_pawn_material(us)
         && (ss->ply >= thisThread->nmpMinPly || us != thisThread->nmpColor))
@@ -838,7 +838,7 @@ namespace {
         assert(eval - beta >= 0);
 
         // Null move dynamic reduction based on depth and value
-        Depth R = (982 + 85 * depth) / 256 + std::min(int(eval - beta) / 192, 3);
+        Depth R = (1015 + 85 * depth) / 256 + std::min(int(eval - beta) / 191, 3);
 
         ss->currentMove = MOVE_NULL;
         ss->continuationHistory = &thisThread->continuationHistory[0][0][NO_PIECE][0];
@@ -855,7 +855,7 @@ namespace {
             if (nullValue >= VALUE_TB_WIN_IN_MAX_PLY)
                 nullValue = beta;
 
-            if (thisThread->nmpMinPly || (abs(beta) < VALUE_KNOWN_WIN && depth < 13))
+            if (thisThread->nmpMinPly || (abs(beta) < VALUE_KNOWN_WIN && depth < 14))
                 return nullValue;
 
             assert(!thisThread->nmpMinPly); // Recursive verification is not allowed
@@ -874,7 +874,7 @@ namespace {
         }
     }
 
-    probCutBeta = beta + 176 - 49 * improving;
+    probCutBeta = beta + 183 - 49 * improving;
 
     // Step 10. ProbCut (~10 Elo)
     // If we have a good enough capture and a reduced search returns a value
@@ -1039,7 +1039,7 @@ moves_loop: // When in check, search starts from here
               // Futility pruning: parent node (~5 Elo)
               if (   lmrDepth < 7
                   && !ss->inCheck
-                  && ss->staticEval + 283 + 170 * lmrDepth <= alpha
+                  && ss->staticEval + 266 + 170 * lmrDepth <= alpha
                   &&  (*contHist[0])[movedPiece][to_sq(move)]
                     + (*contHist[1])[movedPiece][to_sq(move)]
                     + (*contHist[3])[movedPiece][to_sq(move)]
@@ -1047,7 +1047,7 @@ moves_loop: // When in check, search starts from here
                   continue;
 
               // Prune moves with negative SEE (~20 Elo)
-              if (!pos.see_ge(move, Value(-(29 - std::min(lmrDepth, 18)) * lmrDepth * lmrDepth)))
+              if (!pos.see_ge(move, Value(-(30 - std::min(lmrDepth, 18)) * lmrDepth * lmrDepth)))
                   continue;
           }
           else
@@ -1059,7 +1059,7 @@ moves_loop: // When in check, search starts from here
                   continue;
 
               // SEE based pruning
-              if (!pos.see_ge(move, Value(-221) * depth)) // (~25 Elo)
+              if (!pos.see_ge(move, Value(-213) * depth)) // (~25 Elo)
                   continue;
           }
       }
@@ -1153,12 +1153,12 @@ moves_loop: // When in check, search starts from here
               || moveCountPruning
               || ss->staticEval + PieceValue[EG][pos.captured_piece()] <= alpha
               || cutNode
-              || thisThread->ttHitAverage < 427 * TtHitAverageResolution * TtHitAverageWindow / 1024))
+              || thisThread->ttHitAverage < 432 * TtHitAverageResolution * TtHitAverageWindow / 1024))
       {
           Depth r = reduction(improving, depth, moveCount);
 
           // Decrease reduction if the ttHit running average is large
-          if (thisThread->ttHitAverage > 509 * TtHitAverageResolution * TtHitAverageWindow / 1024)
+          if (thisThread->ttHitAverage > 537 * TtHitAverageResolution * TtHitAverageWindow / 1024)
               r--;
 
           // Increase reduction if other threads are searching this position
@@ -1211,10 +1211,10 @@ moves_loop: // When in check, search starts from here
                              - 5287;
 
               // Decrease/increase reduction by comparing opponent's stat score (~10 Elo)
-              if (ss->statScore >= -106 && (ss-1)->statScore < -104)
+              if (ss->statScore >= -105 && (ss-1)->statScore < -103)
                   r--;
 
-              else if ((ss-1)->statScore >= -119 && ss->statScore < -140)
+              else if ((ss-1)->statScore >= -122 && ss->statScore < -129)
                   r++;
 
               // Decrease/increase reduction for moves with a good/bad history (~30 Elo)
@@ -1228,7 +1228,7 @@ moves_loop: // When in check, search starts from here
 
               // Unless giving check, this capture is likely bad
               if (   !givesCheck
-                  && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 213 * depth <= alpha)
+                  && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 210 * depth <= alpha)
                   r++;
           }
 
@@ -1502,7 +1502,7 @@ moves_loop: // When in check, search starts from here
         if (PvNode && bestValue > alpha)
             alpha = bestValue;
 
-        futilityBase = bestValue + 145;
+        futilityBase = bestValue + 155;
     }
 
     const PieceToHistory* contHist[] = { (ss-1)->continuationHistory, (ss-2)->continuationHistory,

From 7615e3485e75c2f1715d372f7bb1f546738a5c76 Mon Sep 17 00:00:00 2001
From: MaximMolchanov <maksym.n.molchanov@gmail.com>
Date: Sat, 14 Nov 2020 02:55:29 +0200
Subject: [PATCH 353/398] Calculate sum from first elements

in affine transform for AVX512/AVX2/SSSE3

The idea is to initialize sum with the first element instead of zero.
Reduce one add_epi32 and one set_zero SIMD instructions for each output dimension.

sum = 0; for i = 1 to n sum += a[i] ->
sum = a[1]; for i = 2 to n sum += a[i]

STC:
LLR: 2.95 (-2.94,2.94) {-0.25,1.25}
Total: 69048 W: 7024 L: 6799 D: 55225
Ptnml(0-2): 260, 5175, 23458, 5342, 289
https://tests.stockfishchess.org/tests/view/5faf2cf467cbf42301d6aa06

closes https://github.com/official-stockfish/Stockfish/pull/3227

No functional change.
---
 AUTHORS                            |   1 +
 src/nnue/layers/affine_transform.h | 211 ++++++++++++++++++++---------
 2 files changed, 148 insertions(+), 64 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index 71b718b8..b31a36e9 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -112,6 +112,7 @@ Mark Tenzer (31m059)
 marotear
 Matthew Lai (matthewlai)
 Matthew Sullivan (Matt14916)
+Maxim Molchanov (Maxim)
 Michael An (man)
 Michael Byrne (MichaelB7)
 Michael Chaly (Vizvezdenec)
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index 47c9c488..caf315b2 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -181,13 +181,13 @@ namespace Eval::NNUE::Layers {
         return _mm512_add_epi32(_mm512_permutexvar_epi32(indices, x), bias);
       };
 
-      [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) {
 #if defined (USE_VNNI)
+      [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) {
         acc = _mm512_dpbusd_epi32(acc, a, b);
 #else
+      [[maybe_unused]] auto m512_dpbusd_epi32 = [=](__m512i a, __m512i b) -> __m512i {
         __m512i product0 = _mm512_maddubs_epi16(a, b);
-        product0 = _mm512_madd_epi16(product0, kOnes512);
-        acc = _mm512_add_epi32(acc, product0);
+        return _mm512_madd_epi16(product0, kOnes512);
 #endif
       };
 
@@ -214,14 +214,13 @@ namespace Eval::NNUE::Layers {
 
         return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias);
       };
-
-      [[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) {
 #if defined (USE_VNNI)
+      [[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) {
         acc = _mm256_dpbusd_epi32(acc, a, b);
 #else
+      [[maybe_unused]] auto m256_dpbusd_epi32 = [=](__m256i a, __m256i b) -> __m256i {
         __m256i product0 = _mm256_maddubs_epi16(a, b);
-        product0 = _mm256_madd_epi16(product0, kOnes256);
-        acc = _mm256_add_epi32(acc, product0);
+        return _mm256_madd_epi16(product0, kOnes256);
 #endif
       };
 
@@ -246,10 +245,9 @@ namespace Eval::NNUE::Layers {
         return _mm_add_epi32(sum0, bias);
       };
 
-      [[maybe_unused]] auto m128_add_dpbusd_epi32 = [=](__m128i& acc, __m128i a, __m128i b) {
+      [[maybe_unused]] auto m128_dpbusd_epi32 = [=](__m128i a, __m128i b) -> __m128i {
         __m128i product0 = _mm_maddubs_epi16(a, b);
-        product0 = _mm_madd_epi16(product0, kOnes128);
-        acc = _mm_add_epi32(acc, product0);
+        return _mm_madd_epi16(product0, kOnes128);
       };
 
 #endif
@@ -293,15 +291,6 @@ namespace Eval::NNUE::Layers {
           const __m512i bias = *reinterpret_cast<const __m512i*>(&biases_[i]);
           __m512i* outptr = reinterpret_cast<__m512i*>(&output[i]);
 
-          __m512i sum01a = _mm512_setzero_si512();
-          __m512i sum23a = _mm512_setzero_si512();
-          __m512i sum45a = _mm512_setzero_si512();
-          __m512i sum67a = _mm512_setzero_si512();
-          __m512i sum01b = _mm512_setzero_si512();
-          __m512i sum23b = _mm512_setzero_si512();
-          __m512i sum45b = _mm512_setzero_si512();
-          __m512i sum67b = _mm512_setzero_si512();
-
           const auto row01a = *reinterpret_cast<const __m512i*>(&weights_[offset01a]);
           const auto row23a = *reinterpret_cast<const __m512i*>(&weights_[offset23a]);
           const auto row45a = *reinterpret_cast<const __m512i*>(&weights_[offset45a]);
@@ -314,6 +303,16 @@ namespace Eval::NNUE::Layers {
           const __m256i in256 = input_vector256[0];
           const __m512i in = _mm512_inserti64x4(_mm512_castsi256_si512(in256), in256, 1);
 
+#if defined (USE_VNNI)
+          __m512i sum01a = _mm512_setzero_si512();
+          __m512i sum23a = _mm512_setzero_si512();
+          __m512i sum45a = _mm512_setzero_si512();
+          __m512i sum67a = _mm512_setzero_si512();
+          __m512i sum01b = _mm512_setzero_si512();
+          __m512i sum23b = _mm512_setzero_si512();
+          __m512i sum45b = _mm512_setzero_si512();
+          __m512i sum67b = _mm512_setzero_si512();
+
           m512_add_dpbusd_epi32(sum01a, in, row01a);
           m512_add_dpbusd_epi32(sum23a, in, row23a);
           m512_add_dpbusd_epi32(sum45a, in, row45a);
@@ -322,6 +321,16 @@ namespace Eval::NNUE::Layers {
           m512_add_dpbusd_epi32(sum23b, in, row23b);
           m512_add_dpbusd_epi32(sum45b, in, row45b);
           m512_add_dpbusd_epi32(sum67b, in, row67b);
+#else
+          __m512i sum01a = m512_dpbusd_epi32(in, row01a);
+          __m512i sum23a = m512_dpbusd_epi32(in, row23a);
+          __m512i sum45a = m512_dpbusd_epi32(in, row45a);
+          __m512i sum67a = m512_dpbusd_epi32(in, row67a);
+          __m512i sum01b = m512_dpbusd_epi32(in, row01b);
+          __m512i sum23b = m512_dpbusd_epi32(in, row23b);
+          __m512i sum45b = m512_dpbusd_epi32(in, row45b);
+          __m512i sum67b = m512_dpbusd_epi32(in, row67b);
+#endif
 
           *outptr = m512_hadd256x16(
             sum01a, sum23a, sum45a, sum67a,
@@ -342,48 +351,80 @@ namespace Eval::NNUE::Layers {
 
           if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0)
           {
-            __m512i sum0 = _mm512_setzero_si512();
-            __m512i sum1 = _mm512_setzero_si512();
-            __m512i sum2 = _mm512_setzero_si512();
-            __m512i sum3 = _mm512_setzero_si512();
-
             const auto row0 = reinterpret_cast<const __m512i*>(&weights_[offset0]);
             const auto row1 = reinterpret_cast<const __m512i*>(&weights_[offset1]);
             const auto row2 = reinterpret_cast<const __m512i*>(&weights_[offset2]);
             const auto row3 = reinterpret_cast<const __m512i*>(&weights_[offset3]);
 
-            for (IndexType j = 0; j < kNumChunks512; ++j)
+#if defined (USE_VNNI)
+            __m512i sum0 = _mm512_setzero_si512();
+            __m512i sum1 = _mm512_setzero_si512();
+            __m512i sum2 = _mm512_setzero_si512();
+            __m512i sum3 = _mm512_setzero_si512();
+            const IndexType kStart = 0;
+#else
+            __m512i sum0 = m512_dpbusd_epi32(input_vector512[0], row0[0]);
+            __m512i sum1 = m512_dpbusd_epi32(input_vector512[0], row1[0]);
+            __m512i sum2 = m512_dpbusd_epi32(input_vector512[0], row2[0]);
+            __m512i sum3 = m512_dpbusd_epi32(input_vector512[0], row3[0]);
+            const IndexType kStart = 1;
+#endif
+
+            for (IndexType j = kStart; j < kNumChunks512; ++j)
             {
               const __m512i in = input_vector512[j];
 
+#if defined (USE_VNNI)
               m512_add_dpbusd_epi32(sum0, in, row0[j]);
               m512_add_dpbusd_epi32(sum1, in, row1[j]);
               m512_add_dpbusd_epi32(sum2, in, row2[j]);
               m512_add_dpbusd_epi32(sum3, in, row3[j]);
+#else
+              sum0 = _mm512_add_epi32(sum0, m512_dpbusd_epi32(in, row0[j]));
+              sum1 = _mm512_add_epi32(sum1, m512_dpbusd_epi32(in, row1[j]));
+              sum2 = _mm512_add_epi32(sum2, m512_dpbusd_epi32(in, row2[j]));
+              sum3 = _mm512_add_epi32(sum3, m512_dpbusd_epi32(in, row3[j]));
+#endif
             }
 
             *outptr = m512_haddx4(sum0, sum1, sum2, sum3, bias);
           }
           else
           {
-            __m256i sum0 = _mm256_setzero_si256();
-            __m256i sum1 = _mm256_setzero_si256();
-            __m256i sum2 = _mm256_setzero_si256();
-            __m256i sum3 = _mm256_setzero_si256();
-
             const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]);
             const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]);
             const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]);
             const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]);
 
-            for (IndexType j = 0; j < kNumChunks256; ++j)
+#if defined (USE_VNNI)
+            __m256i sum0 = _mm256_setzero_si256();
+            __m256i sum1 = _mm256_setzero_si256();
+            __m256i sum2 = _mm256_setzero_si256();
+            __m256i sum3 = _mm256_setzero_si256();
+            const IndexType kStart = 0;
+#else
+            __m256i sum0 = m256_dpbusd_epi32(input_vector256[0], row0[0]);
+            __m256i sum1 = m256_dpbusd_epi32(input_vector256[0], row1[0]);
+            __m256i sum2 = m256_dpbusd_epi32(input_vector256[0], row2[0]);
+            __m256i sum3 = m256_dpbusd_epi32(input_vector256[0], row3[0]);
+            const IndexType kStart = 1;
+#endif
+
+            for (IndexType j = kStart; j < kNumChunks256; ++j)
             {
               const __m256i in = input_vector256[j];
 
+#if defined (USE_VNNI)
               m256_add_dpbusd_epi32(sum0, in, row0[j]);
               m256_add_dpbusd_epi32(sum1, in, row1[j]);
               m256_add_dpbusd_epi32(sum2, in, row2[j]);
               m256_add_dpbusd_epi32(sum3, in, row3[j]);
+#else
+              sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
+              sum1 = _mm256_add_epi32(sum1, m256_dpbusd_epi32(in, row1[j]));
+              sum2 = _mm256_add_epi32(sum2, m256_dpbusd_epi32(in, row2[j]));
+              sum3 = _mm256_add_epi32(sum3, m256_dpbusd_epi32(in, row3[j]));
+#endif
             }
 
             *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias);
@@ -394,30 +435,50 @@ namespace Eval::NNUE::Layers {
       {
         if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0)
         {
-          __m512i sum0 = _mm512_setzero_si512();
-
           const auto row0 = reinterpret_cast<const __m512i*>(&weights_[0]);
 
-          for (IndexType j = 0; j < kNumChunks512; ++j)
+#if defined (USE_VNNI)
+          __m512i sum0 = _mm512_setzero_si512();
+          const IndexType kStart = 0;
+#else
+          __m512i sum0 = m512_dpbusd_epi32(input_vector512[0], row0[0]);
+          const IndexType kStart = 1;
+#endif
+
+          for (IndexType j = kStart; j < kNumChunks512; ++j)
           {
             const __m512i in = input_vector512[j];
 
+#if defined (USE_VNNI)
             m512_add_dpbusd_epi32(sum0, in, row0[j]);
+#else
+            sum0 = _mm512_add_epi32(sum0, m512_dpbusd_epi32(in, row0[j]));
+#endif
           }
 
           output[0] = m512_hadd(sum0, biases_[0]);
         }
         else
         {
-          __m256i sum0 = _mm256_setzero_si256();
-
           const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]);
 
-          for (IndexType j = 0; j < kNumChunks256; ++j)
+#if defined (USE_VNNI)
+          __m256i sum0 = _mm256_setzero_si256();
+          const IndexType kStart = 0;
+#else
+          __m256i sum0 = m256_dpbusd_epi32(input_vector256[0], row0[0]);
+          const IndexType kStart = 1;
+#endif
+
+          for (IndexType j = kStart; j < kNumChunks256; ++j)
           {
             const __m256i in = input_vector256[j];
 
+#if defined (USE_VNNI)
             m256_add_dpbusd_epi32(sum0, in, row0[j]);
+#else
+            sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
+#endif
           }
 
           output[0] = m256_hadd(sum0, biases_[0]);
@@ -451,24 +512,40 @@ namespace Eval::NNUE::Layers {
           const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
           __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
 
-          __m256i sum0 = _mm256_setzero_si256();
-          __m256i sum1 = _mm256_setzero_si256();
-          __m256i sum2 = _mm256_setzero_si256();
-          __m256i sum3 = _mm256_setzero_si256();
-
           const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]);
           const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]);
           const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]);
           const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]);
 
-          for (IndexType j = 0; j < kNumChunks; ++j)
+#if defined (USE_VNNI)
+          __m256i sum0 = _mm256_setzero_si256();
+          __m256i sum1 = _mm256_setzero_si256();
+          __m256i sum2 = _mm256_setzero_si256();
+          __m256i sum3 = _mm256_setzero_si256();
+          const IndexType kStart = 0;
+#else
+          __m256i sum0 = m256_dpbusd_epi32(input_vector[0], row0[0]);
+          __m256i sum1 = m256_dpbusd_epi32(input_vector[0], row1[0]);
+          __m256i sum2 = m256_dpbusd_epi32(input_vector[0], row2[0]);
+          __m256i sum3 = m256_dpbusd_epi32(input_vector[0], row3[0]);
+          const IndexType kStart = 1;
+#endif
+
+          for (IndexType j = kStart; j < kNumChunks; ++j)
           {
             const __m256i in = input_vector[j];
 
+#if defined (USE_VNNI)
             m256_add_dpbusd_epi32(sum0, in, row0[j]);
             m256_add_dpbusd_epi32(sum1, in, row1[j]);
             m256_add_dpbusd_epi32(sum2, in, row2[j]);
             m256_add_dpbusd_epi32(sum3, in, row3[j]);
+#else
+            sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
+            sum1 = _mm256_add_epi32(sum1, m256_dpbusd_epi32(in, row1[j]));
+            sum2 = _mm256_add_epi32(sum2, m256_dpbusd_epi32(in, row2[j]));
+            sum3 = _mm256_add_epi32(sum3, m256_dpbusd_epi32(in, row3[j]));
+#endif
           }
 
           *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias);
@@ -476,15 +553,25 @@ namespace Eval::NNUE::Layers {
       }
       else if constexpr (kOutputDimensions == 1)
       {
-        __m256i sum0 = _mm256_setzero_si256();
-
         const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]);
 
-        for (IndexType j = 0; j < kNumChunks; ++j)
+#if defined (USE_VNNI)
+        __m256i sum0 = _mm256_setzero_si256();
+        const IndexType kStart = 0;
+#else
+        __m256i sum0 = m256_dpbusd_epi32(input_vector[0], row0[0]);
+        const IndexType kStart = 1;
+#endif
+
+        for (IndexType j = kStart; j < kNumChunks; ++j)
         {
           const __m256i in = input_vector[j];
 
-            m256_add_dpbusd_epi32(sum0, in, row0[j]);
+#if defined (USE_VNNI)
+          m256_add_dpbusd_epi32(sum0, in, row0[j]);
+#else
+          sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
+#endif
         }
 
         output[0] = m256_hadd(sum0, biases_[0]);
@@ -517,24 +604,24 @@ namespace Eval::NNUE::Layers {
           const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
           __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
 
-          __m128i sum0 = _mm_setzero_si128();
-          __m128i sum1 = _mm_setzero_si128();
-          __m128i sum2 = _mm_setzero_si128();
-          __m128i sum3 = _mm_setzero_si128();
-
           const auto row0 = reinterpret_cast<const __m128i*>(&weights_[offset0]);
           const auto row1 = reinterpret_cast<const __m128i*>(&weights_[offset1]);
           const auto row2 = reinterpret_cast<const __m128i*>(&weights_[offset2]);
           const auto row3 = reinterpret_cast<const __m128i*>(&weights_[offset3]);
 
-          for (int j = 0; j < (int)kNumChunks; j += 1)
+          __m128i sum0 = m128_dpbusd_epi32(input_vector[0], row0[0]);
+          __m128i sum1 = m128_dpbusd_epi32(input_vector[0], row1[0]);
+          __m128i sum2 = m128_dpbusd_epi32(input_vector[0], row2[0]);
+          __m128i sum3 = m128_dpbusd_epi32(input_vector[0], row3[0]);
+
+          for (int j = 1; j < (int)kNumChunks; ++j)
           {
             const __m128i in = input_vector[j];
 
-            m128_add_dpbusd_epi32(sum0, in, row0[j]);
-            m128_add_dpbusd_epi32(sum1, in, row1[j]);
-            m128_add_dpbusd_epi32(sum2, in, row2[j]);
-            m128_add_dpbusd_epi32(sum3, in, row3[j]);
+            sum0 = _mm_add_epi32(sum0, m128_dpbusd_epi32(in, row0[j]));
+            sum1 = _mm_add_epi32(sum1, m128_dpbusd_epi32(in, row1[j]));
+            sum2 = _mm_add_epi32(sum2, m128_dpbusd_epi32(in, row2[j]));
+            sum3 = _mm_add_epi32(sum3, m128_dpbusd_epi32(in, row3[j]));
           }
 
           *outptr = m128_haddx4(sum0, sum1, sum2, sum3, bias);
@@ -542,16 +629,12 @@ namespace Eval::NNUE::Layers {
       }
       else if constexpr (kOutputDimensions == 1)
       {
-        __m128i sum0 = _mm_setzero_si128();
-
         const auto row0 = reinterpret_cast<const __m128i*>(&weights_[0]);
 
-        for (int j = 0; j < (int)kNumChunks; j += 1)
-        {
-          const __m128i in = input_vector[j];
+        __m128i sum0 = m128_dpbusd_epi32(input_vector[0], row0[0]);
 
-          m128_add_dpbusd_epi32(sum0, in, row0[j]);
-        }
+        for (int j = 1; j < (int)kNumChunks; ++j)
+          sum0 = _mm_add_epi32(sum0, m128_dpbusd_epi32(input_vector[j], row0[j]));
 
         output[0] = m128_hadd(sum0, biases_[0]);
       }

From 190dd26b9f1bc6442acf7b2ae4750eb4ab8b90bd Mon Sep 17 00:00:00 2001
From: Vizvezdenec <Vizvezdenec@gmail.com>
Date: Thu, 26 Nov 2020 06:38:09 +0100
Subject: [PATCH 354/398] use classical for certain endgames.

STC https://tests.stockfishchess.org/tests/view/5fbc64c067cbf42301d6b1d6
LLR: 2.97 (-2.94,2.94) {-0.25,1.25}
Total: 53360 W: 5223 L: 5024 D: 43113
Ptnml(0-2): 184, 3877, 18390, 4014, 215

LTC https://tests.stockfishchess.org/tests/view/5fbc97f267cbf42301d6b1ee
LLR: 2.96 (-2.94,2.94) {0.25,1.25}
Total: 126472 W: 5111 L: 4766 D: 116595
Ptnml(0-2): 50, 4032, 54749, 4333, 72

closes https://github.com/official-stockfish/Stockfish/pull/3240

bench: 3820648
---
 src/evaluate.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 3d887119..90d11a00 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -1035,12 +1035,14 @@ Value Eval::evaluate(const Position& pos) {
       bool  largePsq = psq * 16 > (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50;
       bool  classical = largePsq || (psq > PawnValueMg / 4 && !(pos.this_thread()->nodes & 0xB));
 
-      v = classical ? Evaluation<NO_TRACE>(pos).value() : adjusted_NNUE();
+      bool strongClassical = pos.non_pawn_material() < 2 * RookValueMg && pos.count<PAWN>() < 2;
+
+      v = classical || strongClassical ? Evaluation<NO_TRACE>(pos).value() : adjusted_NNUE();
 
       // If the classical eval is small and imbalance large, use NNUE nevertheless.
       // For the case of opposite colored bishops, switch to NNUE eval with
       // small probability if the classical eval is less than the threshold.
-      if (   largePsq
+      if (   largePsq && !strongClassical
           && (   abs(v) * 16 < NNUEThreshold2 * r50
               || (   pos.opposite_bishops()
                   && abs(v) * 16 < (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50

From 89294e2e4f44fdf3b4c3e38609c6c9b4c2a3c982 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 26 Nov 2020 17:28:09 +0100
Subject: [PATCH 355/398] Add transform command. Add transform nudged_static
 subcommand.

---
 src/Makefile            |   3 +-
 src/learn/sfen_stream.h |  10 ++
 src/learn/transform.cpp | 242 ++++++++++++++++++++++++++++++++++++++++
 src/learn/transform.h   |  12 ++
 src/uci.cpp             |   2 +
 5 files changed, 268 insertions(+), 1 deletion(-)
 create mode 100644 src/learn/transform.cpp
 create mode 100644 src/learn/transform.h

diff --git a/src/Makefile b/src/Makefile
index a5f5f06f..7f00bfff 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -64,7 +64,8 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	learn/learn.cpp \
 	learn/gensfen.cpp \
 	learn/opening_book.cpp \
-	learn/convert.cpp
+	learn/convert.cpp \
+	learn/transform.cpp
 
 OBJS = $(notdir $(SRCS:.cpp=.o))
 
diff --git a/src/learn/sfen_stream.h b/src/learn/sfen_stream.h
index d25dd41d..da411346 100644
--- a/src/learn/sfen_stream.h
+++ b/src/learn/sfen_stream.h
@@ -207,6 +207,16 @@ namespace Learner {
         assert(false);
         return nullptr;
     }
+
+    inline std::unique_ptr<BasicSfenOutputStream> create_new_sfen_output(const std::string& filename)
+    {
+        if (has_extension(filename, BinSfenOutputStream::extension))
+            return std::make_unique<BinSfenOutputStream>(filename);
+        else if (has_extension(filename, BinpackSfenOutputStream::extension))
+            return std::make_unique<BinpackSfenOutputStream>(filename);
+
+        return nullptr;
+    }
 }
 
 #endif
\ No newline at end of file
diff --git a/src/learn/transform.cpp b/src/learn/transform.cpp
new file mode 100644
index 00000000..5687b48b
--- /dev/null
+++ b/src/learn/transform.cpp
@@ -0,0 +1,242 @@
+#include "transform.h"
+
+#include "sfen_stream.h"
+#include "packed_sfen.h"
+
+#include "thread.h"
+#include "position.h"
+#include "evaluate.h"
+
+#include "nnue/evaluate_nnue.h"
+
+#include <string>
+#include <map>
+#include <iostream>
+#include <cmath>
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+
+namespace Learner
+{
+    using CommandFunc = void(*)(std::istringstream&);
+
+    enum struct NudgedStaticMode
+    {
+        Absolute,
+        Relative,
+        Interpolate
+    };
+
+    struct NudgedStaticParams
+    {
+        std::string input_filename = "in.binpack";
+        std::string output_filename = "out.binpack";
+        NudgedStaticMode mode = NudgedStaticMode::Absolute;
+        int absolute_nudge = 5;
+        float relative_nudge = 0.1;
+        float interpolate_nudge = 0.1;
+
+        void enforce_constraints()
+        {
+            relative_nudge = std::max(relative_nudge, 0.0f);
+            absolute_nudge = std::max(absolute_nudge, 0);
+        }
+    };
+
+    [[nodiscard]] std::int16_t nudge(NudgedStaticParams& params, std::int16_t static_eval_i16, std::int16_t deep_eval_i16)
+    {
+        auto saturate_i32_to_i16 = [](int v) {
+            return static_cast<std::int16_t>(
+                std::clamp(
+                    v,
+                    (int)std::numeric_limits<std::int16_t>::min(),
+                    (int)std::numeric_limits<std::int16_t>::max()
+                )
+            );
+        };
+
+        auto saturate_f32_to_i16 = [saturate_i32_to_i16](float v) {
+            return saturate_i32_to_i16((int)v);
+        };
+
+        int static_eval = static_eval_i16;
+        int deep_eval = deep_eval_i16;
+
+        switch(params.mode)
+        {
+            case NudgedStaticMode::Absolute:
+                return saturate_i32_to_i16(
+                    static_eval + std::clamp(
+                        deep_eval - static_eval,
+                        -params.absolute_nudge,
+                        params.absolute_nudge
+                    )
+                );
+
+            case NudgedStaticMode::Relative:
+                return saturate_f32_to_i16(
+                    (float)static_eval * std::clamp(
+                        (float)deep_eval / (float)static_eval,
+                        (1.0f - params.relative_nudge),
+                        (1.0f + params.relative_nudge)
+                    )
+                );
+
+            case NudgedStaticMode::Interpolate:
+                return saturate_f32_to_i16(
+                    (float)static_eval * (1.0f - params.interpolate_nudge)
+                    + (float)deep_eval * params.interpolate_nudge
+                );
+
+            default:
+                assert(false);
+                return 0;
+        }
+    }
+
+    void do_nudged_static(NudgedStaticParams& params)
+    {
+        Thread* th = Threads.main();
+        Position& pos = th->rootPos;
+        StateInfo si;
+
+        auto in = Learner::open_sfen_input_file(params.input_filename);
+        auto out = Learner::create_new_sfen_output(params.output_filename);
+
+        if (in == nullptr)
+        {
+            std::cerr << "Invalid input file type.\n";
+            return;
+        }
+
+        if (out == nullptr)
+        {
+            std::cerr << "Invalid output file type.\n";
+            return;
+        }
+
+        PSVector buffer;
+        uint64_t batch_size = 1'000'000;
+
+        buffer.reserve(batch_size);
+
+        uint64_t num_processed = 0;
+        for (;;)
+        {
+            auto v = in->next();
+            if (!v.has_value())
+                break;
+
+            auto& ps = v.value();
+
+            pos.set_from_packed_sfen(ps.sfen, &si, th);
+            auto static_eval = Eval::evaluate(pos);
+            auto deep_eval = ps.score;
+            ps.score = nudge(params, static_eval, deep_eval);
+
+            buffer.emplace_back(ps);
+            if (buffer.size() >= batch_size)
+            {
+                num_processed += buffer.size();
+
+                out->write(buffer);
+                buffer.clear();
+
+                std::cout << "Processed " << num_processed << " positions.\n";
+            }
+        }
+
+        if (!buffer.empty())
+        {
+            num_processed += buffer.size();
+
+            out->write(buffer);
+            buffer.clear();
+
+            std::cout << "Processed " << num_processed << " positions.\n";
+        }
+
+        std::cout << "Finished.\n";
+    }
+
+    void nudged_static(std::istringstream& is)
+    {
+        NudgedStaticParams params{};
+
+        while(true)
+        {
+            std::string token;
+            is >> token;
+
+            if (token == "")
+                break;
+
+            if (token == "absolute")
+            {
+                params.mode = NudgedStaticMode::Absolute;
+                is >> params.absolute_nudge;
+            }
+            else if (token == "relative")
+            {
+                params.mode = NudgedStaticMode::Relative;
+                is >> params.relative_nudge;
+            }
+            else if (token == "interpolate")
+            {
+                params.mode = NudgedStaticMode::Interpolate;
+                is >> params.interpolate_nudge;
+            }
+            else if (token == "input_file")
+                is >> params.input_filename;
+            else if (token == "output_file")
+                is >> params.output_filename;
+        }
+
+        std::cout << "Performing transform nudged_static with parameters:\n";
+        std::cout << "input_file          : " << params.input_filename << '\n';
+        std::cout << "output_file         : " << params.output_filename << '\n';
+        std::cout << "\n";
+        if (params.mode == NudgedStaticMode::Absolute)
+        {
+            std::cout << "mode                : absolute\n";
+            std::cout << "absolute_nudge      : " << params.absolute_nudge << '\n';
+        }
+        else if (params.mode == NudgedStaticMode::Relative)
+        {
+            std::cout << "mode                : relative\n";
+            std::cout << "relative_nudge      : " << params.relative_nudge << '\n';
+        }
+        else if (params.mode == NudgedStaticMode::Interpolate)
+        {
+            std::cout << "mode                : interpolate\n";
+            std::cout << "interpolate_nudge   : " << params.interpolate_nudge << '\n';
+        }
+        std::cout << '\n';
+
+        params.enforce_constraints();
+        do_nudged_static(params);
+    }
+
+    void transform(std::istringstream& is)
+    {
+        const std::map<std::string, CommandFunc> subcommands = {
+            { "nudged_static", &nudged_static }
+        };
+
+        Eval::NNUE::init();
+
+        std::string subcommand;
+        is >> subcommand;
+
+        auto func = subcommands.find(subcommand);
+        if (func == subcommands.end())
+        {
+            std::cout << "Invalid subcommand " << subcommand << ". Exiting...\n";
+            return;
+        }
+
+        func->second(is);
+    }
+
+}
diff --git a/src/learn/transform.h b/src/learn/transform.h
new file mode 100644
index 00000000..8a6921a0
--- /dev/null
+++ b/src/learn/transform.h
@@ -0,0 +1,12 @@
+#ifndef _TRANSFORM_H_
+#define _TRANSFORM_H_
+
+#include <sstream>
+
+namespace Learner {
+
+    void transform(std::istringstream& is);
+
+}
+
+#endif
diff --git a/src/uci.cpp b/src/uci.cpp
index ae21a3ae..8e64da6b 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -38,6 +38,7 @@
 #include "learn/gensfen.h"
 #include "learn/learn.h"
 #include "learn/convert.h"
+#include "learn/transform.h"
 
 using namespace std;
 
@@ -345,6 +346,7 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "convert_bin") Learner::convert_bin(is);
       else if (token == "convert_plain") Learner::convert_plain(is);
       else if (token == "convert_bin_from_pgn_extract") Learner::convert_bin_from_pgn_extract(is);
+      else if (token == "transform") Learner::transform(is);
 
       // Command to call qsearch(),search() directly for testing
       else if (token == "qsearch") qsearch_cmd(pos);

From 92b14a5ba2310ea5285d3f5987c5bc247c715860 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 26 Nov 2020 18:06:00 +0100
Subject: [PATCH 356/398] Add docs for transform.

---
 docs/transform.md | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 docs/transform.md

diff --git a/docs/transform.md b/docs/transform.md
new file mode 100644
index 00000000..82e963fe
--- /dev/null
+++ b/docs/transform.md
@@ -0,0 +1,21 @@
+# Transform
+
+`transform` command exposes subcommands that perform some specific transformation over data. The call syntax is `transform <subcommand>`. Currently implemented subcommands are listed and described below.
+
+## `nudged_static`
+
+`transform nudged_static` takes named parameters in the form of `gensfen param_1_name param_1_value param_2_name param_2_value ...` and flag parameters which don't require values.
+
+This command goes through positions in the input files and replaces the scores with new ones - generated from static eval - but slightly adjusted based on the scores in the original input file.
+
+Currently the following options are available:
+
+`input_file` - path to the input file. Supports bin and binpack formats. Default: in.binpack.
+
+`output_file` - path to the output file. Supports bin and binpack formats. Default: out.binpack.
+
+`absolute` - states that the adjustment should be bounded by an absolute value. After this token follows the maximum absolute adjustment. Values are always adjusted towards scores in the input file. This is the default mode. Default maximum adjustement: 5.
+
+`relative` - states that the adjustment should be bounded by a value relative in magnitude to the static eval value. After this token follows the maximum relative change - a floating point value greater than 0. For example a value of 0.1 only allows changing the static eval by at most 10% towards the score from the input file.
+
+`interpolate` states that the output score should be a value interpolated between static eval and the score from the input file. After this token follows the interpolation constant `t`. `t` of 0 means that only static eval is used. `t` of 1 means that only score from the input file is used. `t` of 0.5 means that the static eval and input score are averaged. It accepts values outside of range `<0, 1>`, but the usefulness is questionable.

From 4ea8572b6d8fdbd092c94954c78a6b0a47289083 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 22 Nov 2020 18:27:51 +0100
Subject: [PATCH 357/398] Add single threaded sgemm.

---
 src/extra/stockfish_blas.cpp | 290 +++++++++++++++++++++++++++++++++++
 src/extra/stockfish_blas.h   |  10 ++
 2 files changed, 300 insertions(+)

diff --git a/src/extra/stockfish_blas.cpp b/src/extra/stockfish_blas.cpp
index 0ba40b49..109a4b44 100644
--- a/src/extra/stockfish_blas.cpp
+++ b/src/extra/stockfish_blas.cpp
@@ -546,6 +546,156 @@ namespace Blas {
         );
         thread_pool.wait_for_workers_finished();
 
+#endif
+    }
+
+    void sgemm_row_major_transpose_right(
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    )
+    {
+
+#if defined(USE_SSE3)
+
+        const __m128 alpha4 = _mm_set1_ps(alpha);
+        const __m128 beta4 = _mm_set1_ps(beta);
+
+        for (int m = 0; m < M - 1; m += 2)
+        {
+            int n = 0;
+            for (; n < N - 3; n += 4)
+            {
+                //        mn
+                __m128 sum00 = _mm_setzero_ps();
+                __m128 sum01 = _mm_setzero_ps();
+                __m128 sum02 = _mm_setzero_ps();
+                __m128 sum03 = _mm_setzero_ps();
+                __m128 sum10 = _mm_setzero_ps();
+                __m128 sum11 = _mm_setzero_ps();
+                __m128 sum12 = _mm_setzero_ps();
+                __m128 sum13 = _mm_setzero_ps();
+
+                // Horizontal sum of elements in sum[m][n] corresponds to
+                // the final element in the C.
+
+                int k = 0;
+                for (; k < K - 3; k += 4)
+                {
+                    const __m128 a0 = _mm_loadu_ps(&A[(m+0)*lda+k+0]);
+                    const __m128 a1 = _mm_loadu_ps(&A[(m+1)*lda+k+0]);
+
+                    const __m128 b0 = _mm_loadu_ps(&B[(n+0)*ldb+k+0]);
+                    const __m128 b1 = _mm_loadu_ps(&B[(n+1)*ldb+k+0]);
+                    const __m128 b2 = _mm_loadu_ps(&B[(n+2)*ldb+k+0]);
+                    const __m128 b3 = _mm_loadu_ps(&B[(n+3)*ldb+k+0]);
+
+                    sum00 = _mm_add_ps(sum00, _mm_mul_ps(a0, b0));
+                    sum01 = _mm_add_ps(sum01, _mm_mul_ps(a0, b1));
+                    sum02 = _mm_add_ps(sum02, _mm_mul_ps(a0, b2));
+                    sum03 = _mm_add_ps(sum03, _mm_mul_ps(a0, b3));
+                    sum10 = _mm_add_ps(sum10, _mm_mul_ps(a1, b0));
+                    sum11 = _mm_add_ps(sum11, _mm_mul_ps(a1, b1));
+                    sum12 = _mm_add_ps(sum12, _mm_mul_ps(a1, b2));
+                    sum13 = _mm_add_ps(sum13, _mm_mul_ps(a1, b3));
+                }
+
+                for(; k < K; k += 1)
+                {
+                    const float a0 = A[(m+0)*lda+k+0];
+                    const float a1 = A[(m+1)*lda+k+0];
+
+                    const float b0 = B[(n+0)*ldb+k+0];
+                    const float b1 = B[(n+1)*ldb+k+0];
+                    const float b2 = B[(n+2)*ldb+k+0];
+                    const float b3 = B[(n+3)*ldb+k+0];
+
+                    // Since all will be summed vertically anyway we can
+                    // just add to the first element.
+                    // Other elements are left unmodified.
+                    sum00 = _mm_add_ss(sum00, _mm_set_ss(a0 * b0));
+                    sum01 = _mm_add_ss(sum01, _mm_set_ss(a0 * b1));
+                    sum02 = _mm_add_ss(sum02, _mm_set_ss(a0 * b2));
+                    sum03 = _mm_add_ss(sum03, _mm_set_ss(a0 * b3));
+                    sum10 = _mm_add_ss(sum10, _mm_set_ss(a1 * b0));
+                    sum11 = _mm_add_ss(sum11, _mm_set_ss(a1 * b1));
+                    sum12 = _mm_add_ss(sum12, _mm_set_ss(a1 * b2));
+                    sum13 = _mm_add_ss(sum13, _mm_set_ss(a1 * b3));
+                }
+
+                __m128 s0 = m128_hadd_ps(sum00, sum01, sum02, sum03);
+                __m128 s1 = m128_hadd_ps(sum10, sum11, sum12, sum13);
+                s0 = _mm_mul_ps(s0, alpha4);
+                s1 = _mm_mul_ps(s1, alpha4);
+
+                __m128 c0 = _mm_loadu_ps(&C[(m+0)*ldc+(n+0)]);
+                __m128 c1 = _mm_loadu_ps(&C[(m+1)*ldc+(n+0)]);
+                c0 = _mm_mul_ps(c0, beta4);
+                c1 = _mm_mul_ps(c1, beta4);
+
+                c0 = _mm_add_ps(c0, s0);
+                c1 = _mm_add_ps(c1, s1);
+
+                _mm_storeu_ps(&C[(m+0)*ldc+(n+0)], c0);
+                _mm_storeu_ps(&C[(m+1)*ldc+(n+0)], c1);
+            }
+
+            for(; n < N; n += 1)
+            {
+                float sum0 = 0.0f;
+                float sum1 = 0.0f;
+
+                for (int k = 0; k < K; ++k)
+                {
+                    const float a0 = A[(m+0)*lda+k+0];
+                    const float a1 = A[(m+1)*lda+k+0];
+
+                    const float b0 = B[(n+0)*ldb+k+0];
+
+                    sum0 += a0 * b0;
+                    sum1 += a1 * b0;
+                }
+
+                C[(m+0)*ldc+(n+0)] = C[(m+0)*ldc+(n+0)] * beta + sum0 * alpha;
+                C[(m+1)*ldc+(n+0)] = C[(m+1)*ldc+(n+0)] * beta + sum1 * alpha;
+            }
+        }
+
+        for (; m < M; m += 1)
+        {
+            for (int n = 0; n < N; n += 1)
+            {
+                float sum = 0.0f;
+
+                for (int k = 0; k < K; k += 1)
+                {
+                    sum += A[m*lda + k] * B[n*ldb + k];
+                }
+
+                C[m*ldc + n] = C[m*ldc + n] * beta + sum * alpha;
+            }
+        }
+
+#else
+
+        for (int m = 0; m < M; m += 1)
+        {
+            for (int n = 0; n < N; n += 1)
+            {
+                float sum = 0.0f;
+
+                for (int k = 0; k < K; k += 1)
+                {
+                    sum += A[m*lda + k] * B[n*ldb + k];
+                }
+
+                C[m*ldc + n] = C[m*ldc + n] * beta + sum * alpha;
+            }
+        }
+
 #endif
     }
 
@@ -605,6 +755,35 @@ namespace Blas {
         );
     }
 
+    void sgemm_row_major_transpose_none(
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    )
+    {
+        constexpr static int temporary_buffer_index = 1;
+
+        auto B_tr = get_thread_local_temporary_storage(K * N, temporary_buffer_index);
+
+        transpose(
+            K, N,
+            B, ldb,
+            B_tr, K
+        );
+
+        sgemm_row_major_transpose_right(
+            M, N, K,
+            alpha,
+            A, lda,
+            B_tr, K,
+            beta,
+            C, ldc
+        );
+    }
+
     void sgemm_row_major(
         ThreadPool& thread_pool,
         MatrixTranspose TransA, MatrixTranspose TransB,
@@ -684,6 +863,80 @@ namespace Blas {
         }
     }
 
+    void sgemm_row_major(
+        MatrixTranspose TransA, MatrixTranspose TransB,
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    )
+    {
+        constexpr static int temporary_buffer_index = 0;
+
+        if (TransA == MatrixTranspose::Trans && TransB == MatrixTranspose::Trans)
+        {
+            auto A_tr = get_thread_local_temporary_storage(K * M, temporary_buffer_index);
+
+            transpose(
+                K, M,
+                A, lda,
+                A_tr, K
+            );
+
+            sgemm_row_major_transpose_right(
+                M, N, K,
+                alpha,
+                A_tr, K,
+                B, ldb,
+                beta,
+                C, ldc
+            );
+        }
+        else if (TransA == MatrixTranspose::NoTrans && TransB == MatrixTranspose::Trans)
+        {
+            sgemm_row_major_transpose_right(
+                M, N, K,
+                alpha,
+                A, lda,
+                B, ldb,
+                beta,
+                C, ldc
+            );
+        }
+        else if (TransA == MatrixTranspose::Trans && TransB == MatrixTranspose::NoTrans)
+        {
+            auto A_tr = get_thread_local_temporary_storage(K * M, temporary_buffer_index);
+
+            transpose(
+                K, M,
+                A, lda,
+                A_tr, K
+            );
+
+            sgemm_row_major_transpose_none(
+                M, N, K,
+                alpha,
+                A_tr, K,
+                B, ldb,
+                beta,
+                C, ldc
+            );
+        }
+        else // no transpositions
+        {
+            sgemm_row_major_transpose_none(
+                M, N, K,
+                alpha,
+                A, lda,
+                B, ldb,
+                beta,
+                C, ldc
+            );
+        }
+    }
+
     void sgemm(
         ThreadPool& thread_pool,
         MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
@@ -723,6 +976,43 @@ namespace Blas {
         }
     }
 
+
+    void sgemm(
+        MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    )
+    {
+        if (layout == MatrixLayout::RowMajor)
+        {
+            sgemm_row_major(
+                TransA, TransB,
+                M, N, K,
+                alpha,
+                A, lda,
+                B, ldb,
+                beta,
+                C, ldc
+            );
+        }
+        else
+        {
+            sgemm_row_major(
+                TransB, TransA,
+                N, M, K,
+                alpha,
+                B, ldb,
+                A, lda,
+                beta,
+                C, ldc
+            );
+        }
+    }
+
     std::vector<float> generate_random_matrix(int rows, int cols)
     {
         std::vector<float> m(rows * cols);
diff --git a/src/extra/stockfish_blas.h b/src/extra/stockfish_blas.h
index 65da7e99..f551bbf2 100644
--- a/src/extra/stockfish_blas.h
+++ b/src/extra/stockfish_blas.h
@@ -118,6 +118,16 @@ namespace Blas {
         float * SF_BLAS_RESTRICT C, const int ldc
     );
 
+    void sgemm(
+        MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    );
+
     void test(
         ThreadPool& thread_pool
     );

From 0d4b803b08af87a1a264d196c9d3762c1acb1aeb Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 22 Nov 2020 19:23:30 +0100
Subject: [PATCH 358/398] Prepare trainer affine transform.

---
 src/nnue/trainer/trainer_affine_transform.h | 215 +++++++++++++-------
 1 file changed, 142 insertions(+), 73 deletions(-)

diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index 610805ca..f66f1a65 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -91,19 +91,52 @@ namespace Eval::NNUE {
             quantize_parameters();
         }
 
-        // forward propagation
-        const LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
-            if (output_.size() < kOutputDimensions * batch.size()) {
-                output_.resize(kOutputDimensions * batch.size());
-                gradients_.resize(kInputDimensions * batch.size());
+        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
+        {
+            if (output_.size() < kOutputDimensions * combined_batch.size()) {
+                output_.resize(kOutputDimensions * combined_batch.size());
+                gradients_.resize(kInputDimensions * combined_batch.size());
             }
 
-            batch_size_ = static_cast<IndexType>(batch.size());
-            batch_input_ = previous_layer_trainer_->propagate(thread_pool, batch);
+            if (thread_states_.size() < thread_pool.size())
+            {
+                thread_states_.resize(thread_pool.size());
+            }
+
+            combined_batch_size_ = static_cast<IndexType>(combined_batch.size());
+            combined_batch_input_ = previous_layer_trainer_->step_start(thread_pool, combined_batch);
+
+            auto& main_thread_state = thread_states_[0];
 
 #if defined(USE_BLAS)
 
-            for (IndexType b = 0; b < batch_size_; ++b) {
+            // update
+            cblas_sscal(
+                kOutputDimensions, momentum_, main_thread_state.biases_diff_, 1
+            );
+
+#else
+
+            Blas::sscal(
+                kOutputDimensions, momentum_, main_thread_state.biases_diff_, 1
+            );
+
+#endif
+
+            for (IndexType i = 1; i < thread_states_.size(); ++i)
+                thread_states_[i].reset_biases();
+
+            return output_.data();
+        }
+
+        // forward propagation
+        void propagate(Thread& th, const uint64_t offset, const uint64_t count) {
+
+            previous_layer_trainer_->propagate(th, offset, count);
+
+#if defined(USE_BLAS)
+
+            for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 cblas_scopy(
                     kOutputDimensions, biases_, 1, &output_[batch_offset], 1
@@ -112,149 +145,151 @@ namespace Eval::NNUE {
 
             cblas_sgemm(
                 CblasColMajor, CblasTrans, CblasNoTrans,
-                kOutputDimensions, batch_size_, kInputDimensions,
+                kOutputDimensions, count, kInputDimensions,
                 1.0,
                 weights_, kInputDimensions,
-                batch_input_, kInputDimensions,
+                combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
                 1.0,
-                &output_[0], kOutputDimensions
+                &output_[offset * kOutputDimensions], kOutputDimensions
             );
 #else
 
-            for (IndexType b = 0; b < batch_size_; ++b) {
+            for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 Blas::scopy(
-                    thread_pool,
                     kOutputDimensions, biases_, 1, &output_[batch_offset], 1
                 );
             }
 
             Blas::sgemm(
-                thread_pool,
                 Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
-                kOutputDimensions, batch_size_, kInputDimensions,
+                kOutputDimensions, count, kInputDimensions,
                 1.0,
                 weights_, kInputDimensions,
-                batch_input_, kInputDimensions,
+                combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
                 1.0,
-                &output_[0], kOutputDimensions
+                &output_[offset * kOutputDimensions], kOutputDimensions
             );
 
 #endif
-            return output_.data();
         }
 
         // backpropagation
-        void backpropagate(ThreadPool& thread_pool,
+        void backpropagate(Thread& th,
                            const LearnFloatType* gradients,
-                           LearnFloatType learning_rate) {
-
-            const LearnFloatType local_learning_rate =
-                learning_rate * learning_rate_scale_;
+                           uint64_t offset,
+                           uint64_t count) {
 
+            auto& thread_state = thread_states_[th.thread_idx()];
+            const auto momentum = th.thread_idx() == 0 ? momentum_ : 0.0f;
 #if defined(USE_BLAS)
 
             cblas_sgemm(
                 CblasColMajor, CblasNoTrans, CblasNoTrans,
-                kInputDimensions, batch_size_, kOutputDimensions,
+                kInputDimensions, count, kOutputDimensions,
                 1.0,
                 weights_, kInputDimensions,
-                gradients, kOutputDimensions,
+                gradients + offset * kOutputDimensions, kOutputDimensions,
                 0.0,
-                &gradients_[0], kInputDimensions
+                &gradients_[offset * kInputDimensions], kInputDimensions
             );
 
-            // update
-            cblas_sscal(
-                kOutputDimensions, momentum_, biases_diff_, 1
-            );
-
-            for (IndexType b = 0; b < batch_size_; ++b) {
+            for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 cblas_saxpy(
                     kOutputDimensions, 1.0,
-                    &gradients[batch_offset], 1, biases_diff_, 1
+                    &gradients[batch_offset], 1, thread_state.biases_diff_, 1
                 );
             }
 
             cblas_sgemm(
                 CblasRowMajor, CblasTrans, CblasNoTrans,
-                kOutputDimensions, kInputDimensions, batch_size_,
+                kOutputDimensions, kInputDimensions, count,
                 1.0,
-                gradients, kOutputDimensions,
-                batch_input_, kInputDimensions,
-                momentum_,
-                weights_diff_, kInputDimensions
+                gradients + offset * kOutputDimensions, kOutputDimensions,
+                combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
+                momentum,
+                thread_state.weights_diff_, kInputDimensions
             );
 
 #else
 
             // backpropagate
             Blas::sgemm(
-                thread_pool,
                 Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::NoTrans, Blas::MatrixTranspose::NoTrans,
-                kInputDimensions, batch_size_, kOutputDimensions,
+                kInputDimensions, count, kOutputDimensions,
                 1.0,
                 weights_, kInputDimensions,
-                gradients, kOutputDimensions,
+                gradients + offset * kOutputDimensions, kOutputDimensions,
                 0.0,
-                &gradients_[0], kInputDimensions
+                &gradients_[offset * kInputDimensions], kInputDimensions
             );
 
-
-            Blas::sscal(
-                thread_pool,
-                kOutputDimensions, momentum_, biases_diff_, 1
-            );
-
-            for (IndexType b = 0; b < batch_size_; ++b) {
+            for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
-                Blas::saxpy(thread_pool, kOutputDimensions, 1.0,
-                          &gradients[batch_offset], 1, biases_diff_, 1);
+                Blas::saxpy(kOutputDimensions, 1.0,
+                          &gradients[batch_offset], 1, thread_state.biases_diff_, 1);
             }
 
             Blas::sgemm(
-                thread_pool,
                 Blas::MatrixLayout::RowMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
-                kOutputDimensions, kInputDimensions, batch_size_,
+                kOutputDimensions, kInputDimensions, count,
                 1.0,
-                gradients, kOutputDimensions,
-                batch_input_, kInputDimensions,
-                momentum_,
-                weights_diff_, kInputDimensions
+                gradients + offset * kOutputDimensions, kOutputDimensions,
+                combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
+                momentum,
+                thread_state.weights_diff_, kInputDimensions
             );
 
 #endif
 
+            previous_layer_trainer_->backpropagate(th, gradients_.data(), offset, count);
+        }
+
+        void reduce_thread_state()
+        {
+            for (IndexType i = 1; i < thread_states_.size(); ++i)
+            {
+                thread_states_[0] += thread_states_[i];
+            }
+        }
+
+        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate)
+        {
+            const LearnFloatType local_learning_rate =
+                learning_rate * learning_rate_scale_;
+
+            reduce_thread_state();
+
+            auto& main_thread_state = thread_states_[0];
+
             for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                const double d = local_learning_rate * biases_diff_[i];
+                const double d = local_learning_rate * main_thread_state.biases_diff_[i];
                 biases_[i] -= d;
                 abs_biases_diff_sum_ += std::abs(d);
             }
             num_biases_diffs_ += kOutputDimensions;
 
             for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-                const double d = local_learning_rate * weights_diff_[i];
+                const double d = local_learning_rate * main_thread_state.weights_diff_[i];
                 weights_[i] -= d;
                 abs_weights_diff_sum_ += std::abs(d);
             }
             num_weights_diffs_ += kOutputDimensions * kInputDimensions;
 
-            previous_layer_trainer_->backpropagate(thread_pool, gradients_.data(), learning_rate);
+            previous_layer_trainer_->step_end(thread_pool, learning_rate);
         }
 
     private:
         // constructor
         Trainer(LayerType* target_layer, FeatureTransformer* ft) :
-            batch_size_(0),
-            batch_input_(nullptr),
+            combined_batch_size_(0),
+            combined_batch_input_(nullptr),
             previous_layer_trainer_(Trainer<PreviousLayer>::create(
                 &target_layer->previous_layer_, ft)),
             target_layer_(target_layer),
             biases_(),
             weights_(),
-            biases_diff_(),
-            weights_diff_(),
             momentum_(0.2),
             learning_rate_scale_(1.0) {
 
@@ -335,10 +370,12 @@ namespace Eval::NNUE {
                 }
             }
 
-            std::fill(std::begin(biases_diff_), std::end(biases_diff_),
-                      static_cast<LearnFloatType>(0.0));
-            std::fill(std::begin(weights_diff_), std::end(weights_diff_),
-                      static_cast<LearnFloatType>(0.0));
+            for (auto& state : thread_states_)
+            {
+                state.reset_weights();
+                state.reset_biases();
+            }
+
 
             reset_stats();
         }
@@ -365,7 +402,7 @@ namespace Eval::NNUE {
             std::numeric_limits<typename LayerType::WeightType>::max() / kWeightScale;
 
         // number of samples in mini-batch
-        IndexType batch_size_;
+        IndexType combined_batch_size_;
 
         double abs_biases_diff_sum_;
         double abs_weights_diff_sum_;
@@ -373,7 +410,7 @@ namespace Eval::NNUE {
         uint64_t num_weights_diffs_;
 
         // Input mini batch
-        const LearnFloatType* batch_input_;
+        const LearnFloatType* combined_batch_input_;
 
         // Trainer of the previous layer
         const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
@@ -382,12 +419,44 @@ namespace Eval::NNUE {
         LayerType* const target_layer_;
 
         // parameter
+        struct alignas(kCacheLineSize) ThreadState
+        {
+            // Buffer used for updating parameters
+            alignas(kCacheLineSize) LearnFloatType biases_diff_[kOutputDimensions];
+            alignas(kCacheLineSize) LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
+
+            ThreadState() { reset_weights(); reset_biases(); }
+
+            ThreadState& operator+=(const ThreadState& other)
+            {
+                for (IndexType i = 0; i < kOutputDimensions; ++i)
+                {
+                    biases_diff_[i] += other.biases_diff_[i];
+                }
+
+                for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i)
+                {
+                    weights_diff_[i] += other.weights_diff_[i];
+                }
+
+                return *this;
+            }
+
+            void reset_weights()
+            {
+                std::fill(std::begin(weights_diff_), std::end(weights_diff_), 0.0f);
+            }
+
+            void reset_biases()
+            {
+                std::fill(std::begin(biases_diff_), std::end(biases_diff_), 0.0f);
+            }
+        };
+
         alignas(kCacheLineSize) LearnFloatType biases_[kOutputDimensions];
         alignas(kCacheLineSize) LearnFloatType weights_[kOutputDimensions * kInputDimensions];
 
-        // Buffer used for updating parameters
-        alignas(kCacheLineSize) LearnFloatType biases_diff_[kOutputDimensions];
-        alignas(kCacheLineSize) LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
+        std::vector<ThreadState, CacheLineAlignedAllocator<ThreadState>> thread_states_;
 
         // Forward propagation buffer
         std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;

From cc11375f6df3186ec6090c671ddcbed45e8bc55a Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 22 Nov 2020 19:46:43 +0100
Subject: [PATCH 359/398] Skeleton for new evaluate learner

---
 src/nnue/evaluate_nnue_learner.cpp | 69 +++++++++++++++++++++---------
 1 file changed, 49 insertions(+), 20 deletions(-)

diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 4104fef5..644ac9a4 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -54,6 +54,12 @@ namespace Eval::NNUE {
         const std::string& seed,
         SynchronizedRegionLogger::Region& out) {
 
+#if defined (OPENBLAS_VERSION)
+        openblas_set_num_threads(1);
+#elif defined (INTEL_MKL_VERSION)
+        mkl_set_num_threads(1);
+#endif
+
         out << "INFO (initialize_training): Initializing NN training for "
             << get_architecture_string() << std::endl;
 
@@ -199,39 +205,62 @@ namespace Eval::NNUE {
 
         bool collect_stats = verbose;
 
+        std::vector<double> abs_eval_diff_sum_local(thread_pool.size(), 0.0);
+        std::vector<double> abs_discrete_eval_sum_local(thread_pool.size(), 0.0);
+        std::vector<double> gradient_norm_local(thread_pool.size(), 0.0);
+
         while (examples.size() >= batch_size) {
             std::vector<Example> batch(examples.end() - batch_size, examples.end());
             examples.resize(examples.size() - batch_size);
 
-            const auto network_output = trainer->propagate(thread_pool, batch);
-
+            const auto network_output = trainer->step_start(thread_pool, batch);
             std::vector<LearnFloatType> gradients(batch.size());
-            for (std::size_t b = 0; b < batch.size(); ++b) {
-                const auto shallow = static_cast<Value>(round<std::int32_t>(
-                    batch[b].sign * network_output[b] * kPonanzaConstant));
-                const auto discrete = batch[b].sign * batch[b].discrete_nn_eval;
-                const auto& psv = batch[b].psv;
-                const double gradient =
-                    batch[b].sign * calc_grad(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
-                gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
+
+            thread_pool.for_each_index_chunk_with_workers(
+                std::size_t(0), batch.size(),
+                [&](Thread& th, std::size_t offset, std::size_t count) {
+                    const auto thread_id = th.thread_idx();
+
+                    trainer->propagate(th, offset, count);
+
+                    for (std::size_t b = offset; b < offset + count; ++b) {
+                        const auto shallow = static_cast<Value>(round<std::int32_t>(
+                            batch[b].sign * network_output[b] * kPonanzaConstant));
+                        const auto discrete = batch[b].sign * batch[b].discrete_nn_eval;
+                        const auto& psv = batch[b].psv;
+                        const double gradient =
+                            batch[b].sign * calc_grad(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
+                        gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
 
 
-                // The discrete eval will only be valid before first backpropagation,
-                // that is only for the first batch.
-                // Similarily we want only gradients from one batch.
-                if (collect_stats)
-                {
-                    abs_eval_diff_sum += std::abs(discrete - shallow);
-                    abs_discrete_eval_sum += std::abs(discrete);
-                    gradient_norm += std::abs(gradient);
+                        // The discrete eval will only be valid before first backpropagation,
+                        // that is only for the first batch.
+                        // Similarily we want only gradients from one batch.
+                        if (collect_stats)
+                        {
+                            abs_eval_diff_sum_local[thread_id] += std::abs(discrete - shallow);
+                            abs_discrete_eval_sum_local[thread_id] += std::abs(discrete);
+                            gradient_norm_local[thread_id] += std::abs(gradient);
+                        }
+                    }
+
+                    trainer->backpropagate(th, gradients.data(), offset, count);
                 }
-            }
+            );
+            thread_pool.wait_for_workers_finished();
 
-            trainer->backpropagate(thread_pool, gradients.data(), learning_rate);
+            trainer->step_end(thread_pool, learning_rate);
 
             collect_stats = false;
         }
 
+        if (verbose)
+        {
+            abs_eval_diff_sum = std::accumulate(abs_eval_diff_sum_local.begin(), abs_eval_diff_sum_local.end(), 0.0);
+            abs_discrete_eval_sum = std::accumulate(abs_discrete_eval_sum_local.begin(), abs_discrete_eval_sum_local.end(), 0.0);
+            gradient_norm = std::accumulate(gradient_norm_local.begin(), gradient_norm_local.end(), 0.0);
+        }
+
         if (verbose) {
             const double avg_abs_eval_diff = abs_eval_diff_sum / batch_size;
             const double avg_abs_discrete_eval = abs_discrete_eval_sum / batch_size;

From 774b02364121b23ae13be862324fbebc59f357af Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 22 Nov 2020 19:53:33 +0100
Subject: [PATCH 360/398] Add chunked for each with workers.

---
 src/thread.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/src/thread.h b/src/thread.h
index 0d0d7fea..83ba2f33 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -155,6 +155,31 @@ struct ThreadPool : public std::vector<Thread*> {
       });
   }
 
+  template <typename IndexT, typename FuncT>
+  void for_each_index_chunk_with_workers(
+    IndexT begin,
+    typename Detail::TypeIdentity<IndexT>::Type end,
+    FuncT func)
+  {
+    // This value must outlive the function call.
+    // It's fairly safe if we make it static
+    // because for_each_index_with_workers
+    // is not reentrant nor thread safe.
+    const IndexT size = end - begin;
+    const IndexT chunk_size = (size + this->size()) / this->size();
+
+    execute_with_workers(
+      [chunk_size, end, func](Thread& th) mutable {
+        const IndexT thread_id = th.thread_idx();
+        const IndexT offset = chunk_size * thread_id;
+        if (offset >= end)
+          return;
+
+        const IndexT count = offset + chunk_size > end ? end - offset : chunk_size;
+        func(th, offset, count);
+      });
+  }
+
   void start_thinking(Position&, StateListPtr&, const Search::LimitsType&, bool = false);
   void clear();
   void set(size_t);

From 401fc0fbab085f75a1cde793dc3a0b6ded13bafb Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 22 Nov 2020 20:16:49 +0100
Subject: [PATCH 361/398] Prepare clipped relu trainer.

---
 src/nnue/trainer/trainer_clipped_relu.h | 233 +++++++++++++++---------
 1 file changed, 150 insertions(+), 83 deletions(-)

diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index 124671ed..e4bcecaf 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -42,16 +42,31 @@ namespace Eval::NNUE {
             previous_layer_trainer_->initialize(rng);
         }
 
-        // forward propagation
-        const LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
-            if (output_.size() < kOutputDimensions * batch.size()) {
-              output_.resize(kOutputDimensions * batch.size());
-              gradients_.resize(kInputDimensions * batch.size());
+        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
+        {
+            if (output_.size() < kOutputDimensions * combined_batch.size()) {
+              output_.resize(kOutputDimensions * combined_batch.size());
+              gradients_.resize(kInputDimensions * combined_batch.size());
             }
 
-            const auto input = previous_layer_trainer_->propagate(thread_pool, batch);
+            if (thread_states_.size() < thread_pool.size())
+            {
+                thread_states_.resize(thread_pool.size());
+            }
 
-            batch_size_ = static_cast<IndexType>(batch.size());
+            input_ = previous_layer_trainer_->step_start(thread_pool, combined_batch);
+
+            batch_size_ = static_cast<IndexType>(combined_batch.size());
+
+            return output_.data();
+        }
+
+        // forward propagation
+        void propagate(Thread& th, const uint64_t offset, const uint64_t count) {
+
+            auto& thread_state = thread_states_[th.thread_idx()];
+
+            previous_layer_trainer_->propagate(th, offset, count);
 
 #if defined (USE_SSE2)
 
@@ -61,16 +76,16 @@ namespace Eval::NNUE {
                 const __m128 kZero4 = _mm_set1_ps(+kZero);
                 const __m128 kOne4 = _mm_set1_ps(+kOne);
 
-                for (IndexType b = 0; b < batch.size(); ++b)
+                for (IndexType b = offset; b < offset + count; ++b)
                 {
                     const IndexType batch_offset = kOutputDimensions * b;
 
                     for (IndexType i = 0; i < kOutputDimensions; i += 16)
                     {
-                        __m128 out0 = _mm_loadu_ps(&input[i + 0 + batch_offset]);
-                        __m128 out1 = _mm_loadu_ps(&input[i + 4 + batch_offset]);
-                        __m128 out2 = _mm_loadu_ps(&input[i + 8 + batch_offset]);
-                        __m128 out3 = _mm_loadu_ps(&input[i + 12 + batch_offset]);
+                        __m128 out0 = _mm_loadu_ps(&input_[i + 0 + batch_offset]);
+                        __m128 out1 = _mm_loadu_ps(&input_[i + 4 + batch_offset]);
+                        __m128 out2 = _mm_loadu_ps(&input_[i + 8 + batch_offset]);
+                        __m128 out3 = _mm_loadu_ps(&input_[i + 12 + batch_offset]);
 
                         out0 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out0));
                         out1 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out1));
@@ -82,15 +97,15 @@ namespace Eval::NNUE {
                         _mm_storeu_ps(&output_[i + 8 + batch_offset], out2);
                         _mm_storeu_ps(&output_[i + 12 + batch_offset], out3);
 
-                        __m128 minact0 = _mm_loadu_ps(&min_activations_[i + 0]);
-                        __m128 minact1 = _mm_loadu_ps(&min_activations_[i + 4]);
-                        __m128 minact2 = _mm_loadu_ps(&min_activations_[i + 8]);
-                        __m128 minact3 = _mm_loadu_ps(&min_activations_[i + 12]);
+                        __m128 minact0 = _mm_loadu_ps(&thread_state.min_activations_[i + 0]);
+                        __m128 minact1 = _mm_loadu_ps(&thread_state.min_activations_[i + 4]);
+                        __m128 minact2 = _mm_loadu_ps(&thread_state.min_activations_[i + 8]);
+                        __m128 minact3 = _mm_loadu_ps(&thread_state.min_activations_[i + 12]);
 
-                        __m128 maxact0 = _mm_loadu_ps(&max_activations_[i + 0]);
-                        __m128 maxact1 = _mm_loadu_ps(&max_activations_[i + 4]);
-                        __m128 maxact2 = _mm_loadu_ps(&max_activations_[i + 8]);
-                        __m128 maxact3 = _mm_loadu_ps(&max_activations_[i + 12]);
+                        __m128 maxact0 = _mm_loadu_ps(&thread_state.max_activations_[i + 0]);
+                        __m128 maxact1 = _mm_loadu_ps(&thread_state.max_activations_[i + 4]);
+                        __m128 maxact2 = _mm_loadu_ps(&thread_state.max_activations_[i + 8]);
+                        __m128 maxact3 = _mm_loadu_ps(&thread_state.max_activations_[i + 12]);
 
                         minact0 = _mm_min_ps(out0, minact0);
                         minact1 = _mm_min_ps(out1, minact1);
@@ -102,40 +117,41 @@ namespace Eval::NNUE {
                         maxact2 = _mm_max_ps(out2, maxact2);
                         maxact3 = _mm_max_ps(out3, maxact3);
 
-                        _mm_storeu_ps(&min_activations_[i + 0], minact0);
-                        _mm_storeu_ps(&min_activations_[i + 4], minact1);
-                        _mm_storeu_ps(&min_activations_[i + 8], minact2);
-                        _mm_storeu_ps(&min_activations_[i + 12], minact3);
+                        _mm_storeu_ps(&thread_state.min_activations_[i + 0], minact0);
+                        _mm_storeu_ps(&thread_state.min_activations_[i + 4], minact1);
+                        _mm_storeu_ps(&thread_state.min_activations_[i + 8], minact2);
+                        _mm_storeu_ps(&thread_state.min_activations_[i + 12], minact3);
 
-                        _mm_storeu_ps(&max_activations_[i + 0], maxact0);
-                        _mm_storeu_ps(&max_activations_[i + 4], maxact1);
-                        _mm_storeu_ps(&max_activations_[i + 8], maxact2);
-                        _mm_storeu_ps(&max_activations_[i + 12], maxact3);
+                        _mm_storeu_ps(&thread_state.max_activations_[i + 0], maxact0);
+                        _mm_storeu_ps(&thread_state.max_activations_[i + 4], maxact1);
+                        _mm_storeu_ps(&thread_state.max_activations_[i + 8], maxact2);
+                        _mm_storeu_ps(&thread_state.max_activations_[i + 12], maxact3);
                     }
                 }
             }
 
 #else
 
-            for (IndexType b = 0; b < batch_size_; ++b) {
+            for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType i = 0; i < kOutputDimensions; ++i) {
                     const IndexType index = batch_offset + i;
-                    output_[index] = std::max(+kZero, std::min(+kOne, input[index]));
-                    min_activations_[i] = std::min(min_activations_[i], output_[index]);
-                    max_activations_[i] = std::max(max_activations_[i], output_[index]);
+                    output_[index] = std::max(+kZero, std::min(+kOne, input_[index]));
+                    thread_state.min_activations_[i] = std::min(thread_state.min_activations_[i], output_[index]);
+                    thread_state.max_activations_[i] = std::max(thread_state.max_activations_[i], output_[index]);
                 }
             }
 
 #endif
-
-            return output_.data();
         }
 
         // backpropagation
-        void backpropagate(ThreadPool& thread_pool,
+        void backpropagate(Thread& th,
                            const LearnFloatType* gradients,
-                           LearnFloatType learning_rate) {
+                           const uint64_t offset,
+                           const uint64_t count) {
+
+            auto& thread_state = thread_states_[th.thread_idx()];
 
 #if defined (USE_SSE2)
 
@@ -145,62 +161,78 @@ namespace Eval::NNUE {
                 const __m128 kZero4 = _mm_set1_ps(+kZero);
                 const __m128 kOne4 = _mm_set1_ps(+kOne);
 
-                const IndexType total_size = batch_size_ * kOutputDimensions;
-
-                for (IndexType i = 0; i < total_size; i += 16)
+                for (IndexType b = offset; b < offset + count; ++b)
                 {
-                    __m128 out0 = _mm_loadu_ps(&output_[i + 0]);
-                    __m128 out1 = _mm_loadu_ps(&output_[i + 4]);
-                    __m128 out2 = _mm_loadu_ps(&output_[i + 8]);
-                    __m128 out3 = _mm_loadu_ps(&output_[i + 12]);
+                    const IndexType batch_offset = kOutputDimensions * b;
 
-                    __m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4));
-                    __m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4));
-                    __m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4));
-                    __m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), _mm_cmpge_ps(out3, kOne4));
+                    for (IndexType i = 0; i < kOutputDimensions; i += 16)
+                    {
+                        __m128 out0 = _mm_loadu_ps(&output_[batch_offset + i + 0]);
+                        __m128 out1 = _mm_loadu_ps(&output_[batch_offset + i + 4]);
+                        __m128 out2 = _mm_loadu_ps(&output_[batch_offset + i + 8]);
+                        __m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]);
 
-                    __m128 grad0 = _mm_loadu_ps(&gradients[i + 0]);
-                    __m128 grad1 = _mm_loadu_ps(&gradients[i + 4]);
-                    __m128 grad2 = _mm_loadu_ps(&gradients[i + 8]);
-                    __m128 grad3 = _mm_loadu_ps(&gradients[i + 12]);
+                        __m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4));
+                        __m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4));
+                        __m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4));
+                        __m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), _mm_cmpge_ps(out3, kOne4));
 
-                    grad0 = _mm_andnot_ps(clipped0, grad0);
-                    grad1 = _mm_andnot_ps(clipped1, grad1);
-                    grad2 = _mm_andnot_ps(clipped2, grad2);
-                    grad3 = _mm_andnot_ps(clipped3, grad3);
+                        __m128 grad0 = _mm_loadu_ps(&gradients[batch_offset + i + 0]);
+                        __m128 grad1 = _mm_loadu_ps(&gradients[batch_offset + i + 4]);
+                        __m128 grad2 = _mm_loadu_ps(&gradients[batch_offset + i + 8]);
+                        __m128 grad3 = _mm_loadu_ps(&gradients[batch_offset + i + 12]);
 
-                    _mm_storeu_ps(&gradients_[i + 0], grad0);
-                    _mm_storeu_ps(&gradients_[i + 4], grad1);
-                    _mm_storeu_ps(&gradients_[i + 8], grad2);
-                    _mm_storeu_ps(&gradients_[i + 12], grad3);
+                        grad0 = _mm_andnot_ps(clipped0, grad0);
+                        grad1 = _mm_andnot_ps(clipped1, grad1);
+                        grad2 = _mm_andnot_ps(clipped2, grad2);
+                        grad3 = _mm_andnot_ps(clipped3, grad3);
 
-                    const int clipped_mask =
-                        (_mm_movemask_ps(clipped0) << 0)
-                        | (_mm_movemask_ps(clipped1) << 4)
-                        | (_mm_movemask_ps(clipped2) << 8)
-                        | (_mm_movemask_ps(clipped3) << 12);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 0], grad0);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 4], grad1);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 8], grad2);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 12], grad3);
 
-                    num_clipped_ += popcount(clipped_mask);
+                        const int clipped_mask =
+                            (_mm_movemask_ps(clipped0) << 0)
+                            | (_mm_movemask_ps(clipped1) << 4)
+                            | (_mm_movemask_ps(clipped2) << 8)
+                            | (_mm_movemask_ps(clipped3) << 12);
+
+                        thread_state.num_clipped_ += popcount(clipped_mask);
+                    }
                 }
             }
 
 #else
 
-            for (IndexType b = 0; b < batch_size_; ++b) {
+            for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType i = 0; i < kOutputDimensions; ++i) {
                     const IndexType index = batch_offset + i;
                     const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne);
                     gradients_[index] = gradients[index] * !clipped;
-                    num_clipped_ += clipped;
+                    thread_state.num_clipped_ += clipped;
                 }
             }
 
 #endif
 
-            num_total_ += batch_size_ * kOutputDimensions;
+            thread_state.num_total_ += count * kOutputDimensions;
 
-            previous_layer_trainer_->backpropagate(thread_pool, gradients_.data(), learning_rate);
+            previous_layer_trainer_->backpropagate(th, gradients_.data(), offset, count);
+        }
+
+        void reduce_thread_state()
+        {
+            for (IndexType i = 1; i < thread_states_.size(); ++i)
+            {
+                thread_states_[0] += thread_states_[i];
+            }
+        }
+
+        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate)
+        {
+            previous_layer_trainer_->step_end(thread_pool, learning_rate);
         }
 
     private:
@@ -215,22 +247,21 @@ namespace Eval::NNUE {
         }
 
         void reset_stats() {
-            std::fill(std::begin(min_activations_), std::end(min_activations_),
-                      std::numeric_limits<LearnFloatType>::max());
-            std::fill(std::begin(max_activations_), std::end(max_activations_),
-                      std::numeric_limits<LearnFloatType>::lowest());
-
-            num_clipped_ = 0;
-            num_total_ = 0;
+            for(auto& state : thread_states_)
+                state.reset();
         }
 
         // Check if there are any problems with learning
         void check_health() {
 
+            reduce_thread_state();
+
+            auto& main_thread_state = thread_states_[0];
+
             const auto largest_min_activation = *std::max_element(
-                std::begin(min_activations_), std::end(min_activations_));
+                std::begin(main_thread_state.min_activations_), std::end(main_thread_state.min_activations_));
             const auto smallest_max_activation = *std::min_element(
-                std::begin(max_activations_), std::end(max_activations_));
+                std::begin(main_thread_state.max_activations_), std::end(main_thread_state.max_activations_));
 
             auto out = sync_region_cout.new_region();
 
@@ -243,7 +274,7 @@ namespace Eval::NNUE {
                 << " , smallest max activation = " << smallest_max_activation
                 << std::endl;
 
-            out << "  - clipped " << static_cast<double>(num_clipped_) / num_total_ * 100.0 << "% of outputs"
+            out << "  - clipped " << static_cast<double>(main_thread_state.num_clipped_) / main_thread_state.num_total_ * 100.0 << "% of outputs"
                 << std::endl;
 
             out.unlock();
@@ -262,9 +293,10 @@ namespace Eval::NNUE {
         // number of samples in mini-batch
         IndexType batch_size_;
 
-        IndexType num_clipped_;
         IndexType num_total_;
 
+        const LearnFloatType* input_;
+
         // Trainer of the previous layer
         const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
 
@@ -277,9 +309,44 @@ namespace Eval::NNUE {
         // buffer for back propagation
         std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
 
-        // Health check statistics
-        LearnFloatType min_activations_[kOutputDimensions];
-        LearnFloatType max_activations_[kOutputDimensions];
+        struct alignas(kCacheLineSize) ThreadState
+        {
+            // Health check statistics
+            LearnFloatType min_activations_[kOutputDimensions];
+            LearnFloatType max_activations_[kOutputDimensions];
+            IndexType num_clipped_;
+            IndexType num_total_;
+
+            ThreadState() { reset(); }
+
+            ThreadState& operator+=(const ThreadState& other)
+            {
+                for (IndexType i = 0; i < kOutputDimensions; ++i)
+                {
+                    min_activations_[i] = std::min(min_activations_[i], other.min_activations_[i]);
+                }
+
+                for (IndexType i = 0; i < kOutputDimensions; ++i)
+                {
+                    max_activations_[i] = std::max(max_activations_[i], other.max_activations_[i]);
+                }
+
+                num_clipped_ += other.num_clipped_;
+                num_total_ += other.num_total_;
+
+                return *this;
+            }
+
+            void reset()
+            {
+                std::fill(std::begin(min_activations_), std::end(min_activations_), std::numeric_limits<float>::max());
+                std::fill(std::begin(max_activations_), std::end(max_activations_), std::numeric_limits<float>::lowest());
+                num_clipped_ = 0;
+                num_total_ = 0;
+            }
+        };
+
+        std::vector<ThreadState, CacheLineAlignedAllocator<ThreadState>> thread_states_;
     };
 
 }  // namespace Eval::NNUE

From a3c78691a23fd743e2a815b65594609683b87d9c Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 22 Nov 2020 20:44:26 +0100
Subject: [PATCH 362/398] Prepare input slice trainer.

---
 src/nnue/trainer/trainer_input_slice.h | 181 ++++++++++++++++---------
 1 file changed, 115 insertions(+), 66 deletions(-)

diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index a93a3ea0..54f03d42 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -34,15 +34,15 @@ namespace Eval::NNUE {
 
         // Set options such as hyperparameters
         void send_message(Message* message) {
-            if (num_calls_ == 0) {
+            if (num_calls_[0] == 0) {
                 current_operation_ = Operation::kSendMessage;
                 feature_transformer_trainer_->send_message(message);
             }
 
             assert(current_operation_ == Operation::kSendMessage);
 
-            if (++num_calls_ == num_referrers_) {
-                num_calls_ = 0;
+            if (++num_calls_[0] == num_referrers_) {
+                num_calls_[0] = 0;
                 current_operation_ = Operation::kNone;
             }
         }
@@ -50,55 +50,79 @@ namespace Eval::NNUE {
         // Initialize the parameters with random numbers
         template <typename RNG>
         void initialize(RNG& rng) {
-            if (num_calls_ == 0) {
+            if (num_calls_[0] == 0) {
                 current_operation_ = Operation::kInitialize;
                 feature_transformer_trainer_->initialize(rng);
             }
 
             assert(current_operation_ == Operation::kInitialize);
 
-            if (++num_calls_ == num_referrers_) {
-                num_calls_ = 0;
+            if (++num_calls_[0] == num_referrers_) {
+                num_calls_[0] = 0;
                 current_operation_ = Operation::kNone;
             }
         }
 
-        // forward propagation
-        const LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
-            if (gradients_.size() < kInputDimensions * batch.size()) {
-                gradients_.resize(kInputDimensions * batch.size());
+        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch) {
+            if (gradients_.size() < kInputDimensions * combined_batch.size()) {
+                gradients_.resize(kInputDimensions * combined_batch.size());
             }
 
-            batch_size_ = static_cast<IndexType>(batch.size());
-
-            if (num_calls_ == 0) {
-                current_operation_ = Operation::kPropagate;
-                output_ = feature_transformer_trainer_->propagate(thread_pool, batch);
+            if (num_calls_.size() < thread_pool.size())
+            {
+                num_calls_.resize(thread_pool.size(), 0);
             }
 
-            assert(current_operation_ == Operation::kPropagate);
+            batch_size_ = static_cast<IndexType>(combined_batch.size());
 
-            if (++num_calls_ == num_referrers_) {
-                num_calls_ = 0;
+            if (num_calls_[0] == 0) {
+                current_operation_ = Operation::kStepStart;
+                output_ = feature_transformer_trainer_->step_start(thread_pool, combined_batch);
+            }
+
+            assert(current_operation_ == Operation::kStepStart);
+
+            if (++num_calls_[0] == num_referrers_) {
+                num_calls_[0] = 0;
                 current_operation_ = Operation::kNone;
             }
 
             return output_;
         }
 
+        // forward propagation
+        void propagate(Thread& th, uint64_t offset, uint64_t count) {
+            const auto thread_id = th.thread_idx();
+
+            if (num_calls_[thread_id] == 0) {
+                current_operation_ = Operation::kPropagate;
+                feature_transformer_trainer_->propagate(th, offset, count);
+            }
+
+            assert(current_operation_ == Operation::kPropagate);
+
+            if (++num_calls_[thread_id] == num_referrers_) {
+                num_calls_[thread_id] = 0;
+                current_operation_ = Operation::kNone;
+            }
+        }
+
         // backpropagation
-        void backpropagate(ThreadPool& thread_pool,
+        void backpropagate(Thread& th,
                            const LearnFloatType* gradients,
-                           LearnFloatType learning_rate) {
+                           uint64_t offset,
+                           uint64_t count) {
+
+            const auto thread_id = th.thread_idx();
 
             if (num_referrers_ == 1) {
-                feature_transformer_trainer_->backpropagate(thread_pool, gradients, learning_rate);
+                feature_transformer_trainer_->backpropagate(th, gradients, offset, count);
                 return;
             }
 
-            if (num_calls_ == 0) {
+            if (num_calls_[thread_id] == 0) {
                 current_operation_ = Operation::kBackPropagate;
-                for (IndexType b = 0; b < batch_size_; ++b) {
+                for (IndexType b = offset; b < offset + count; ++b) {
                     const IndexType batch_offset = kInputDimensions * b;
                     for (IndexType i = 0; i < kInputDimensions; ++i) {
                         gradients_[batch_offset + i] = static_cast<LearnFloatType>(0.0);
@@ -108,17 +132,31 @@ namespace Eval::NNUE {
 
             assert(current_operation_ == Operation::kBackPropagate);
 
-            for (IndexType b = 0; b < batch_size_; ++b) {
+            for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType batch_offset = kInputDimensions * b;
                 for (IndexType i = 0; i < kInputDimensions; ++i) {
                     gradients_[batch_offset + i] += gradients[batch_offset + i];
                 }
             }
 
-            if (++num_calls_ == num_referrers_) {
+            if (++num_calls_[thread_id] == num_referrers_) {
                 feature_transformer_trainer_->backpropagate(
-                    thread_pool, gradients_.data(), learning_rate);
-                num_calls_ = 0;
+                    th, gradients_.data(), offset, count);
+                num_calls_[thread_id] = 0;
+                current_operation_ = Operation::kNone;
+            }
+        }
+
+        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
+            if (num_calls_[0] == 0) {
+                current_operation_ = Operation::kStepEnd;
+                feature_transformer_trainer_->step_end(thread_pool, learning_rate);
+            }
+
+            assert(current_operation_ == Operation::kStepEnd);
+
+            if (++num_calls_[0] == num_referrers_) {
+                num_calls_[0] = 0;
                 current_operation_ = Operation::kNone;
             }
         }
@@ -128,7 +166,7 @@ namespace Eval::NNUE {
         SharedInputTrainer(FeatureTransformer* ft) :
             batch_size_(0),
             num_referrers_(0),
-            num_calls_(0),
+            num_calls_(1, 0),
             current_operation_(Operation::kNone),
             feature_transformer_trainer_(Trainer<FeatureTransformer>::create(
                 ft)),
@@ -144,8 +182,10 @@ namespace Eval::NNUE {
             kNone,
             kSendMessage,
             kInitialize,
+            kStepStart,
             kPropagate,
             kBackPropagate,
+            kStepEnd,
         };
 
         // number of samples in mini-batch
@@ -155,7 +195,7 @@ namespace Eval::NNUE {
         std::uint32_t num_referrers_;
 
         // Number of times the current process has been called
-        std::uint32_t num_calls_;
+        std::vector<std::uint32_t> num_calls_;
 
         // current processing type
         Operation current_operation_;
@@ -197,74 +237,81 @@ namespace Eval::NNUE {
             shared_input_trainer_->initialize(rng);
         }
 
-        // forward propagation
-        const LearnFloatType* propagate(ThreadPool& thread_pool,const std::vector<Example>& batch) {
-            if (output_.size() < kOutputDimensions * batch.size()) {
-              output_.resize(kOutputDimensions * batch.size());
-              gradients_.resize(kInputDimensions * batch.size());
+        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch) {
+            if (output_.size() < kOutputDimensions * combined_batch.size()) {
+              output_.resize(kOutputDimensions * combined_batch.size());
+              gradients_.resize(kInputDimensions * combined_batch.size());
             }
 
-            batch_size_ = static_cast<IndexType>(batch.size());
+            batch_size_ = static_cast<IndexType>(combined_batch.size());
 
-            const auto input = shared_input_trainer_->propagate(thread_pool, batch);
-            for (IndexType b = 0; b < batch_size_; ++b) {
+            input_ = shared_input_trainer_->step_start(thread_pool, combined_batch);
+
+            return output_.data();
+        }
+
+        // forward propagation
+        void propagate(Thread& th, uint64_t offset, uint64_t count) {
+
+            shared_input_trainer_->propagate(th, offset, count);
+
+            for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType input_offset = kInputDimensions * b;
                 const IndexType output_offset = kOutputDimensions * b;
 
 #if defined(USE_BLAS)
 
                 cblas_scopy(
-                    kOutputDimensions, &input[input_offset + Offset], 1,
+                    kOutputDimensions, &input_[input_offset + Offset], 1,
                     &output_[output_offset], 1
                 );
 #else
 
                 Blas::scopy(
-                    thread_pool,
-                    kOutputDimensions, &input[input_offset + Offset], 1,
+                    kOutputDimensions, &input_[input_offset + Offset], 1,
                     &output_[output_offset], 1
                 );
 
 #endif
             }
-
-            return output_.data();
         }
 
         // backpropagation
-        void backpropagate(ThreadPool& thread_pool,
+        void backpropagate(Thread& th,
                            const LearnFloatType* gradients,
-                           LearnFloatType learning_rate) {
+                           uint64_t offset,
+                           uint64_t count) {
 
-            thread_pool.for_each_index_with_workers(
-                0, batch_size_,
-                [&](Thread&, int b) {
-                    const IndexType input_offset = kInputDimensions * b;
-                    const IndexType output_offset = kOutputDimensions * b;
+            for (IndexType b = offset; b < offset + count; ++b)
+            {
+                const IndexType input_offset = kInputDimensions * b;
+                const IndexType output_offset = kOutputDimensions * b;
 
-                    IndexType i = 0;
-                    for (; i < Offset; ++i) {
-                        gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
-                    }
-
-                    for (; i < Offset + kOutputDimensions; ++i) {
-                        gradients_[input_offset + i] = gradients[output_offset + i - Offset];
-                    }
-
-                    for (; i < kInputDimensions; ++i)
-                    {
-                        gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
-                    }
+                IndexType i = 0;
+                for (; i < Offset; ++i) {
+                    gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
                 }
-            );
-            thread_pool.wait_for_workers_finished();
 
-            shared_input_trainer_->backpropagate(thread_pool, gradients_.data(), learning_rate);
+                for (; i < Offset + kOutputDimensions; ++i) {
+                    gradients_[input_offset + i] = gradients[output_offset + i - Offset];
+                }
+
+                for (; i < kInputDimensions; ++i)
+                {
+                    gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
+                }
+            }
+
+            shared_input_trainer_->backpropagate(th, gradients_.data(), offset, count);
+        }
+
+        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
+            shared_input_trainer_->step_end(thread_pool, learning_rate);
         }
 
     private:
         // constructor
-        Trainer(FeatureTransformer* ft):
+        Trainer(FeatureTransformer* ft) :
             batch_size_(0),
             shared_input_trainer_(SharedInputTrainer::create(ft)) {
         }
@@ -278,6 +325,8 @@ namespace Eval::NNUE {
         // number of samples in mini-batch
         IndexType batch_size_;
 
+        const LearnFloatType* input_;
+
         // Trainer of shared input layer
         const std::shared_ptr<SharedInputTrainer> shared_input_trainer_;
 

From 15c528ca7b6beefa64ba2c0192c7dc3efacc665e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 22 Nov 2020 21:38:11 +0100
Subject: [PATCH 363/398] Prepare feature transformer learner.

---
 .../trainer/trainer_feature_transformer.h     | 486 +++++++++++-------
 1 file changed, 298 insertions(+), 188 deletions(-)

diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 80f914f2..9686002f 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -89,56 +89,88 @@ namespace Eval::NNUE {
             quantize_parameters();
         }
 
-        // forward propagation
-        const LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
-            if (output_.size() < kOutputDimensions * batch.size()) {
-                output_.resize(kOutputDimensions * batch.size());
-                gradients_.resize(kOutputDimensions * batch.size());
+        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
+        {
+            if (output_.size() < kOutputDimensions * combined_batch.size()) {
+                output_.resize(kOutputDimensions * combined_batch.size());
+                gradients_.resize(kOutputDimensions * combined_batch.size());
             }
 
-            (void)thread_pool;
+            if (thread_stat_states_.size() < thread_pool.size())
+            {
+                thread_stat_states_.resize(thread_pool.size());
+            }
 
-            batch_ = &batch;
-            // affine transform
-            thread_pool.for_each_index_with_workers(
-                0, batch.size(),
-                [&](Thread&, int b) {
-                    const IndexType batch_offset = kOutputDimensions * b;
-                    for (IndexType c = 0; c < 2; ++c) {
-                        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+            if (thread_bias_states_.size() < thread_pool.size())
+            {
+                thread_bias_states_.resize(thread_pool.size());
+            }
+
+            batch_ = &combined_batch;
+
+            auto& main_thread_bias_state = thread_bias_states_[0];
 
 #if defined(USE_BLAS)
 
-                        cblas_scopy(
-                            kHalfDimensions, biases_, 1, &output_[output_offset], 1
-                        );
-
-                        for (const auto& feature : batch[b].training_features[c]) {
-                            const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                            cblas_saxpy(
-                                kHalfDimensions, (float)feature.get_count(),
-                                &weights_[weights_offset], 1, &output_[output_offset], 1
-                            );
-                        }
+            cblas_sscal(
+                kHalfDimensions, momentum_, main_thread_bias_state.biases_diff_, 1
+            );
 
 #else
 
-                        Blas::scopy(
-                            kHalfDimensions, biases_, 1, &output_[output_offset], 1
-                        );
-                        for (const auto& feature : batch[b].training_features[c]) {
-                            const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                            Blas::saxpy(
-                                kHalfDimensions, (float)feature.get_count(),
-                                &weights_[weights_offset], 1, &output_[output_offset], 1
-                            );
-                        }
+            Blas::sscal(
+                kHalfDimensions, momentum_, main_thread_bias_state.biases_diff_, 1
+            );
 
 #endif
+
+            for (IndexType i = 1; i < thread_bias_states_.size(); ++i)
+                thread_bias_states_[i].reset();
+
+            return output_.data();
+        }
+
+        // forward propagation
+        void propagate(Thread& th, uint64_t offset, uint64_t count) {
+
+            auto& thread_stat_state = thread_stat_states_[th.thread_idx()];
+
+            for (IndexType b = offset; b < offset + count; ++b)
+            {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType c = 0; c < 2; ++c) {
+                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
+
+#if defined(USE_BLAS)
+
+                    cblas_scopy(
+                        kHalfDimensions, biases_, 1, &output_[output_offset], 1
+                    );
+
+                    for (const auto& feature : (*batch_)[b].training_features[c]) {
+                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
+                        cblas_saxpy(
+                            kHalfDimensions, (float)feature.get_count(),
+                            &weights_[weights_offset], 1, &output_[output_offset], 1
+                        );
                     }
+
+#else
+
+                    Blas::scopy(
+                        kHalfDimensions, biases_, 1, &output_[output_offset], 1
+                    );
+                    for (const auto& feature : (*batch_)[b].training_features[c]) {
+                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
+                        Blas::saxpy(
+                            kHalfDimensions, (float)feature.get_count(),
+                            &weights_[weights_offset], 1, &output_[output_offset], 1
+                        );
+                    }
+
+#endif
                 }
-            );
-            thread_pool.wait_for_workers_finished();
+            }
 
 #if defined (USE_SSE2)
 
@@ -161,49 +193,51 @@ namespace Eval::NNUE {
                     return _mm_cvtss_f32(_mm_max_ps(max_x_x_13_20, max_x_x_20_13));
                 };
 
-                const int total_size = batch.size() * kOutputDimensions;
-
                 const __m128 kZero4 = _mm_set1_ps(+kZero);
                 const __m128 kOne4 = _mm_set1_ps(+kOne);
 
-                __m128 min_pre_activation0 = _mm_set1_ps(min_pre_activation_);
-                __m128 min_pre_activation1 = _mm_set1_ps(min_pre_activation_);
-                __m128 max_pre_activation0 = _mm_set1_ps(max_pre_activation_);
-                __m128 max_pre_activation1 = _mm_set1_ps(max_pre_activation_);
+                __m128 min_pre_activation0 = _mm_set1_ps(thread_stat_state.min_pre_activation_);
+                __m128 min_pre_activation1 = _mm_set1_ps(thread_stat_state.min_pre_activation_);
+                __m128 max_pre_activation0 = _mm_set1_ps(thread_stat_state.max_pre_activation_);
+                __m128 max_pre_activation1 = _mm_set1_ps(thread_stat_state.max_pre_activation_);
 
-                for (int i = 0; i < total_size; i += 16)
+                for (IndexType b = offset; b < offset + count; ++b)
                 {
-                    __m128 out0 = _mm_loadu_ps(&output_[i +  0]);
-                    __m128 out1 = _mm_loadu_ps(&output_[i +  4]);
-                    __m128 out2 = _mm_loadu_ps(&output_[i +  8]);
-                    __m128 out3 = _mm_loadu_ps(&output_[i + 12]);
+                    const IndexType batch_offset = kOutputDimensions * b;
+                    for (IndexType i = 0; i < kOutputDimensions; i += 16)
+                    {
+                        __m128 out0 = _mm_loadu_ps(&output_[batch_offset + i +  0]);
+                        __m128 out1 = _mm_loadu_ps(&output_[batch_offset + i +  4]);
+                        __m128 out2 = _mm_loadu_ps(&output_[batch_offset + i +  8]);
+                        __m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]);
 
-                    __m128 min01 = _mm_min_ps(out0, out1);
-                    __m128 min23 = _mm_min_ps(out2, out3);
+                        __m128 min01 = _mm_min_ps(out0, out1);
+                        __m128 min23 = _mm_min_ps(out2, out3);
 
-                    __m128 max01 = _mm_max_ps(out0, out1);
-                    __m128 max23 = _mm_max_ps(out2, out3);
+                        __m128 max01 = _mm_max_ps(out0, out1);
+                        __m128 max23 = _mm_max_ps(out2, out3);
 
-                    min_pre_activation0 = _mm_min_ps(min_pre_activation0, min01);
-                    min_pre_activation1 = _mm_min_ps(min_pre_activation1, min23);
-                    max_pre_activation0 = _mm_max_ps(max_pre_activation0, max01);
-                    max_pre_activation1 = _mm_max_ps(max_pre_activation1, max23);
+                        min_pre_activation0 = _mm_min_ps(min_pre_activation0, min01);
+                        min_pre_activation1 = _mm_min_ps(min_pre_activation1, min23);
+                        max_pre_activation0 = _mm_max_ps(max_pre_activation0, max01);
+                        max_pre_activation1 = _mm_max_ps(max_pre_activation1, max23);
 
-                    out0 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out0));
-                    out1 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out1));
-                    out2 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out2));
-                    out3 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out3));
+                        out0 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out0));
+                        out1 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out1));
+                        out2 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out2));
+                        out3 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out3));
 
-                    _mm_storeu_ps(&output_[i +  0], out0);
-                    _mm_storeu_ps(&output_[i +  4], out1);
-                    _mm_storeu_ps(&output_[i +  8], out2);
-                    _mm_storeu_ps(&output_[i + 12], out3);
+                        _mm_storeu_ps(&output_[batch_offset + i +  0], out0);
+                        _mm_storeu_ps(&output_[batch_offset + i +  4], out1);
+                        _mm_storeu_ps(&output_[batch_offset + i +  8], out2);
+                        _mm_storeu_ps(&output_[batch_offset + i + 12], out3);
+                    }
                 }
 
-                min_pre_activation_ = m128_hmin_ps(_mm_min_ps(min_pre_activation0, min_pre_activation1));
-                max_pre_activation_ = m128_hmax_ps(_mm_max_ps(max_pre_activation0, max_pre_activation1));
+                thread_stat_state.min_pre_activation_ = m128_hmin_ps(_mm_min_ps(min_pre_activation0, min_pre_activation1));
+                thread_stat_state.max_pre_activation_ = m128_hmax_ps(_mm_max_ps(max_pre_activation0, max_pre_activation1));
 
-                for (IndexType b = 0; b < batch.size(); ++b)
+                for (IndexType b = offset; b < offset + count; ++b)
                 {
                     const IndexType batch_offset = kOutputDimensions * b;
 
@@ -217,15 +251,15 @@ namespace Eval::NNUE {
                             const __m128 out2 = _mm_loadu_ps(&output_[i +  8 + half_offset]);
                             const __m128 out3 = _mm_loadu_ps(&output_[i + 12 + half_offset]);
 
-                            __m128 minact0 = _mm_loadu_ps(&min_activations_[i +  0]);
-                            __m128 minact1 = _mm_loadu_ps(&min_activations_[i +  4]);
-                            __m128 minact2 = _mm_loadu_ps(&min_activations_[i +  8]);
-                            __m128 minact3 = _mm_loadu_ps(&min_activations_[i + 12]);
+                            __m128 minact0 = _mm_loadu_ps(&thread_stat_state.min_activations_[i +  0]);
+                            __m128 minact1 = _mm_loadu_ps(&thread_stat_state.min_activations_[i +  4]);
+                            __m128 minact2 = _mm_loadu_ps(&thread_stat_state.min_activations_[i +  8]);
+                            __m128 minact3 = _mm_loadu_ps(&thread_stat_state.min_activations_[i + 12]);
 
-                            __m128 maxact0 = _mm_loadu_ps(&max_activations_[i +  0]);
-                            __m128 maxact1 = _mm_loadu_ps(&max_activations_[i +  4]);
-                            __m128 maxact2 = _mm_loadu_ps(&max_activations_[i +  8]);
-                            __m128 maxact3 = _mm_loadu_ps(&max_activations_[i + 12]);
+                            __m128 maxact0 = _mm_loadu_ps(&thread_stat_state.max_activations_[i +  0]);
+                            __m128 maxact1 = _mm_loadu_ps(&thread_stat_state.max_activations_[i +  4]);
+                            __m128 maxact2 = _mm_loadu_ps(&thread_stat_state.max_activations_[i +  8]);
+                            __m128 maxact3 = _mm_loadu_ps(&thread_stat_state.max_activations_[i + 12]);
 
                             minact0 = _mm_min_ps(out0, minact0);
                             minact1 = _mm_min_ps(out1, minact1);
@@ -237,15 +271,15 @@ namespace Eval::NNUE {
                             maxact2 = _mm_max_ps(out2, maxact2);
                             maxact3 = _mm_max_ps(out3, maxact3);
 
-                            _mm_storeu_ps(&min_activations_[i +  0], minact0);
-                            _mm_storeu_ps(&min_activations_[i +  4], minact1);
-                            _mm_storeu_ps(&min_activations_[i +  8], minact2);
-                            _mm_storeu_ps(&min_activations_[i + 12], minact3);
+                            _mm_storeu_ps(&thread_stat_state.min_activations_[i +  0], minact0);
+                            _mm_storeu_ps(&thread_stat_state.min_activations_[i +  4], minact1);
+                            _mm_storeu_ps(&thread_stat_state.min_activations_[i +  8], minact2);
+                            _mm_storeu_ps(&thread_stat_state.min_activations_[i + 12], minact3);
 
-                            _mm_storeu_ps(&max_activations_[i +  0], maxact0);
-                            _mm_storeu_ps(&max_activations_[i +  4], maxact1);
-                            _mm_storeu_ps(&max_activations_[i +  8], maxact2);
-                            _mm_storeu_ps(&max_activations_[i + 12], maxact3);
+                            _mm_storeu_ps(&thread_stat_state.max_activations_[i +  0], maxact0);
+                            _mm_storeu_ps(&thread_stat_state.max_activations_[i +  4], maxact1);
+                            _mm_storeu_ps(&thread_stat_state.max_activations_[i +  8], maxact2);
+                            _mm_storeu_ps(&thread_stat_state.max_activations_[i + 12], maxact3);
                         }
                     }
                 }
@@ -254,33 +288,30 @@ namespace Eval::NNUE {
 #else
 
             // clipped ReLU
-            for (IndexType b = 0; b < batch.size(); ++b) {
+            for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType i = 0; i < kOutputDimensions; ++i) {
                     const IndexType index = batch_offset + i;
-                    min_pre_activation_ = std::min(min_pre_activation_, output_[index]);
-                    max_pre_activation_ = std::max(max_pre_activation_, output_[index]);
+                    thread_stat_state.min_pre_activation_ = std::min(thread_stat_state.min_pre_activation_, output_[index]);
+                    thread_stat_state.max_pre_activation_ = std::max(thread_stat_state.max_pre_activation_, output_[index]);
                     output_[index] = std::max(+kZero, std::min(+kOne, output_[index]));
                     const IndexType t = i % kHalfDimensions;
-                    min_activations_[t] = std::min(min_activations_[t], output_[index]);
-                    max_activations_[t] = std::max(max_activations_[t], output_[index]);
+                    thread_stat_state.min_activations_[t] = std::min(thread_stat_state.min_activations_[t], output_[index]);
+                    thread_stat_state.max_activations_[t] = std::max(thread_stat_state.max_activations_[t], output_[index]);
                 }
             }
 
 #endif
-
-            return output_.data();
         }
 
         // backpropagation
-        void backpropagate(ThreadPool& thread_pool,
+        void backpropagate(Thread& th,
                            const LearnFloatType* gradients,
-                           LearnFloatType learning_rate) {
+                           uint64_t offset,
+                           uint64_t count) {
 
-            (void)thread_pool;
-
-            const LearnFloatType local_learning_rate =
-                learning_rate * learning_rate_scale_;
+            auto& thread_stat_state = thread_stat_states_[th.thread_idx()];
+            auto& thread_bias_state = thread_bias_states_[th.thread_idx()];
 
 #if defined (USE_SSE2)
 
@@ -290,111 +321,134 @@ namespace Eval::NNUE {
                 const __m128 kZero4 = _mm_set1_ps(+kZero);
                 const __m128 kOne4 = _mm_set1_ps(+kOne);
 
-                const IndexType total_size = batch_->size() * kOutputDimensions;
-
-                for (IndexType i = 0; i < total_size; i += 16)
+                for (IndexType b = offset; b < offset + count; ++b)
                 {
-                    __m128 out0 = _mm_loadu_ps(&output_[i + 0]);
-                    __m128 out1 = _mm_loadu_ps(&output_[i + 4]);
-                    __m128 out2 = _mm_loadu_ps(&output_[i + 8]);
-                    __m128 out3 = _mm_loadu_ps(&output_[i + 12]);
+                    const IndexType batch_offset = kOutputDimensions * b;
+                    for (IndexType i = 0; i < kOutputDimensions; i += 16)
+                    {
+                        __m128 out0 = _mm_loadu_ps(&output_[batch_offset + i + 0]);
+                        __m128 out1 = _mm_loadu_ps(&output_[batch_offset + i + 4]);
+                        __m128 out2 = _mm_loadu_ps(&output_[batch_offset + i + 8]);
+                        __m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]);
 
-                    __m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4));
-                    __m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4));
-                    __m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4));
-                    __m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), _mm_cmpge_ps(out3, kOne4));
+                        __m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4));
+                        __m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4));
+                        __m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4));
+                        __m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), _mm_cmpge_ps(out3, kOne4));
 
-                    __m128 grad0 = _mm_loadu_ps(&gradients[i + 0]);
-                    __m128 grad1 = _mm_loadu_ps(&gradients[i + 4]);
-                    __m128 grad2 = _mm_loadu_ps(&gradients[i + 8]);
-                    __m128 grad3 = _mm_loadu_ps(&gradients[i + 12]);
+                        __m128 grad0 = _mm_loadu_ps(&gradients[batch_offset + i + 0]);
+                        __m128 grad1 = _mm_loadu_ps(&gradients[batch_offset + i + 4]);
+                        __m128 grad2 = _mm_loadu_ps(&gradients[batch_offset + i + 8]);
+                        __m128 grad3 = _mm_loadu_ps(&gradients[batch_offset + i + 12]);
 
-                    grad0 = _mm_andnot_ps(clipped0, grad0);
-                    grad1 = _mm_andnot_ps(clipped1, grad1);
-                    grad2 = _mm_andnot_ps(clipped2, grad2);
-                    grad3 = _mm_andnot_ps(clipped3, grad3);
+                        grad0 = _mm_andnot_ps(clipped0, grad0);
+                        grad1 = _mm_andnot_ps(clipped1, grad1);
+                        grad2 = _mm_andnot_ps(clipped2, grad2);
+                        grad3 = _mm_andnot_ps(clipped3, grad3);
 
-                    _mm_storeu_ps(&gradients_[i + 0], grad0);
-                    _mm_storeu_ps(&gradients_[i + 4], grad1);
-                    _mm_storeu_ps(&gradients_[i + 8], grad2);
-                    _mm_storeu_ps(&gradients_[i + 12], grad3);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 0], grad0);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 4], grad1);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 8], grad2);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 12], grad3);
 
-                    const int clipped_mask =
-                        (_mm_movemask_ps(clipped0) << 0)
-                        | (_mm_movemask_ps(clipped1) << 4)
-                        | (_mm_movemask_ps(clipped2) << 8)
-                        | (_mm_movemask_ps(clipped3) << 12);
+                        const int clipped_mask =
+                            (_mm_movemask_ps(clipped0) << 0)
+                            | (_mm_movemask_ps(clipped1) << 4)
+                            | (_mm_movemask_ps(clipped2) << 8)
+                            | (_mm_movemask_ps(clipped3) << 12);
 
-                    num_clipped_ += popcount(clipped_mask);
+                        thread_stat_state.num_clipped_ += popcount(clipped_mask);
+                    }
                 }
             }
 
 #else
 
-            for (IndexType b = 0; b < batch_->size(); ++b) {
+            for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType i = 0; i < kOutputDimensions; ++i) {
                     const IndexType index = batch_offset + i;
                     const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne);
                     gradients_[index] = gradients[index] * !clipped;
-                    num_clipped_ += clipped;
+                    thread_stat_state.num_clipped_ += clipped;
                 }
             }
 
 #endif
 
-            num_total_ += batch_->size() * kOutputDimensions;
+            thread_stat_state.num_total_ += count * kOutputDimensions;
+
+#if defined(USE_BLAS)
+
+            for (IndexType b = offset; b < offset + count; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType c = 0; c < 2; ++c) {
+                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                    cblas_saxpy(
+                        kHalfDimensions, 1.0,
+                        &gradients_[output_offset], 1, thread_bias_state.biases_diff_, 1
+                    );
+                }
+            }
+
+#else
+
+            for (IndexType b = offset; b < offset + count; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType c = 0; c < 2; ++c) {
+                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                    Blas::saxpy(
+                        kHalfDimensions, 1.0,
+                        &gradients_[output_offset], 1, thread_bias_state.biases_diff_, 1
+                    );
+                }
+            }
+
+#endif
+        }
+
+        void reduce_thread_stat_state()
+        {
+            for (IndexType i = 1; i < thread_stat_states_.size(); ++i)
+            {
+                thread_stat_states_[0] += thread_stat_states_[i];
+            }
+        }
+
+        void reduce_thread_bias_state()
+        {
+            for (IndexType i = 1; i < thread_bias_states_.size(); ++i)
+            {
+                thread_bias_states_[0] += thread_bias_states_[i];
+            }
+        }
+
+        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
+
+            const LearnFloatType local_learning_rate =
+                learning_rate * learning_rate_scale_;
 
             // Since the weight matrix updates only the columns corresponding to the features that appeared in the input,
             // Correct the learning rate and adjust the scale without using momentum
             const LearnFloatType effective_learning_rate =
                 static_cast<LearnFloatType>(local_learning_rate / (1.0 - momentum_));
 
+            reduce_thread_bias_state();
+
+            auto& main_thread_state = thread_bias_states_[0];
+
 #if defined(USE_BLAS)
 
-            cblas_sscal(
-                kHalfDimensions, momentum_, biases_diff_, 1
-            );
-
-            for (IndexType b = 0; b < batch_->size(); ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                for (IndexType c = 0; c < 2; ++c) {
-                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                    cblas_saxpy(
-                        kHalfDimensions, 1.0,
-                        &gradients_[output_offset], 1, biases_diff_, 1
-                    );
-                }
-            }
-
             cblas_saxpy(
                 kHalfDimensions, -local_learning_rate,
-                biases_diff_, 1, biases_, 1
+                main_thread_state.biases_diff_, 1, biases_, 1
             );
 
 #else
 
-            Blas::sscal(
-                thread_pool,
-                kHalfDimensions, momentum_, biases_diff_, 1
-            );
-
-            for (IndexType b = 0; b < batch_->size(); ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                for (IndexType c = 0; c < 2; ++c) {
-                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                    Blas::saxpy(
-                        thread_pool,
-                        kHalfDimensions, 1.0,
-                        &gradients_[output_offset], 1, biases_diff_, 1
-                    );
-                }
-            }
-
             Blas::saxpy(
-                thread_pool,
                 kHalfDimensions, -local_learning_rate,
-                biases_diff_, 1, biases_, 1
+                main_thread_state.biases_diff_, 1, biases_, 1
             );
 
 #endif
@@ -464,7 +518,6 @@ namespace Eval::NNUE {
             target_layer_(target_layer),
             biases_(),
             weights_(),
-            biases_diff_(),
             momentum_(0.2),
             learning_rate_scale_(1.0) {
 
@@ -502,16 +555,8 @@ namespace Eval::NNUE {
         }
 
         void reset_stats() {
-            min_pre_activation_ = std::numeric_limits<LearnFloatType>::max();
-            max_pre_activation_ = std::numeric_limits<LearnFloatType>::lowest();
-
-            std::fill(std::begin(min_activations_), std::end(min_activations_),
-                      std::numeric_limits<LearnFloatType>::max());
-            std::fill(std::begin(max_activations_), std::end(max_activations_),
-                      std::numeric_limits<LearnFloatType>::lowest());
-
-            num_clipped_ = 0;
-            num_total_ = 0;
+            for (auto& state : thread_stat_states_)
+                state.reset();
         }
 
         // read parameterized integer
@@ -528,9 +573,10 @@ namespace Eval::NNUE {
                     target_layer_->weights_[i] / kWeightScale);
             }
 
-            std::fill(std::begin(biases_diff_), std::end(biases_diff_), +kZero);
-
             reset_stats();
+
+            for (auto& state : thread_bias_states_)
+                state.reset();
         }
 
         // Set the weight corresponding to the feature that does not appear in the learning data to 0
@@ -552,10 +598,14 @@ namespace Eval::NNUE {
                 std::numeric_limits<typename LayerType::WeightType>::max() /
                 kWeightScale;
 
+            reduce_thread_stat_state();
+
+            auto& main_thread_state = thread_stat_states_[0];
+
             const auto largest_min_activation = *std::max_element(
-                std::begin(min_activations_), std::end(min_activations_));
+                std::begin(main_thread_state.min_activations_), std::end(main_thread_state.min_activations_));
             const auto smallest_max_activation = *std::min_element(
-                std::begin(max_activations_), std::end(max_activations_));
+                std::begin(main_thread_state.max_activations_), std::end(main_thread_state.max_activations_));
 
             double abs_bias_sum = 0.0;
             double abs_weight_sum = 0.0;
@@ -578,8 +628,8 @@ namespace Eval::NNUE {
                 << std::endl;
 
             out << "  - (min, max) of pre-activations = "
-                << min_pre_activation_ << ", "
-                << max_pre_activation_ << " (limit = "
+                << main_thread_state.min_pre_activation_ << ", "
+                << main_thread_state.max_pre_activation_ << " (limit = "
                 << kPreActivationLimit << ")"
                 << std::endl;
 
@@ -590,7 +640,7 @@ namespace Eval::NNUE {
             out << "  - avg_abs_bias   = " << abs_bias_sum / std::size(biases_) << std::endl;
             out << "  - avg_abs_weight = " << abs_weight_sum / std::size(weights_) << std::endl;
 
-            out << "  - clipped " << static_cast<double>(num_clipped_) / num_total_ * 100.0 << "% of outputs"
+            out << "  - clipped " << static_cast<double>(main_thread_state.num_clipped_) / main_thread_state.num_total_ * 100.0 << "% of outputs"
                 << std::endl;
 
             out.unlock();
@@ -620,7 +670,6 @@ namespace Eval::NNUE {
         // layer to learn
         LayerType* const target_layer_;
 
-        IndexType num_clipped_;
         IndexType num_total_;
 
         // parameter
@@ -629,7 +678,6 @@ namespace Eval::NNUE {
             LearnFloatType weights_[kHalfDimensions * kInputDimensions];
 
         // Buffer used for updating parameters
-        alignas(kCacheLineSize) LearnFloatType biases_diff_[kHalfDimensions];
         std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
 
         // Forward propagation buffer
@@ -643,11 +691,73 @@ namespace Eval::NNUE {
         LearnFloatType momentum_;
         LearnFloatType learning_rate_scale_;
 
-        // Health check statistics
-        LearnFloatType min_pre_activation_;
-        LearnFloatType max_pre_activation_;
-        alignas(kCacheLineSize) LearnFloatType min_activations_[kHalfDimensions];
-        alignas(kCacheLineSize) LearnFloatType max_activations_[kHalfDimensions];
+        struct alignas(kCacheLineSize) ThreadStatState
+        {
+            alignas(kCacheLineSize) LearnFloatType min_activations_[kHalfDimensions];
+            alignas(kCacheLineSize) LearnFloatType max_activations_[kHalfDimensions];
+            LearnFloatType min_pre_activation_;
+            LearnFloatType max_pre_activation_;
+            IndexType num_clipped_;
+            IndexType num_total_;
+
+            ThreadStatState() { reset(); }
+
+            ThreadStatState& operator+=(const ThreadStatState& other)
+            {
+                for (IndexType i = 0; i < kHalfDimensions; ++i)
+                {
+                    min_activations_[i] = std::min(min_activations_[i], other.min_activations_[i]);
+                }
+
+                for (IndexType i = 0; i < kHalfDimensions; ++i)
+                {
+                    max_activations_[i] = std::max(max_activations_[i], other.max_activations_[i]);
+                }
+
+                min_pre_activation_ = std::min(min_pre_activation_, other.min_pre_activation_);
+                max_pre_activation_ = std::max(max_pre_activation_, other.max_pre_activation_);
+
+                num_clipped_ += other.num_clipped_;
+                num_total_ += other.num_total_;
+
+                return *this;
+            }
+
+            void reset()
+            {
+                std::fill(std::begin(min_activations_), std::end(min_activations_), std::numeric_limits<float>::max());
+                std::fill(std::begin(max_activations_), std::end(max_activations_), std::numeric_limits<float>::lowest());
+                min_pre_activation_ = std::numeric_limits<float>::max();
+                max_pre_activation_ = std::numeric_limits<float>::lowest();
+                num_clipped_ = 0;
+                num_total_ = 0;
+            }
+        };
+
+        struct alignas(kCacheLineSize) ThreadBiasState
+        {
+            alignas(kCacheLineSize) LearnFloatType biases_diff_[kHalfDimensions];
+
+            ThreadBiasState() { reset(); }
+
+            ThreadBiasState& operator+=(const ThreadBiasState& other)
+            {
+                for (IndexType i = 0; i < kHalfDimensions; ++i)
+                {
+                    biases_diff_[i] += other.biases_diff_[i];
+                }
+
+                return *this;
+            }
+
+            void reset()
+            {
+                std::fill(std::begin(biases_diff_), std::end(biases_diff_), 0.0f);
+            }
+        };
+
+        std::vector<ThreadStatState, CacheLineAlignedAllocator<ThreadStatState>> thread_stat_states_;
+        std::vector<ThreadBiasState, CacheLineAlignedAllocator<ThreadBiasState>> thread_bias_states_;
     };
 
 }  // namespace Eval::NNUE

From 1c8495b54b7b5c52d33492f458b829f18fe61460 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 25 Nov 2020 20:37:38 +0100
Subject: [PATCH 364/398] Remove handwritten saxpy because compilers optimize
 the second look anyway.

---
 src/extra/stockfish_blas.cpp                  | 45 +------------------
 .../trainer/trainer_feature_transformer.h     |  6 +--
 2 files changed, 5 insertions(+), 46 deletions(-)

diff --git a/src/extra/stockfish_blas.cpp b/src/extra/stockfish_blas.cpp
index 109a4b44..2bf28b8f 100644
--- a/src/extra/stockfish_blas.cpp
+++ b/src/extra/stockfish_blas.cpp
@@ -178,53 +178,11 @@ namespace Blas {
     )
     {
 
-#if defined (USE_SSE2)
-
-        const __m128 alpha4 = _mm_set1_ps(alpha);
-
-        int i = 0;
-        for(; i < N - 15; i += 16)
-        {
-            __m128 x0 = _mm_loadu_ps(X + i +  0);
-            __m128 x1 = _mm_loadu_ps(X + i +  4);
-            __m128 x2 = _mm_loadu_ps(X + i +  8);
-            __m128 x3 = _mm_loadu_ps(X + i + 12);
-
-            __m128 y0 = _mm_loadu_ps(Y + i +  0);
-            __m128 y1 = _mm_loadu_ps(Y + i +  4);
-            __m128 y2 = _mm_loadu_ps(Y + i +  8);
-            __m128 y3 = _mm_loadu_ps(Y + i + 12);
-
-            x0 = _mm_mul_ps(x0, alpha4);
-            x1 = _mm_mul_ps(x1, alpha4);
-            x2 = _mm_mul_ps(x2, alpha4);
-            x3 = _mm_mul_ps(x3, alpha4);
-
-            x0 = _mm_add_ps(x0, y0);
-            x1 = _mm_add_ps(x1, y1);
-            x2 = _mm_add_ps(x2, y2);
-            x3 = _mm_add_ps(x3, y3);
-
-            _mm_storeu_ps(Y + i +  0, x0);
-            _mm_storeu_ps(Y + i +  4, x1);
-            _mm_storeu_ps(Y + i +  8, x2);
-            _mm_storeu_ps(Y + i + 12, x3);
-        }
-
-        for(; i < N; ++i)
-        {
-            Y[i] += X[i] * alpha;
-        }
-
-#else
-
         for(int i = 0; i < N; ++i)
         {
             Y[i] += X[i] * alpha;
         }
 
-#endif
-
     }
 
     void saxpy(
@@ -564,7 +522,8 @@ namespace Blas {
         const __m128 alpha4 = _mm_set1_ps(alpha);
         const __m128 beta4 = _mm_set1_ps(beta);
 
-        for (int m = 0; m < M - 1; m += 2)
+        int m = 0;
+        for (; m < M - 1; m += 2)
         {
             int n = 0;
             for (; n < N - 3; n += 4)
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 9686002f..78729064 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -164,7 +164,7 @@ namespace Eval::NNUE {
                         const IndexType weights_offset = kHalfDimensions * feature.get_index();
                         Blas::saxpy(
                             kHalfDimensions, (float)feature.get_count(),
-                            &weights_[weights_offset], 1, &output_[output_offset], 1
+                            &weights_[weights_offset], &output_[output_offset]
                         );
                     }
 
@@ -497,8 +497,8 @@ namespace Eval::NNUE {
 
                                 Blas::saxpy(
                                     kHalfDimensions, -scale,
-                                    &gradients_[output_offset], 1,
-                                    &weights_[weights_offset], 1
+                                    &gradients_[output_offset],
+                                    &weights_[weights_offset]
                                 );
 
 #endif

From 49b2dcb1f3db8ac8c7f9cfcf1abfcb64194ff700 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 25 Nov 2020 21:53:53 +0100
Subject: [PATCH 365/398] Preallocate memory for unique_features. Keep the
 training_features temporary buffer as a thread_local so we reuse the storage.

---
 src/nnue/evaluate_nnue_learner.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 644ac9a4..2f0a2122 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -157,8 +157,12 @@ namespace Eval::NNUE {
             active_indices[0].swap(active_indices[1]);
         }
 
+        static thread_local std::vector<TrainingFeature> s_training_features;
+        auto& training_features = s_training_features;
+
         for (const auto color : Colors) {
-            std::vector<TrainingFeature> training_features;
+            training_features.clear();
+
             for (const auto base_index : active_indices[color]) {
                 static_assert(Features::Factorizer<RawFeatures>::get_dimensions() <
                               (1 << TrainingFeature::kIndexBits), "");
@@ -169,6 +173,7 @@ namespace Eval::NNUE {
             std::sort(training_features.begin(), training_features.end());
 
             auto& unique_features = example.training_features[color];
+            unique_features.reserve(training_features.size());
             for (const auto& feature : training_features) {
                 if (!unique_features.empty() &&
                     feature.get_index() == unique_features.back().get_index()) {

From 8009973381f1064ea72e9533808166dd54a5445b Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 25 Nov 2020 22:13:11 +0100
Subject: [PATCH 366/398] Special case for alpha=1 in saxpy, slight performance
 increase.

---
 src/extra/stockfish_blas.cpp | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/extra/stockfish_blas.cpp b/src/extra/stockfish_blas.cpp
index 2bf28b8f..70b258bc 100644
--- a/src/extra/stockfish_blas.cpp
+++ b/src/extra/stockfish_blas.cpp
@@ -177,10 +177,19 @@ namespace Blas {
         float * SF_BLAS_RESTRICT Y
     )
     {
-
-        for(int i = 0; i < N; ++i)
+        if (alpha == 1.0f)
         {
-            Y[i] += X[i] * alpha;
+            for (int i = 0; i < N; ++i)
+            {
+                Y[i] += X[i];
+            }
+        }
+        else
+        {
+            for (int i = 0; i < N; ++i)
+            {
+                Y[i] += X[i] * alpha;
+            }
         }
 
     }

From e954b14196e129a6df6edf184006bbc4dff2177f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 25 Nov 2020 22:43:42 +0100
Subject: [PATCH 367/398] Prefetch weights for feature transformer backprop to
 shared cache.

---
 src/nnue/trainer/trainer_feature_transformer.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 78729064..fa0859ed 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -138,6 +138,7 @@ namespace Eval::NNUE {
             for (IndexType b = offset; b < offset + count; ++b)
             {
                 const IndexType batch_offset = kOutputDimensions * b;
+
                 for (IndexType c = 0; c < 2; ++c) {
                     const IndexType output_offset = batch_offset + kHalfDimensions * c;
 
@@ -459,10 +460,16 @@ namespace Eval::NNUE {
 
                     for (IndexType b = 0; b < batch_->size(); ++b) {
                         const IndexType batch_offset = kOutputDimensions * b;
+
                         for (IndexType c = 0; c < 2; ++c) {
                             const IndexType output_offset = batch_offset + kHalfDimensions * c;
                             for (const auto& feature : (*batch_)[b].training_features[c]) {
                                 const IndexType feature_index = feature.get_index();
+                                const IndexType weights_offset =
+                                    kHalfDimensions * feature_index;
+#if defined (USE_SSE2)
+                                _mm_prefetch(reinterpret_cast<const char*>(&weights_[weights_offset]), _MM_HINT_T2);
+#endif
 
                                 // We assign each bucket a continuous range of bits at least
                                 // of cache line size to prevent false sharing.
@@ -479,9 +486,6 @@ namespace Eval::NNUE {
                                 // (even a different cache line)
                                 observed_features.set(feature_index);
 
-                                const IndexType weights_offset =
-                                    kHalfDimensions * feature_index;
-
                                 const auto scale = static_cast<LearnFloatType>(
                                     effective_learning_rate / feature.get_count());
 

From 0bee8fef64f955f662386fc28cdde9da7536fd8e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 25 Nov 2020 22:59:34 +0100
Subject: [PATCH 368/398] Don't unnecessarily copy the batch part.

---
 src/nnue/evaluate_nnue_learner.cpp            | 25 +++++++++--------
 src/nnue/trainer/trainer_affine_transform.h   | 14 ++++++----
 src/nnue/trainer/trainer_clipped_relu.h       | 14 ++++++----
 .../trainer/trainer_feature_transformer.h     | 23 +++++++++------
 src/nnue/trainer/trainer_input_slice.h        | 28 +++++++++++--------
 5 files changed, 61 insertions(+), 43 deletions(-)

diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 2f0a2122..24ad2732 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -215,27 +215,28 @@ namespace Eval::NNUE {
         std::vector<double> gradient_norm_local(thread_pool.size(), 0.0);
 
         while (examples.size() >= batch_size) {
-            std::vector<Example> batch(examples.end() - batch_size, examples.end());
-            examples.resize(examples.size() - batch_size);
-
-            const auto network_output = trainer->step_start(thread_pool, batch);
-            std::vector<LearnFloatType> gradients(batch.size());
+            auto batch_begin = examples.end() - batch_size;
+            auto batch_end = examples.end();
+            auto size = batch_end - batch_begin;
+            const auto network_output = trainer->step_start(thread_pool, batch_begin, batch_end);
+            std::vector<LearnFloatType> gradients(size);
 
             thread_pool.for_each_index_chunk_with_workers(
-                std::size_t(0), batch.size(),
+                std::size_t(0), size,
                 [&](Thread& th, std::size_t offset, std::size_t count) {
                     const auto thread_id = th.thread_idx();
 
                     trainer->propagate(th, offset, count);
 
                     for (std::size_t b = offset; b < offset + count; ++b) {
+                        const auto& e = *(batch_begin + b);
                         const auto shallow = static_cast<Value>(round<std::int32_t>(
-                            batch[b].sign * network_output[b] * kPonanzaConstant));
-                        const auto discrete = batch[b].sign * batch[b].discrete_nn_eval;
-                        const auto& psv = batch[b].psv;
+                            e.sign * network_output[b] * kPonanzaConstant));
+                        const auto discrete = e.sign * e.discrete_nn_eval;
+                        const auto& psv = e.psv;
                         const double gradient =
-                            batch[b].sign * calc_grad(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
-                        gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
+                            e.sign * calc_grad(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
+                        gradients[b] = static_cast<LearnFloatType>(gradient * e.weight);
 
 
                         // The discrete eval will only be valid before first backpropagation,
@@ -256,6 +257,8 @@ namespace Eval::NNUE {
 
             trainer->step_end(thread_pool, learning_rate);
 
+            examples.resize(examples.size() - size);
+
             collect_stats = false;
         }
 
diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index f66f1a65..b6d70aa4 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -91,11 +91,13 @@ namespace Eval::NNUE {
             quantize_parameters();
         }
 
-        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
         {
-            if (output_.size() < kOutputDimensions * combined_batch.size()) {
-                output_.resize(kOutputDimensions * combined_batch.size());
-                gradients_.resize(kInputDimensions * combined_batch.size());
+            const auto size = batch_end - batch_begin;
+
+            if (output_.size() < kOutputDimensions * size) {
+                output_.resize(kOutputDimensions * size);
+                gradients_.resize(kInputDimensions * size);
             }
 
             if (thread_states_.size() < thread_pool.size())
@@ -103,8 +105,8 @@ namespace Eval::NNUE {
                 thread_states_.resize(thread_pool.size());
             }
 
-            combined_batch_size_ = static_cast<IndexType>(combined_batch.size());
-            combined_batch_input_ = previous_layer_trainer_->step_start(thread_pool, combined_batch);
+            combined_batch_size_ = size;
+            combined_batch_input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);
 
             auto& main_thread_state = thread_states_[0];
 
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index e4bcecaf..eae35df6 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -42,11 +42,13 @@ namespace Eval::NNUE {
             previous_layer_trainer_->initialize(rng);
         }
 
-        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
         {
-            if (output_.size() < kOutputDimensions * combined_batch.size()) {
-              output_.resize(kOutputDimensions * combined_batch.size());
-              gradients_.resize(kInputDimensions * combined_batch.size());
+            const auto size = batch_end - batch_begin;
+
+            if (output_.size() < kOutputDimensions * size) {
+              output_.resize(kOutputDimensions * size);
+              gradients_.resize(kInputDimensions * size);
             }
 
             if (thread_states_.size() < thread_pool.size())
@@ -54,9 +56,9 @@ namespace Eval::NNUE {
                 thread_states_.resize(thread_pool.size());
             }
 
-            input_ = previous_layer_trainer_->step_start(thread_pool, combined_batch);
+            input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);
 
-            batch_size_ = static_cast<IndexType>(combined_batch.size());
+            batch_size_ = size;
 
             return output_.data();
         }
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index fa0859ed..65766b05 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -89,11 +89,13 @@ namespace Eval::NNUE {
             quantize_parameters();
         }
 
-        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
         {
-            if (output_.size() < kOutputDimensions * combined_batch.size()) {
-                output_.resize(kOutputDimensions * combined_batch.size());
-                gradients_.resize(kOutputDimensions * combined_batch.size());
+            const auto size = batch_end - batch_begin;
+
+            if (output_.size() < kOutputDimensions * size) {
+                output_.resize(kOutputDimensions * size);
+                gradients_.resize(kOutputDimensions * size);
             }
 
             if (thread_stat_states_.size() < thread_pool.size())
@@ -106,7 +108,8 @@ namespace Eval::NNUE {
                 thread_bias_states_.resize(thread_pool.size());
             }
 
-            batch_ = &combined_batch;
+            batch_ = &*batch_begin;
+            batch_size_ = size;
 
             auto& main_thread_bias_state = thread_bias_states_[0];
 
@@ -161,7 +164,7 @@ namespace Eval::NNUE {
                     Blas::scopy(
                         kHalfDimensions, biases_, 1, &output_[output_offset], 1
                     );
-                    for (const auto& feature : (*batch_)[b].training_features[c]) {
+                    for (const auto& feature : batch_[b].training_features[c]) {
                         const IndexType weights_offset = kHalfDimensions * feature.get_index();
                         Blas::saxpy(
                             kHalfDimensions, (float)feature.get_count(),
@@ -458,12 +461,12 @@ namespace Eval::NNUE {
                 [&, num_threads = thread_pool.size()](Thread& th) {
                     const auto thread_index = th.thread_idx();
 
-                    for (IndexType b = 0; b < batch_->size(); ++b) {
+                    for (IndexType b = 0; b < batch_size_; ++b) {
                         const IndexType batch_offset = kOutputDimensions * b;
 
                         for (IndexType c = 0; c < 2; ++c) {
                             const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                            for (const auto& feature : (*batch_)[b].training_features[c]) {
+                            for (const auto& feature : batch_[b].training_features[c]) {
                                 const IndexType feature_index = feature.get_index();
                                 const IndexType weights_offset =
                                     kHalfDimensions * feature_index;
@@ -519,6 +522,7 @@ namespace Eval::NNUE {
         // constructor
         Trainer(LayerType* target_layer) :
             batch_(nullptr),
+            batch_size_(0),
             target_layer_(target_layer),
             biases_(),
             weights_(),
@@ -669,7 +673,8 @@ namespace Eval::NNUE {
         static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
 
         // mini batch
-        const std::vector<Example>* batch_;
+        const Example* batch_;
+        IndexType batch_size_;
 
         // layer to learn
         LayerType* const target_layer_;
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index 54f03d42..ad681d57 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -63,9 +63,12 @@ namespace Eval::NNUE {
             }
         }
 
-        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch) {
-            if (gradients_.size() < kInputDimensions * combined_batch.size()) {
-                gradients_.resize(kInputDimensions * combined_batch.size());
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
+        {
+            const auto size = batch_end - batch_begin;
+            
+            if (gradients_.size() < kInputDimensions * size) {
+                gradients_.resize(kInputDimensions * size);
             }
 
             if (num_calls_.size() < thread_pool.size())
@@ -73,11 +76,11 @@ namespace Eval::NNUE {
                 num_calls_.resize(thread_pool.size(), 0);
             }
 
-            batch_size_ = static_cast<IndexType>(combined_batch.size());
+            batch_size_ = size;
 
             if (num_calls_[0] == 0) {
                 current_operation_ = Operation::kStepStart;
-                output_ = feature_transformer_trainer_->step_start(thread_pool, combined_batch);
+                output_ = feature_transformer_trainer_->step_start(thread_pool, batch_begin, batch_end);
             }
 
             assert(current_operation_ == Operation::kStepStart);
@@ -237,15 +240,18 @@ namespace Eval::NNUE {
             shared_input_trainer_->initialize(rng);
         }
 
-        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch) {
-            if (output_.size() < kOutputDimensions * combined_batch.size()) {
-              output_.resize(kOutputDimensions * combined_batch.size());
-              gradients_.resize(kInputDimensions * combined_batch.size());
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
+        {
+            const auto size = batch_end - batch_begin;
+
+            if (output_.size() < kOutputDimensions * size) {
+              output_.resize(kOutputDimensions * size);
+              gradients_.resize(kInputDimensions * size);
             }
 
-            batch_size_ = static_cast<IndexType>(combined_batch.size());
+            batch_size_ = size;
 
-            input_ = shared_input_trainer_->step_start(thread_pool, combined_batch);
+            input_ = shared_input_trainer_->step_start(thread_pool, batch_begin, batch_end);
 
             return output_.data();
         }

From 34510dd08a611762b9d826f9cd72cda72ad0ee13 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 25 Nov 2020 23:12:02 +0100
Subject: [PATCH 369/398] Remove used examples asyncronously.

---
 src/nnue/evaluate_nnue_learner.cpp | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 24ad2732..4a1a163d 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -214,9 +214,10 @@ namespace Eval::NNUE {
         std::vector<double> abs_discrete_eval_sum_local(thread_pool.size(), 0.0);
         std::vector<double> gradient_norm_local(thread_pool.size(), 0.0);
 
-        while (examples.size() >= batch_size) {
-            auto batch_begin = examples.end() - batch_size;
-            auto batch_end = examples.end();
+        auto prev_batch_begin = examples.end();
+        while (prev_batch_begin - examples.begin() >= batch_size) {
+            auto batch_begin = prev_batch_begin - batch_size;
+            auto batch_end = prev_batch_begin;
             auto size = batch_end - batch_begin;
             const auto network_output = trainer->step_start(thread_pool, batch_begin, batch_end);
             std::vector<LearnFloatType> gradients(size);
@@ -253,14 +254,20 @@ namespace Eval::NNUE {
                     trainer->backpropagate(th, gradients.data(), offset, count);
                 }
             );
+
+            // We can asyncronously erase the examples that we used in the previous
+            // step. This can be done safely because we're no longer using these
+            // examples and erase won't invalidate iterators.
+            examples.erase(prev_batch_begin, examples.end());
+            prev_batch_begin = batch_begin;
+
             thread_pool.wait_for_workers_finished();
 
             trainer->step_end(thread_pool, learning_rate);
 
-            examples.resize(examples.size() - size);
-
             collect_stats = false;
         }
+        examples.erase(prev_batch_begin, examples.end());
 
         if (verbose)
         {

From 622e0b14c280e89dd338064bce5d4e9c56eb0875 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 25 Nov 2020 23:13:06 +0100
Subject: [PATCH 370/398] Remove superfluous example shuffling. Shuffling now
 only happens on reading.

---
 src/nnue/evaluate_nnue_learner.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 4a1a163d..78446af2 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -202,7 +202,6 @@ namespace Eval::NNUE {
         learning_rate /= batch_size;
 
         std::lock_guard<std::mutex> lock(examples_mutex);
-        std::shuffle(examples.begin(), examples.end(), rng);
 
         double abs_eval_diff_sum = 0.0;
         double abs_discrete_eval_sum = 0.0;

From a97b65eaef85fc524e5455d2d78d36ca9675b08f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 11:04:10 +0100
Subject: [PATCH 371/398] Fix compilation error with USE_BLAS

---
 src/nnue/trainer/trainer_feature_transformer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 65766b05..877a74bc 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -151,7 +151,7 @@ namespace Eval::NNUE {
                         kHalfDimensions, biases_, 1, &output_[output_offset], 1
                     );
 
-                    for (const auto& feature : (*batch_)[b].training_features[c]) {
+                    for (const auto& feature : batch_[b].training_features[c]) {
                         const IndexType weights_offset = kHalfDimensions * feature.get_index();
                         cblas_saxpy(
                             kHalfDimensions, (float)feature.get_count(),

From 2aa7f5290e89db930dfbd038a6848da3ce43352d Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 17:31:57 +0100
Subject: [PATCH 372/398] Fix comparison of integers with different signedness.

---
 src/nnue/evaluate_nnue_learner.cpp             | 2 +-
 src/nnue/trainer/trainer_affine_transform.h    | 2 +-
 src/nnue/trainer/trainer_clipped_relu.h        | 2 +-
 src/nnue/trainer/trainer_feature_transformer.h | 2 +-
 src/nnue/trainer/trainer_input_slice.h         | 6 +++---
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 78446af2..6e0572dd 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -214,7 +214,7 @@ namespace Eval::NNUE {
         std::vector<double> gradient_norm_local(thread_pool.size(), 0.0);
 
         auto prev_batch_begin = examples.end();
-        while (prev_batch_begin - examples.begin() >= batch_size) {
+        while ((long)(prev_batch_begin - examples.begin()) >= (long)batch_size) {
             auto batch_begin = prev_batch_begin - batch_size;
             auto batch_end = prev_batch_begin;
             auto size = batch_end - batch_begin;
diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index b6d70aa4..53e8f904 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -95,7 +95,7 @@ namespace Eval::NNUE {
         {
             const auto size = batch_end - batch_begin;
 
-            if (output_.size() < kOutputDimensions * size) {
+            if ((long)output_.size() < (long)kOutputDimensions * size) {
                 output_.resize(kOutputDimensions * size);
                 gradients_.resize(kInputDimensions * size);
             }
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index eae35df6..ff883afc 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -46,7 +46,7 @@ namespace Eval::NNUE {
         {
             const auto size = batch_end - batch_begin;
 
-            if (output_.size() < kOutputDimensions * size) {
+            if ((long)output_.size() < (long)kOutputDimensions * size) {
               output_.resize(kOutputDimensions * size);
               gradients_.resize(kInputDimensions * size);
             }
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 877a74bc..9afda728 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -93,7 +93,7 @@ namespace Eval::NNUE {
         {
             const auto size = batch_end - batch_begin;
 
-            if (output_.size() < kOutputDimensions * size) {
+            if ((long)output_.size() < (long)kOutputDimensions * size) {
                 output_.resize(kOutputDimensions * size);
                 gradients_.resize(kOutputDimensions * size);
             }
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index ad681d57..a94cae93 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -66,8 +66,8 @@ namespace Eval::NNUE {
         const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
         {
             const auto size = batch_end - batch_begin;
-            
-            if (gradients_.size() < kInputDimensions * size) {
+
+            if ((long)gradients_.size() < (long)kInputDimensions * size) {
                 gradients_.resize(kInputDimensions * size);
             }
 
@@ -244,7 +244,7 @@ namespace Eval::NNUE {
         {
             const auto size = batch_end - batch_begin;
 
-            if (output_.size() < kOutputDimensions * size) {
+            if ((long)output_.size() < (long)kOutputDimensions * size) {
               output_.resize(kOutputDimensions * size);
               gradients_.resize(kInputDimensions * size);
             }

From 1322a9a5fd5bc0d085c584237d0f4b70b7b4d56e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 21:26:06 +0100
Subject: [PATCH 373/398] Prevent false sharing of num_calls counter in the
 shared input trainer. Fix current_operation not being local to the executing
 thread.

---
 src/nnue/trainer/trainer_input_slice.h | 117 +++++++++++++++----------
 1 file changed, 73 insertions(+), 44 deletions(-)

diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index a94cae93..62a761a7 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -15,6 +15,19 @@
 namespace Eval::NNUE {
 
     // Learning: Input layer
+    // This is tricky. It exists because when there's more than one trainer
+    // on top of a single feature transformer we want to only call propagate/backpropagate
+    // on the feature transformer once. This is straightforward in the old
+    // multithreading case, because propagate/backpropagate is called just once from the
+    // main thread. But with the current implementation of coarser multithreading
+    // we end up calling each method from each thread. Therefore we have to keep
+    // the num_calls and current_operation per thread basis, each thread must work
+    // on its designated batch slice, and the only synchronization points are
+    // step_start and step_end - for which we use state of the first thread.
+    // Each thread requires their own bookkeeping because it's possible that
+    // one thread is still in propagate of some batch slice while the other thread
+    // is doing backpropagate of some other slice. We also ensure the thread state
+    // isn't suspectible to false sharing by using a full cache line for the state.
     class SharedInputTrainer {
     public:
         // factory function
@@ -34,32 +47,36 @@ namespace Eval::NNUE {
 
         // Set options such as hyperparameters
         void send_message(Message* message) {
-            if (num_calls_[0] == 0) {
-                current_operation_ = Operation::kSendMessage;
+            auto& thread_state = thread_states_[0];
+
+            if (thread_state.num_calls == 0) {
+                thread_state.current_operation = Operation::kSendMessage;
                 feature_transformer_trainer_->send_message(message);
             }
 
-            assert(current_operation_ == Operation::kSendMessage);
+            assert(thread_state.current_operation == Operation::kSendMessage);
 
-            if (++num_calls_[0] == num_referrers_) {
-                num_calls_[0] = 0;
-                current_operation_ = Operation::kNone;
+            if (++thread_state.num_calls == num_referrers_) {
+                thread_state.num_calls = 0;
+                thread_state.current_operation = Operation::kNone;
             }
         }
 
         // Initialize the parameters with random numbers
         template <typename RNG>
         void initialize(RNG& rng) {
-            if (num_calls_[0] == 0) {
-                current_operation_ = Operation::kInitialize;
+            auto& thread_state = thread_states_[0];
+
+            if (thread_state.num_calls == 0) {
+                thread_state.current_operation = Operation::kInitialize;
                 feature_transformer_trainer_->initialize(rng);
             }
 
-            assert(current_operation_ == Operation::kInitialize);
+            assert(thread_state.current_operation == Operation::kInitialize);
 
-            if (++num_calls_[0] == num_referrers_) {
-                num_calls_[0] = 0;
-                current_operation_ = Operation::kNone;
+            if (++thread_state.num_calls == num_referrers_) {
+                thread_state.num_calls = 0;
+                thread_state.current_operation = Operation::kNone;
             }
         }
 
@@ -71,23 +88,25 @@ namespace Eval::NNUE {
                 gradients_.resize(kInputDimensions * size);
             }
 
-            if (num_calls_.size() < thread_pool.size())
+            if (thread_states_.size() < thread_pool.size())
             {
-                num_calls_.resize(thread_pool.size(), 0);
+                thread_states_.resize(thread_pool.size());
             }
 
             batch_size_ = size;
 
-            if (num_calls_[0] == 0) {
-                current_operation_ = Operation::kStepStart;
+            auto& thread_state = thread_states_[0];
+
+            if (thread_state.num_calls == 0) {
+                thread_state.current_operation = Operation::kStepStart;
                 output_ = feature_transformer_trainer_->step_start(thread_pool, batch_begin, batch_end);
             }
 
-            assert(current_operation_ == Operation::kStepStart);
+            assert(thread_state.current_operation == Operation::kStepStart);
 
-            if (++num_calls_[0] == num_referrers_) {
-                num_calls_[0] = 0;
-                current_operation_ = Operation::kNone;
+            if (++thread_state.num_calls == num_referrers_) {
+                thread_state.num_calls = 0;
+                thread_state.current_operation = Operation::kNone;
             }
 
             return output_;
@@ -97,16 +116,18 @@ namespace Eval::NNUE {
         void propagate(Thread& th, uint64_t offset, uint64_t count) {
             const auto thread_id = th.thread_idx();
 
-            if (num_calls_[thread_id] == 0) {
-                current_operation_ = Operation::kPropagate;
+            auto& thread_state = thread_states_[thread_id];
+
+            if (thread_state.num_calls == 0) {
+                thread_state.current_operation = Operation::kPropagate;
                 feature_transformer_trainer_->propagate(th, offset, count);
             }
 
-            assert(current_operation_ == Operation::kPropagate);
+            assert(thread_state.current_operation == Operation::kPropagate);
 
-            if (++num_calls_[thread_id] == num_referrers_) {
-                num_calls_[thread_id] = 0;
-                current_operation_ = Operation::kNone;
+            if (++thread_state.num_calls == num_referrers_) {
+                thread_state.num_calls = 0;
+                thread_state.current_operation = Operation::kNone;
             }
         }
 
@@ -118,13 +139,15 @@ namespace Eval::NNUE {
 
             const auto thread_id = th.thread_idx();
 
+            auto& thread_state = thread_states_[thread_id];
+
             if (num_referrers_ == 1) {
                 feature_transformer_trainer_->backpropagate(th, gradients, offset, count);
                 return;
             }
 
-            if (num_calls_[thread_id] == 0) {
-                current_operation_ = Operation::kBackPropagate;
+            if (thread_state.num_calls == 0) {
+                thread_state.current_operation = Operation::kBackPropagate;
                 for (IndexType b = offset; b < offset + count; ++b) {
                     const IndexType batch_offset = kInputDimensions * b;
                     for (IndexType i = 0; i < kInputDimensions; ++i) {
@@ -133,7 +156,7 @@ namespace Eval::NNUE {
                 }
             }
 
-            assert(current_operation_ == Operation::kBackPropagate);
+            assert(thread_state.current_operation == Operation::kBackPropagate);
 
             for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType batch_offset = kInputDimensions * b;
@@ -142,25 +165,27 @@ namespace Eval::NNUE {
                 }
             }
 
-            if (++num_calls_[thread_id] == num_referrers_) {
+            if (++thread_state.num_calls == num_referrers_) {
                 feature_transformer_trainer_->backpropagate(
                     th, gradients_.data(), offset, count);
-                num_calls_[thread_id] = 0;
-                current_operation_ = Operation::kNone;
+                thread_state.num_calls = 0;
+                thread_state.current_operation = Operation::kNone;
             }
         }
 
         void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
-            if (num_calls_[0] == 0) {
-                current_operation_ = Operation::kStepEnd;
+            auto& thread_state = thread_states_[0];
+
+            if (thread_state.num_calls == 0) {
+                thread_state.current_operation = Operation::kStepEnd;
                 feature_transformer_trainer_->step_end(thread_pool, learning_rate);
             }
 
-            assert(current_operation_ == Operation::kStepEnd);
+            assert(thread_state.current_operation == Operation::kStepEnd);
 
-            if (++num_calls_[0] == num_referrers_) {
-                num_calls_[0] = 0;
-                current_operation_ = Operation::kNone;
+            if (++thread_state.num_calls == num_referrers_) {
+                thread_state.num_calls = 0;
+                thread_state.current_operation = Operation::kNone;
             }
         }
 
@@ -169,8 +194,7 @@ namespace Eval::NNUE {
         SharedInputTrainer(FeatureTransformer* ft) :
             batch_size_(0),
             num_referrers_(0),
-            num_calls_(1, 0),
-            current_operation_(Operation::kNone),
+            thread_states_(1),
             feature_transformer_trainer_(Trainer<FeatureTransformer>::create(
                 ft)),
             output_(nullptr) {
@@ -197,11 +221,16 @@ namespace Eval::NNUE {
         // number of layers sharing this layer as input
         std::uint32_t num_referrers_;
 
-        // Number of times the current process has been called
-        std::vector<std::uint32_t> num_calls_;
+        struct alignas(kCacheLineSize) ThreadState
+        {
+            std::uint32_t num_calls{0};
 
-        // current processing type
-        Operation current_operation_;
+            // current processing type
+            Operation current_operation = Operation::kNone;
+        };
+
+        // Number of times the current process has been called
+        std::vector<ThreadState, CacheLineAlignedAllocator<ThreadState>> thread_states_;
 
         // Trainer of input feature converter
         const std::shared_ptr<Trainer<FeatureTransformer>>

From 6ce0245787c1111aa2d014b2d26fc8220da3adae Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 01:37:07 +0100
Subject: [PATCH 374/398] Basic autograd

---
 src/learn/autograd.h | 350 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 350 insertions(+)
 create mode 100644 src/learn/autograd.h

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
new file mode 100644
index 00000000..8a4df2ab
--- /dev/null
+++ b/src/learn/autograd.h
@@ -0,0 +1,350 @@
+#ifndef LEARNER_AUTOGRAD_H
+#define LEARNER_AUTOGRAD_H
+
+#include <cmath>
+#include <utility>
+#include <type_traits>
+#include <memory>
+#include <tuple>
+
+namespace Learner::Autograd::UnivariateStatic
+{
+
+    template <typename T>
+    struct Identity
+    {
+        using type = T;
+    };
+
+    template <typename T>
+    using Id = typename Identity<T>::type;
+
+    template <typename T, int I>
+    struct VariableParameter
+    {
+        using ValueType = T;
+
+        VariableParameter()
+        {
+        }
+
+        template <typename... ArgsTs>
+        T value(const std::tuple<ArgsTs...>& args) const
+        {
+            return std::get<I>(args);
+        }
+
+        template <typename... ArgsTs>
+        T grad(const std::tuple<ArgsTs...>&) const
+        {
+            return T(1.0);
+        }
+    };
+
+    template <typename T, int I>
+    struct ConstantParameter
+    {
+        using ValueType = T;
+
+        ConstantParameter()
+        {
+        }
+
+        template <typename... ArgsTs>
+        T value(const std::tuple<ArgsTs...>& args) const
+        {
+            return std::get<I>(args);
+        }
+
+        template <typename... ArgsTs>
+        T grad(const std::tuple<ArgsTs...>&) const
+        {
+            return T(0.0);
+        }
+    };
+
+    template <typename T>
+    struct Constant
+    {
+        using ValueType = T;
+
+        Constant(T x) :
+            m_x(std::move(x))
+        {
+        }
+
+        template <typename... ArgsTs>
+        T value(const std::tuple<ArgsTs...>&) const
+        {
+            return m_x;
+        }
+
+        template <typename... ArgsTs>
+        T grad(const std::tuple<ArgsTs...>&) const
+        {
+            return T(0.0);
+        }
+
+    private:
+        T m_x;
+    };
+
+    template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
+    struct Sum
+    {
+        using ValueType = T;
+
+        Sum(LhsT lhs, RhsT rhs) :
+            m_lhs(std::move(lhs)),
+            m_rhs(std::move(rhs))
+        {
+        }
+
+        template <typename... ArgsTs>
+        T value(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_lhs.value(args) + m_rhs.value(args);
+        }
+
+        template <typename... ArgsTs>
+        T grad(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_lhs.grad(args) + m_rhs.grad(args);
+        }
+
+    private:
+        LhsT m_lhs;
+        RhsT m_rhs;
+    };
+
+    template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
+    auto operator+(LhsT lhs, RhsT rhs)
+    {
+        return Sum(std::move(lhs), std::move(rhs));
+    }
+
+    template <typename LhsT, typename T = typename LhsT::ValueType>
+    auto operator+(LhsT lhs, Id<T> rhs)
+    {
+        return Sum(std::move(lhs), Constant(rhs));
+    }
+
+    template <typename RhsT, typename T = typename RhsT::ValueType>
+    auto operator+(Id<T> lhs, RhsT rhs)
+    {
+        return Sum(Constant(lhs), std::move(rhs));
+    }
+
+    template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
+    struct Difference
+    {
+        using ValueType = T;
+
+        Difference(LhsT lhs, RhsT rhs) :
+            m_lhs(std::move(lhs)),
+            m_rhs(std::move(rhs))
+        {
+        }
+
+        template <typename... ArgsTs>
+        T value(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_lhs.value(args) - m_rhs.value(args);
+        }
+
+        template <typename... ArgsTs>
+        T grad(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_lhs.grad(args) - m_rhs.grad(args);
+        }
+
+    private:
+        LhsT m_lhs;
+        RhsT m_rhs;
+    };
+
+    template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
+    auto operator-(LhsT lhs, RhsT rhs)
+    {
+        return Difference(std::move(lhs), std::move(rhs));
+    }
+
+    template <typename LhsT, typename T = typename LhsT::ValueType>
+    auto operator-(LhsT lhs, Id<T> rhs)
+    {
+        return Difference(std::move(lhs), Constant(rhs));
+    }
+
+    template <typename RhsT, typename T = typename RhsT::ValueType>
+    auto operator-(Id<T> lhs, RhsT rhs)
+    {
+        return Difference(Constant(lhs), std::move(rhs));
+    }
+
+    template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
+    struct Product
+    {
+        using ValueType = T;
+
+        Product(LhsT lhs, RhsT rhs) :
+            m_lhs(std::move(lhs)),
+            m_rhs(std::move(rhs))
+        {
+        }
+
+        template <typename... ArgsTs>
+        T value(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_lhs.value(args) * m_rhs.value(args);
+        }
+
+        template <typename... ArgsTs>
+        T grad(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_lhs.grad(args) * m_rhs.value(args) + m_lhs.value(args) * m_rhs.grad(args);
+        }
+
+    private:
+        LhsT m_lhs;
+        RhsT m_rhs;
+    };
+
+    template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
+    auto operator*(LhsT lhs, RhsT rhs)
+    {
+        return Product(std::move(lhs), std::move(rhs));
+    }
+
+    template <typename LhsT, typename T = typename LhsT::ValueType>
+    auto operator*(LhsT lhs, Id<T> rhs)
+    {
+        return Product(std::move(lhs), Constant(rhs));
+    }
+
+    template <typename RhsT, typename T = typename RhsT::ValueType>
+    auto operator*(Id<T> lhs, RhsT rhs)
+    {
+        return Product(Constant(lhs), std::move(rhs));
+    }
+
+    template <typename ArgT, typename T = typename ArgT::ValueType>
+    struct Sigmoid
+    {
+        using ValueType = T;
+
+        explicit Sigmoid(ArgT x) :
+            m_x(std::move(x))
+        {
+        }
+
+        template <typename... ArgsTs>
+        T value(const std::tuple<ArgsTs...>& args) const
+        {
+            return value_(m_x.value(args));
+        }
+
+        template <typename... ArgsTs>
+        T grad(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_x.grad(args) * grad_(m_x.value(args));
+        }
+
+    private:
+        ArgT m_x;
+
+        T value_(T x) const
+        {
+            return 1.0 / (1.0 + std::exp(-x));
+        }
+
+        T grad_(T x) const
+        {
+            return value_(x) * (1.0 - value_(x));
+        }
+    };
+
+    template <typename ArgT>
+    auto sigmoid(ArgT x)
+    {
+        return Sigmoid(std::move(x));
+    }
+
+    template <typename ArgT, typename T = typename ArgT::ValueType>
+    struct Pow
+    {
+        using ValueType = T;
+
+        explicit Pow(ArgT x, Id<T> exponent) :
+            m_x(std::move(x)),
+            m_exponent(std::move(exponent))
+        {
+        }
+
+        template <typename... ArgsTs>
+        T value(const std::tuple<ArgsTs...>& args) const
+        {
+            return std::pow(m_x.value(args), m_exponent);
+        }
+
+        template <typename... ArgsTs>
+        T grad(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_exponent * std::pow(m_x.value(args), m_exponent - T(1.0)) * m_x.grad(args);
+        }
+
+    private:
+        ArgT m_x;
+        T m_exponent;
+    };
+
+    template <typename ArgT, typename T = typename ArgT::ValueType>
+    auto pow(ArgT x, Id<T> exp)
+    {
+        return Pow(std::move(x), std::move(exp));
+    }
+
+    template <typename ArgT, typename T = typename ArgT::ValueType>
+    struct Log
+    {
+        using ValueType = T;
+
+        explicit Log(ArgT x) :
+            m_x(std::move(x))
+        {
+        }
+
+        template <typename... ArgsTs>
+        T value(const std::tuple<ArgsTs...>& args) const
+        {
+            return value_(m_x.value(args));
+        }
+
+        template <typename... ArgsTs>
+        T grad(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_x.grad(args) * grad_(m_x.value(args));
+        }
+
+    private:
+        ArgT m_x;
+
+        T value_(T x) const
+        {
+            return std::log(x);
+        }
+
+        T grad_(T x) const
+        {
+            return 1.0 / x;
+        }
+    };
+
+    template <typename ArgT>
+    auto log(ArgT x)
+    {
+        return Log(std::move(x));
+    }
+
+}
+
+#endif
\ No newline at end of file

From 541fb8177abfafdcbe23f0f98431a56e49dbae98 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 11:33:35 +0100
Subject: [PATCH 375/398] More utility in autograd.

---
 src/learn/autograd.h | 68 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 59 insertions(+), 9 deletions(-)

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
index 8a4df2ab..0b894cc4 100644
--- a/src/learn/autograd.h
+++ b/src/learn/autograd.h
@@ -7,6 +7,44 @@
 #include <memory>
 #include <tuple>
 
+namespace Learner
+{
+    template <typename T>
+    struct ValueWithGrad
+    {
+        T value;
+        T grad;
+
+        ValueWithGrad& operator+=(const ValueWithGrad<T>& rhs)
+        {
+            value += rhs.value;
+            grad += rhs.grad;
+            return *this;
+        }
+
+        ValueWithGrad& operator-=(const ValueWithGrad<T>& rhs)
+        {
+            value -= rhs.value;
+            grad -= rhs.grad;
+            return *this;
+        }
+
+        ValueWithGrad& operator*=(T rhs)
+        {
+            value *= rhs;
+            grad *= rhs;
+            return *this;
+        }
+
+        ValueWithGrad& operator/=(T rhs)
+        {
+            value /= rhs;
+            grad /= rhs;
+            return *this;
+        }
+    };
+}
+
 namespace Learner::Autograd::UnivariateStatic
 {
 
@@ -19,8 +57,20 @@ namespace Learner::Autograd::UnivariateStatic
     template <typename T>
     using Id = typename Identity<T>::type;
 
+    template <typename T>
+    struct Evaluable
+    {
+        template <typename... ArgsTs>
+        auto eval(const std::tuple<ArgsTs...>& args) const
+        {
+            using ValueType = typename T::ValueType;
+            const T* this_ = static_cast<const T*>(this);
+            return ValueWithGrad<ValueType>{ this_->value(args), this_->grad(args) };
+        }
+    };
+
     template <typename T, int I>
-    struct VariableParameter
+    struct VariableParameter : Evaluable<VariableParameter<T, I>>
     {
         using ValueType = T;
 
@@ -42,7 +92,7 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename T, int I>
-    struct ConstantParameter
+    struct ConstantParameter : Evaluable<ConstantParameter<T, I>>
     {
         using ValueType = T;
 
@@ -64,7 +114,7 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename T>
-    struct Constant
+    struct Constant : Evaluable<Constant<T>>
     {
         using ValueType = T;
 
@@ -90,7 +140,7 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
-    struct Sum
+    struct Sum : Evaluable<Sum<LhsT, RhsT, T>>
     {
         using ValueType = T;
 
@@ -136,7 +186,7 @@ namespace Learner::Autograd::UnivariateStatic
     }
 
     template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
-    struct Difference
+    struct Difference : Evaluable<Difference<LhsT, RhsT, T>>
     {
         using ValueType = T;
 
@@ -182,7 +232,7 @@ namespace Learner::Autograd::UnivariateStatic
     }
 
     template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
-    struct Product
+    struct Product : Evaluable<Product<LhsT, RhsT, T>>
     {
         using ValueType = T;
 
@@ -228,7 +278,7 @@ namespace Learner::Autograd::UnivariateStatic
     }
 
     template <typename ArgT, typename T = typename ArgT::ValueType>
-    struct Sigmoid
+    struct Sigmoid : Evaluable<Sigmoid<ArgT, T>>
     {
         using ValueType = T;
 
@@ -270,7 +320,7 @@ namespace Learner::Autograd::UnivariateStatic
     }
 
     template <typename ArgT, typename T = typename ArgT::ValueType>
-    struct Pow
+    struct Pow : Evaluable<Pow<ArgT, T>>
     {
         using ValueType = T;
 
@@ -304,7 +354,7 @@ namespace Learner::Autograd::UnivariateStatic
     }
 
     template <typename ArgT, typename T = typename ArgT::ValueType>
-    struct Log
+    struct Log : Evaluable<Log<ArgT, T>>
     {
         using ValueType = T;
 

From 5a58eb803a2c1b11808c96a1d8eb9c58a01d4791 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 11:55:00 +0100
Subject: [PATCH 376/398] Loss func with autograd

---
 src/learn/learn.cpp | 19 +++++++++++++++++--
 src/learn/learn.h   |  2 ++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index f7358f8e..411cee08 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -19,6 +19,7 @@
 
 #include "learn.h"
 
+#include "autograd.h"
 #include "sfen_reader.h"
 
 #include "misc.h"
@@ -320,6 +321,20 @@ namespace Learner
         return std::clamp(grad, -max_grad, max_grad);
     }
 
+    static ValueWithGrad<double> get_loss(Value shallow, Value teacher_signal, int result, int ply)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        auto q_ = sigmoid(VariableParameter<double, 0>{} * winning_probability_coefficient);
+        auto p_ = sigmoid(ConstantParameter<double, 1>{} * winning_probability_coefficient);
+        auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
+        auto lambda_ = ConstantParameter<double, 3>{};
+        auto loss_ = pow(lambda_ * (q_ - p_) + (1.0 - lambda_) * (q_ - t_), 2.0);
+
+        auto args = std::tuple((double)shallow, (double)teacher_signal, (double)result, calculate_lambda(teacher_signal));
+        return loss_.eval(args);
+    }
+
     // Calculate cross entropy during learning
     // The individual cross entropy of the win/loss term and win
     // rate term of the elmo expression is returned
@@ -702,7 +717,7 @@ namespace Learner
             {
                 goto RETRY_READ;
             }
-            
+
             // We want to position being trained on not to be terminal
             if (MoveList<LEGAL>(pos).size() == 0)
                 goto RETRY_READ;
@@ -720,7 +735,7 @@ namespace Learner
         // should be no real issues happening since
         // the read/write phases are isolated.
         atomic_thread_fence(memory_order_seq_cst);
-        Eval::NNUE::update_parameters(Threads, epoch, params.verbose, params.learning_rate, calc_grad);
+        Eval::NNUE::update_parameters(Threads, epoch, params.verbose, params.learning_rate, calc_grad, get_loss);
         atomic_thread_fence(memory_order_seq_cst);
 
         if (++save_count * params.mini_batch_size >= params.eval_save_interval)
diff --git a/src/learn/learn.h b/src/learn/learn.h
index 6ce476e5..f74fd4e3 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -33,6 +33,7 @@ using LearnFloatType = float;
 // Definition of struct used in Learner
 // ----------------------
 
+#include "autograd.h"
 #include "packed_sfen.h"
 
 #include "position.h"
@@ -68,6 +69,7 @@ namespace Learner
     void learn(std::istringstream& is);
 
     using CalcGradFunc = double(Value, Value, int, int);
+    using CalcLossFunc = ValueWithGrad<double>(Value, Value, int, int);
 }
 
 #endif // ifndef _LEARN_H_

From b71d1e86205505997106348afa7e359b9f6593c1 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 11:55:15 +0100
Subject: [PATCH 377/398] Pass the new loss function to update_parameters

---
 src/nnue/evaluate_nnue_learner.cpp | 5 ++++-
 src/nnue/evaluate_nnue_learner.h   | 3 ++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 6e0572dd..822c56b4 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -195,8 +195,11 @@ namespace Eval::NNUE {
         uint64_t epoch,
         bool verbose,
         double learning_rate,
-        Learner::CalcGradFunc calc_grad)
+        Learner::CalcGradFunc calc_grad,
+        Learner::CalcLossFunc calc_loss)
     {
+        using namespace Learner::Autograd::UnivariateStatic;
+
         assert(batch_size > 0);
 
         learning_rate /= batch_size;
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 8633f713..0fe8afce 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -38,7 +38,8 @@ namespace Eval::NNUE {
         uint64_t epoch,
         bool verbose,
         double learning_rate,
-        Learner::CalcGradFunc calc_grad);
+        Learner::CalcGradFunc calc_grad,
+        Learner::CalcLossFunc calc_loss);
 
     // Check if there are any problems with learning
     void check_health();

From 539bd2d1c8fdbe74cff0efc30a994f1fed7a08fe Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 12:18:02 +0100
Subject: [PATCH 378/398] Replace the old loss/grad calculation completely.

---
 src/learn/autograd.h               |   5 +
 src/learn/learn.cpp                | 295 +++++++----------------------
 src/learn/learn.h                  |   1 -
 src/nnue/evaluate_nnue_learner.cpp |   7 +-
 src/nnue/evaluate_nnue_learner.h   |   1 -
 5 files changed, 79 insertions(+), 230 deletions(-)

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
index 0b894cc4..f83d4d72 100644
--- a/src/learn/autograd.h
+++ b/src/learn/autograd.h
@@ -42,6 +42,11 @@ namespace Learner
             grad /= rhs;
             return *this;
         }
+
+        ValueWithGrad abs() const
+        {
+            return { std::abs(value), std::abs(grad) };
+        }
     };
 }
 
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 411cee08..e558b56a 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -52,6 +52,7 @@
 #include <sstream>
 #include <unordered_set>
 #include <iostream>
+#include <mutex>
 
 #if defined (_OPENMP)
 #include <omp.h>
@@ -99,65 +100,64 @@ namespace Learner
     // Using stockfish's WDL with win rate model instead of sigmoid
     static bool use_wdl = false;
 
-    namespace Detail {
-        template <bool AtomicV>
-        struct Loss
+    struct Loss
+    {
+        double value() const
         {
-            using T =
-                std::conditional_t<
-                    AtomicV,
-                    atomic<double>,
-                    double
-                >;
+            return m_loss.value;
+        }
 
-            T cross_entropy_eval{0.0};
-            T cross_entropy_win{0.0};
-            T cross_entropy{0.0};
-            T entropy_eval{0.0};
-            T entropy_win{0.0};
-            T entropy{0.0};
-            T count{0.0};
+        double grad() const
+        {
+            return m_loss.grad;
+        }
 
-            template <bool OtherAtomicV>
-            Loss& operator += (const Loss<OtherAtomicV>& rhs)
-            {
-                cross_entropy_eval += rhs.cross_entropy_eval;
-                cross_entropy_win += rhs.cross_entropy_win;
-                cross_entropy += rhs.cross_entropy;
-                entropy_eval += rhs.entropy_eval;
-                entropy_win += rhs.entropy_win;
-                entropy += rhs.entropy;
-                count += rhs.count;
+        uint64_t count() const
+        {
+            return m_count;
+        }
 
-                return *this;
-            }
+        Loss& operator += (const ValueWithGrad<double>& rhs)
+        {
+            std::unique_lock lock(m_mutex);
 
-            void reset()
-            {
-                cross_entropy_eval = 0.0;
-                cross_entropy_win = 0.0;
-                cross_entropy = 0.0;
-                entropy_eval = 0.0;
-                entropy_win = 0.0;
-                entropy = 0.0;
-                count = 0.0;
-            }
+            m_loss += rhs.abs();
+            m_count += 1;
 
-            template <typename StreamT>
-            void print(const std::string& prefix, StreamT& s) const
-            {
-                s << "  - " << prefix << "_cross_entropy_eval = " << cross_entropy_eval / count << endl;
-                s << "  - " << prefix << "_cross_entropy_win  = " << cross_entropy_win / count << endl;
-                s << "  - " << prefix << "_entropy_eval       = " << entropy_eval / count << endl;
-                s << "  - " << prefix << "_entropy_win        = " << entropy_win / count << endl;
-                s << "  - " << prefix << "_cross_entropy      = " << cross_entropy / count << endl;
-                s << "  - " << prefix << "_entropy            = " << entropy / count << endl;
-            }
-        };
-    }
+            return *this;
+        }
 
-    using Loss = Detail::Loss<false>;
-    using AtomicLoss = Detail::Loss<true>;
+        Loss& operator += (const Loss& rhs)
+        {
+            std::unique_lock lock(m_mutex);
+
+            m_loss += rhs.m_loss.abs();
+            m_count += rhs.m_count;
+
+            return *this;
+        }
+
+        void reset()
+        {
+            std::unique_lock lock(m_mutex);
+
+            m_loss = ValueWithGrad<double>{ 0.0, 0.0 };
+            m_count = 0;
+        }
+
+        template <typename StreamT>
+        void print(const std::string& prefix, StreamT& s) const
+        {
+            s << "  - " << prefix << "_loss       = " << m_loss.value / (double)m_count << endl;
+            s << "  - " << prefix << "_grad_norm  = " << m_loss.grad / (double)m_count << endl;
+        }
+
+    private:
+        ValueWithGrad<double> m_loss{ 0.0, 0.0 };
+        uint64_t m_count{0};
+        std::mutex m_mutex;
+
+    };
 
     static void append_files_from_dir(
         std::vector<std::string>& filenames,
@@ -185,94 +185,6 @@ namespace Learner
         }
     }
 
-    // A function that converts the evaluation value to the winning rate [0,1]
-    static double winning_percentage(double value)
-    {
-        // 1/(1+10^(-Eval/4))
-        // = 1/(1+e^(-Eval/4*ln(10))
-        // = sigmoid(Eval/4*ln(10))
-        return Math::sigmoid(value * winning_probability_coefficient);
-    }
-
-    // A function that converts the evaluation value to the winning rate [0,1]
-    static double winning_percentage_wdl(double value, int ply)
-    {
-        constexpr double wdl_total = 1000.0;
-        constexpr double draw_score = 0.5;
-
-        const double wdl_w = UCI::win_rate_model_double(value, ply);
-        const double wdl_l = UCI::win_rate_model_double(-value, ply);
-        const double wdl_d = wdl_total - wdl_w - wdl_l;
-
-        return (wdl_w + wdl_d * draw_score) / wdl_total;
-    }
-
-    // A function that converts the evaluation value to the winning rate [0,1]
-    static double winning_percentage(double value, int ply)
-    {
-        if (use_wdl)
-        {
-            return winning_percentage_wdl(value, ply);
-        }
-        else
-        {
-            return winning_percentage(value);
-        }
-    }
-
-    static double calc_cross_entropy_of_winning_percentage(
-        double deep_win_rate,
-        double shallow_eval,
-        int ply)
-    {
-        const double p = deep_win_rate;
-        const double q = winning_percentage(shallow_eval, ply);
-        return -p * std::log(q) - (1.0 - p) * std::log(1.0 - q);
-    }
-
-    static double calc_d_cross_entropy_of_winning_percentage(
-        double deep_win_rate,
-        double shallow_eval,
-        int ply)
-    {
-        constexpr double epsilon = 0.000001;
-
-        const double y1 = calc_cross_entropy_of_winning_percentage(
-            deep_win_rate, shallow_eval, ply);
-
-        const double y2 = calc_cross_entropy_of_winning_percentage(
-            deep_win_rate, shallow_eval + epsilon, ply);
-
-        // Divide by the winning_probability_coefficient to
-        // match scale with the sigmoidal win rate
-        return ((y2 - y1) / epsilon) / winning_probability_coefficient;
-    }
-
-    // Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
-    static double get_scaled_signal(double signal)
-    {
-        double scaled_signal = signal;
-
-        // Normalize to [0.0, 1.0].
-        scaled_signal =
-            (scaled_signal - src_score_min_value)
-            / (src_score_max_value - src_score_min_value);
-
-        // Scale to [dest_score_min_value, dest_score_max_value].
-        scaled_signal =
-            scaled_signal * (dest_score_max_value - dest_score_min_value)
-            + dest_score_min_value;
-
-        return scaled_signal;
-    }
-
-    // Teacher winning probability.
-    static double calculate_p(double teacher_signal, int ply)
-    {
-        const double scaled_teacher_signal = get_scaled_signal(teacher_signal);
-        return winning_percentage(scaled_teacher_signal, ply);
-    }
-
     static double calculate_lambda(double teacher_signal)
     {
         // If the evaluation value in deep search exceeds elmo_lambda_limit
@@ -285,94 +197,31 @@ namespace Learner
         return lambda;
     }
 
-    static double calculate_t(int game_result)
-    {
-        // Use 1 as the correction term if the expected win rate is 1,
-        // 0 if you lose, and 0.5 if you draw.
-        // game_result = 1,0,-1 so add 1 and divide by 2.
-        const double t = double(game_result + 1) * 0.5;
-
-        return t;
-    }
-
-    static double calc_grad(Value shallow, Value teacher_signal, int result, int ply)
-    {
-        // elmo (WCSC27) method
-        // Correct with the actual game wins and losses.
-        const double q = winning_percentage(shallow, ply);
-        const double p = calculate_p(teacher_signal, ply);
-        const double t = calculate_t(result);
-        const double lambda = calculate_lambda(teacher_signal);
-
-        double grad;
-        if (use_wdl)
-        {
-            const double dce_p = calc_d_cross_entropy_of_winning_percentage(p, shallow, ply);
-            const double dce_t = calc_d_cross_entropy_of_winning_percentage(t, shallow, ply);
-            grad = lambda * dce_p + (1.0 - lambda) * dce_t;
-        }
-        else
-        {
-            // Use the actual win rate as a correction term.
-            // This is the idea of ​​elmo (WCSC27), modern O-parts.
-            grad = lambda * (q - p) + (1.0 - lambda) * (q - t);
-        }
-
-        return std::clamp(grad, -max_grad, max_grad);
-    }
-
     static ValueWithGrad<double> get_loss(Value shallow, Value teacher_signal, int result, int ply)
     {
         using namespace Learner::Autograd::UnivariateStatic;
-
         auto q_ = sigmoid(VariableParameter<double, 0>{} * winning_probability_coefficient);
         auto p_ = sigmoid(ConstantParameter<double, 1>{} * winning_probability_coefficient);
         auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
         auto lambda_ = ConstantParameter<double, 3>{};
         auto loss_ = pow(lambda_ * (q_ - p_) + (1.0 - lambda_) * (q_ - t_), 2.0);
 
+        /*
+        auto q_ = VariableParameter<double, 0>{};
+        auto p_ = ConstantParameter<double, 1>{};
+        auto loss_ = pow(q_ - p_, 2.0) * (1.0 / (2400.0 * 2.0 * 600.0));
+        */
+
         auto args = std::tuple((double)shallow, (double)teacher_signal, (double)result, calculate_lambda(teacher_signal));
         return loss_.eval(args);
     }
 
-    // Calculate cross entropy during learning
-    // The individual cross entropy of the win/loss term and win
-    // rate term of the elmo expression is returned
-    // to the arguments cross_entropy_eval and cross_entropy_win.
-    static Loss calc_cross_entropy(
+    static auto get_loss(
         Value teacher_signal,
         Value shallow,
         const PackedSfenValue& psv)
     {
-        // Teacher winning probability.
-        const double q = winning_percentage(shallow, psv.gamePly);
-        const double p = calculate_p(teacher_signal, psv.gamePly);
-        const double t = calculate_t(psv.game_result);
-        const double lambda = calculate_lambda(teacher_signal);
-
-        constexpr double epsilon = 0.000001;
-
-        const double m = (1.0 - lambda) * t + lambda * p;
-
-        Loss loss{};
-
-        loss.cross_entropy_eval =
-            (-p * std::log(q + epsilon) - (1.0 - p) * std::log(1.0 - q + epsilon));
-        loss.cross_entropy_win =
-            (-t * std::log(q + epsilon) - (1.0 - t) * std::log(1.0 - q + epsilon));
-        loss.entropy_eval =
-            (-p * std::log(p + epsilon) - (1.0 - p) * std::log(1.0 - p + epsilon));
-        loss.entropy_win =
-            (-t * std::log(t + epsilon) - (1.0 - t) * std::log(1.0 - t + epsilon));
-
-        loss.cross_entropy =
-            (-m * std::log(q + epsilon) - (1.0 - m) * std::log(1.0 - q + epsilon));
-        loss.entropy =
-            (-m * std::log(m + epsilon) - (1.0 - m) * std::log(1.0 - m + epsilon));
-
-        loss.count = 1;
-
-        return loss;
+        return get_loss(shallow, teacher_signal, psv.game_result, psv.gamePly);
     }
 
     // Class to generate sfen with multiple threads
@@ -495,7 +344,7 @@ namespace Learner
             Thread& th,
             std::atomic<uint64_t>& counter,
             const PSVector& psv,
-            AtomicLoss& test_loss_sum,
+            Loss& test_loss_sum,
             atomic<double>& sum_norm,
             atomic<int>& move_accord_count
         );
@@ -530,7 +379,7 @@ namespace Learner
         int dir_number;
 
         // For calculation of learning data loss
-        AtomicLoss learn_loss_sum;
+        Loss learn_loss_sum;
     };
 
     void LearnerThink::set_learning_search_limits()
@@ -681,7 +530,7 @@ namespace Learner
 
                 const Value shallow_value = Eval::evaluate(pos);
 
-                const auto loss = calc_cross_entropy(
+                const auto loss = get_loss(
                     deep_value,
                     (rootColor == pos.side_to_move()) ? shallow_value : -shallow_value,
                     ps);
@@ -735,7 +584,7 @@ namespace Learner
         // should be no real issues happening since
         // the read/write phases are isolated.
         atomic_thread_fence(memory_order_seq_cst);
-        Eval::NNUE::update_parameters(Threads, epoch, params.verbose, params.learning_rate, calc_grad, get_loss);
+        Eval::NNUE::update_parameters(Threads, epoch, params.verbose, params.learning_rate, get_loss);
         atomic_thread_fence(memory_order_seq_cst);
 
         if (++save_count * params.mini_batch_size >= params.eval_save_interval)
@@ -778,7 +627,7 @@ namespace Learner
         out << "  - learning rate = " << params.learning_rate << endl;
 
         // For calculation of verification data loss
-        AtomicLoss test_loss_sum{};
+        Loss test_loss_sum{};
 
         // norm for learning
         atomic<double> sum_norm{0.0};
@@ -810,26 +659,24 @@ namespace Learner
         });
         Threads.wait_for_workers_finished();
 
-        latest_loss_sum += test_loss_sum.cross_entropy - test_loss_sum.entropy;
+        latest_loss_sum += test_loss_sum.value();
         latest_loss_count += psv.size();
 
-        if (psv.size() && test_loss_sum.count > 0.0)
+        if (psv.size() && test_loss_sum.count() > 0)
         {
             test_loss_sum.print("test", out);
 
-            if (learn_loss_sum.count > 0.0)
+            if (learn_loss_sum.count() > 0)
             {
                 learn_loss_sum.print("learn", out);
             }
 
             out << "  - norm = " << sum_norm << endl;
             out << "  - move accuracy = " << (move_accord_count * 100.0 / psv.size()) << "%" << endl;
-            out << "  - loss (current) = " << (test_loss_sum.cross_entropy - test_loss_sum.entropy) / psv.size() << endl;
-            out << "  - loss (average) = " << latest_loss_sum / latest_loss_count << endl;
         }
         else
         {
-            out << "ERROR: psv.size() = " << psv.size() << " ,  done = " << test_loss_sum.count << endl;
+            out << "ERROR: psv.size() = " << psv.size() << " ,  done = " << test_loss_sum.count() << endl;
         }
 
         learn_loss_sum.reset();
@@ -839,7 +686,7 @@ namespace Learner
         Thread& th,
         std::atomic<uint64_t>& counter,
         const PSVector& psv,
-        AtomicLoss& test_loss_sum,
+        Loss& test_loss_sum,
         atomic<double>& sum_norm,
         atomic<int>& move_accord_count
     )
@@ -869,7 +716,7 @@ namespace Learner
             // Evaluation value of deep search
             const auto deep_value = (Value)ps.score;
 
-            const auto loss = calc_cross_entropy(
+            const auto loss = get_loss(
                 deep_value,
                 shallow_value,
                 ps);
diff --git a/src/learn/learn.h b/src/learn/learn.h
index f74fd4e3..4e8d8a02 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -68,7 +68,6 @@ namespace Learner
     // Learning from the generated game record
     void learn(std::istringstream& is);
 
-    using CalcGradFunc = double(Value, Value, int, int);
     using CalcLossFunc = ValueWithGrad<double>(Value, Value, int, int);
 }
 
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 822c56b4..038a462c 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -195,7 +195,6 @@ namespace Eval::NNUE {
         uint64_t epoch,
         bool verbose,
         double learning_rate,
-        Learner::CalcGradFunc calc_grad,
         Learner::CalcLossFunc calc_loss)
     {
         using namespace Learner::Autograd::UnivariateStatic;
@@ -237,8 +236,8 @@ namespace Eval::NNUE {
                             e.sign * network_output[b] * kPonanzaConstant));
                         const auto discrete = e.sign * e.discrete_nn_eval;
                         const auto& psv = e.psv;
-                        const double gradient =
-                            e.sign * calc_grad(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
+                        const auto loss = calc_loss(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
+                        const double gradient = loss.grad * e.sign * kPonanzaConstant;
                         gradients[b] = static_cast<LearnFloatType>(gradient * e.weight);
 
 
@@ -330,4 +329,4 @@ namespace Eval::NNUE {
 #endif
         out << "INFO (save_eval): Finished saving evaluation file in " << eval_dir << std::endl;
     }
-}  // namespace Eval::NNUE
\ No newline at end of file
+}  // namespace Eval::NNUE
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 0fe8afce..7f7daa5b 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -38,7 +38,6 @@ namespace Eval::NNUE {
         uint64_t epoch,
         bool verbose,
         double learning_rate,
-        Learner::CalcGradFunc calc_grad,
         Learner::CalcLossFunc calc_loss);
 
     // Check if there are any problems with learning

From aa55692b97df056298ca016a5be4771902baafd9 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 17:08:22 +0100
Subject: [PATCH 379/398] Cross entropy loss.

---
 src/learn/autograd.h | 36 ++++++++++++++++++++++++++++++++++--
 src/learn/learn.cpp  | 17 +++++++++++++++++
 2 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
index f83d4d72..a4ad8b7f 100644
--- a/src/learn/autograd.h
+++ b/src/learn/autograd.h
@@ -282,6 +282,38 @@ namespace Learner::Autograd::UnivariateStatic
         return Product(Constant(lhs), std::move(rhs));
     }
 
+    template <typename ArgT, typename T = typename ArgT::ValueType>
+    struct Negation : Evaluable<Negation<ArgT, T>>
+    {
+        using ValueType = T;
+
+        explicit Negation(ArgT x) :
+            m_x(std::move(x))
+        {
+        }
+
+        template <typename... ArgsTs>
+        T value(const std::tuple<ArgsTs...>& args) const
+        {
+            return -m_x.value(args);
+        }
+
+        template <typename... ArgsTs>
+        T grad(const std::tuple<ArgsTs...>& args) const
+        {
+            return -m_x.grad(args);
+        }
+
+    private:
+        ArgT m_x;
+    };
+
+    template <typename ArgT, typename T = typename ArgT::ValueType>
+    auto operator-(ArgT x)
+    {
+        return Negation(std::move(x));
+    }
+
     template <typename ArgT, typename T = typename ArgT::ValueType>
     struct Sigmoid : Evaluable<Sigmoid<ArgT, T>>
     {
@@ -318,7 +350,7 @@ namespace Learner::Autograd::UnivariateStatic
         }
     };
 
-    template <typename ArgT>
+    template <typename ArgT, typename T = typename ArgT::ValueType>
     auto sigmoid(ArgT x)
     {
         return Sigmoid(std::move(x));
@@ -394,7 +426,7 @@ namespace Learner::Autograd::UnivariateStatic
         }
     };
 
-    template <typename ArgT>
+    template <typename ArgT, typename T = typename ArgT::ValueType>
     auto log(ArgT x)
     {
         return Log(std::move(x));
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index e558b56a..83229c61 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -200,11 +200,14 @@ namespace Learner
     static ValueWithGrad<double> get_loss(Value shallow, Value teacher_signal, int result, int ply)
     {
         using namespace Learner::Autograd::UnivariateStatic;
+
+        /*
         auto q_ = sigmoid(VariableParameter<double, 0>{} * winning_probability_coefficient);
         auto p_ = sigmoid(ConstantParameter<double, 1>{} * winning_probability_coefficient);
         auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
         auto lambda_ = ConstantParameter<double, 3>{};
         auto loss_ = pow(lambda_ * (q_ - p_) + (1.0 - lambda_) * (q_ - t_), 2.0);
+        */
 
         /*
         auto q_ = VariableParameter<double, 0>{};
@@ -212,6 +215,20 @@ namespace Learner
         auto loss_ = pow(q_ - p_, 2.0) * (1.0 / (2400.0 * 2.0 * 600.0));
         */
 
+        const double epsilon = 1e-12;
+
+        auto q_ = sigmoid(VariableParameter<double, 0>{} * winning_probability_coefficient);
+        auto p_ = sigmoid(ConstantParameter<double, 1>{} * winning_probability_coefficient);
+        auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
+        auto lambda_ = ConstantParameter<double, 3>{};
+        auto teacher_entropy_ = -(p_ * log(p_ + epsilon) + (1.0 - p_) * log(1.0 - p_ + epsilon));
+        auto outcome_entropy_ = -(t_ * log(t_ + epsilon) + (1.0 - t_) * log(1.0 - t_ + epsilon));
+        auto teacher_loss_ = -(p_ * log(q_) + (1.0 - p_) * log(1.0 - q_));
+        auto outcome_loss_ = -(t_ * log(q_) + (1.0 - t_) * log(1.0 - q_));
+        auto result_ = lambda_ * teacher_loss_ + (1.0 - lambda_) * outcome_loss_;
+        auto entropy_ = lambda_ * teacher_entropy_ + (1.0 - lambda_) * outcome_entropy_;
+        auto loss_ = result_ - entropy_;
+
         auto args = std::tuple((double)shallow, (double)teacher_signal, (double)result, calculate_lambda(teacher_signal));
         return loss_.eval(args);
     }

From d103867558d7c57a9eae5e2a061394d937881b13 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 17:27:45 +0100
Subject: [PATCH 380/398] Add memoization to the autograd expression evaluator.

---
 src/learn/autograd.h | 97 ++++++++++++++++++++++++++++----------------
 1 file changed, 63 insertions(+), 34 deletions(-)

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
index a4ad8b7f..2b0eee3a 100644
--- a/src/learn/autograd.h
+++ b/src/learn/autograd.h
@@ -6,6 +6,7 @@
 #include <type_traits>
 #include <memory>
 #include <tuple>
+#include <optional>
 
 namespace Learner
 {
@@ -62,20 +63,48 @@ namespace Learner::Autograd::UnivariateStatic
     template <typename T>
     using Id = typename Identity<T>::type;
 
-    template <typename T>
+    template <typename T, typename ChildT>
     struct Evaluable
     {
         template <typename... ArgsTs>
         auto eval(const std::tuple<ArgsTs...>& args) const
         {
-            using ValueType = typename T::ValueType;
-            const T* this_ = static_cast<const T*>(this);
-            return ValueWithGrad<ValueType>{ this_->value(args), this_->grad(args) };
+            return ValueWithGrad<T>{ value(args), grad(args) };
         }
+
+        template <typename... ArgsTs>
+        auto value(const std::tuple<ArgsTs...>& args) const
+        {
+            const ChildT* this_ = static_cast<const ChildT*>(this);
+
+            if (!value_cache.has_value())
+            {
+                value_cache = this_->calculate_value(args);
+            }
+
+            return *value_cache;
+        }
+
+        template <typename... ArgsTs>
+        auto grad(const std::tuple<ArgsTs...>& args) const
+        {
+            const ChildT* this_ = static_cast<const ChildT*>(this);
+
+            if (!grad_cache.has_value())
+            {
+                grad_cache = this_->calculate_grad(args);
+            }
+
+            return *grad_cache;
+        }
+
+    private:
+        mutable std::optional<T> value_cache;
+        mutable std::optional<T> grad_cache;
     };
 
     template <typename T, int I>
-    struct VariableParameter : Evaluable<VariableParameter<T, I>>
+    struct VariableParameter : Evaluable<T, VariableParameter<T, I>>
     {
         using ValueType = T;
 
@@ -84,20 +113,20 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T value(const std::tuple<ArgsTs...>& args) const
+        T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return std::get<I>(args);
         }
 
         template <typename... ArgsTs>
-        T grad(const std::tuple<ArgsTs...>&) const
+        T calculate_grad(const std::tuple<ArgsTs...>&) const
         {
             return T(1.0);
         }
     };
 
     template <typename T, int I>
-    struct ConstantParameter : Evaluable<ConstantParameter<T, I>>
+    struct ConstantParameter : Evaluable<T, ConstantParameter<T, I>>
     {
         using ValueType = T;
 
@@ -106,20 +135,20 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T value(const std::tuple<ArgsTs...>& args) const
+        T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return std::get<I>(args);
         }
 
         template <typename... ArgsTs>
-        T grad(const std::tuple<ArgsTs...>&) const
+        T calculate_grad(const std::tuple<ArgsTs...>&) const
         {
             return T(0.0);
         }
     };
 
     template <typename T>
-    struct Constant : Evaluable<Constant<T>>
+    struct Constant : Evaluable<T, Constant<T>>
     {
         using ValueType = T;
 
@@ -129,13 +158,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T value(const std::tuple<ArgsTs...>&) const
+        T calculate_value(const std::tuple<ArgsTs...>&) const
         {
             return m_x;
         }
 
         template <typename... ArgsTs>
-        T grad(const std::tuple<ArgsTs...>&) const
+        T calculate_grad(const std::tuple<ArgsTs...>&) const
         {
             return T(0.0);
         }
@@ -145,7 +174,7 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
-    struct Sum : Evaluable<Sum<LhsT, RhsT, T>>
+    struct Sum : Evaluable<T, Sum<LhsT, RhsT, T>>
     {
         using ValueType = T;
 
@@ -156,13 +185,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T value(const std::tuple<ArgsTs...>& args) const
+        T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return m_lhs.value(args) + m_rhs.value(args);
         }
 
         template <typename... ArgsTs>
-        T grad(const std::tuple<ArgsTs...>& args) const
+        T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return m_lhs.grad(args) + m_rhs.grad(args);
         }
@@ -191,7 +220,7 @@ namespace Learner::Autograd::UnivariateStatic
     }
 
     template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
-    struct Difference : Evaluable<Difference<LhsT, RhsT, T>>
+    struct Difference : Evaluable<T, Difference<LhsT, RhsT, T>>
     {
         using ValueType = T;
 
@@ -202,13 +231,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T value(const std::tuple<ArgsTs...>& args) const
+        T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return m_lhs.value(args) - m_rhs.value(args);
         }
 
         template <typename... ArgsTs>
-        T grad(const std::tuple<ArgsTs...>& args) const
+        T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return m_lhs.grad(args) - m_rhs.grad(args);
         }
@@ -237,7 +266,7 @@ namespace Learner::Autograd::UnivariateStatic
     }
 
     template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
-    struct Product : Evaluable<Product<LhsT, RhsT, T>>
+    struct Product : Evaluable<T, Product<LhsT, RhsT, T>>
     {
         using ValueType = T;
 
@@ -248,13 +277,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T value(const std::tuple<ArgsTs...>& args) const
+        T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return m_lhs.value(args) * m_rhs.value(args);
         }
 
         template <typename... ArgsTs>
-        T grad(const std::tuple<ArgsTs...>& args) const
+        T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return m_lhs.grad(args) * m_rhs.value(args) + m_lhs.value(args) * m_rhs.grad(args);
         }
@@ -283,7 +312,7 @@ namespace Learner::Autograd::UnivariateStatic
     }
 
     template <typename ArgT, typename T = typename ArgT::ValueType>
-    struct Negation : Evaluable<Negation<ArgT, T>>
+    struct Negation : Evaluable<T, Negation<ArgT, T>>
     {
         using ValueType = T;
 
@@ -293,13 +322,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T value(const std::tuple<ArgsTs...>& args) const
+        T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return -m_x.value(args);
         }
 
         template <typename... ArgsTs>
-        T grad(const std::tuple<ArgsTs...>& args) const
+        T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return -m_x.grad(args);
         }
@@ -315,7 +344,7 @@ namespace Learner::Autograd::UnivariateStatic
     }
 
     template <typename ArgT, typename T = typename ArgT::ValueType>
-    struct Sigmoid : Evaluable<Sigmoid<ArgT, T>>
+    struct Sigmoid : Evaluable<T, Sigmoid<ArgT, T>>
     {
         using ValueType = T;
 
@@ -325,13 +354,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T value(const std::tuple<ArgsTs...>& args) const
+        T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return value_(m_x.value(args));
         }
 
         template <typename... ArgsTs>
-        T grad(const std::tuple<ArgsTs...>& args) const
+        T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return m_x.grad(args) * grad_(m_x.value(args));
         }
@@ -357,7 +386,7 @@ namespace Learner::Autograd::UnivariateStatic
     }
 
     template <typename ArgT, typename T = typename ArgT::ValueType>
-    struct Pow : Evaluable<Pow<ArgT, T>>
+    struct Pow : Evaluable<T, Pow<ArgT, T>>
     {
         using ValueType = T;
 
@@ -368,13 +397,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T value(const std::tuple<ArgsTs...>& args) const
+        T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return std::pow(m_x.value(args), m_exponent);
         }
 
         template <typename... ArgsTs>
-        T grad(const std::tuple<ArgsTs...>& args) const
+        T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return m_exponent * std::pow(m_x.value(args), m_exponent - T(1.0)) * m_x.grad(args);
         }
@@ -391,7 +420,7 @@ namespace Learner::Autograd::UnivariateStatic
     }
 
     template <typename ArgT, typename T = typename ArgT::ValueType>
-    struct Log : Evaluable<Log<ArgT, T>>
+    struct Log : Evaluable<T, Log<ArgT, T>>
     {
         using ValueType = T;
 
@@ -401,13 +430,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T value(const std::tuple<ArgsTs...>& args) const
+        T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return value_(m_x.value(args));
         }
 
         template <typename... ArgsTs>
-        T grad(const std::tuple<ArgsTs...>& args) const
+        T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return m_x.grad(args) * grad_(m_x.value(args));
         }

From a5c20bee5b49a9643ce7cc23aeee08f9f374ac19 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 17:57:06 +0100
Subject: [PATCH 381/398] Apply gradient clipping.

---
 src/learn/autograd.h | 6 ++++++
 src/learn/learn.cpp  | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
index 2b0eee3a..afbcc41b 100644
--- a/src/learn/autograd.h
+++ b/src/learn/autograd.h
@@ -7,6 +7,7 @@
 #include <memory>
 #include <tuple>
 #include <optional>
+#include <algorithm>
 
 namespace Learner
 {
@@ -48,6 +49,11 @@ namespace Learner
         {
             return { std::abs(value), std::abs(grad) };
         }
+
+        ValueWithGrad clamp_grad(T max) const
+        {
+            return { value, std::clamp(grad, -max, max) };
+        }
     };
 }
 
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 83229c61..0b04d034 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -230,7 +230,7 @@ namespace Learner
         auto loss_ = result_ - entropy_;
 
         auto args = std::tuple((double)shallow, (double)teacher_signal, (double)result, calculate_lambda(teacher_signal));
-        return loss_.eval(args);
+        return loss_.eval(args).clamp_grad(max_grad);
     }
 
     static auto get_loss(

From aec6017195fedf7dac6a891a0cb89a06f457ade4 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 19:06:31 +0100
Subject: [PATCH 382/398] When forming an autograd expression only copy parts
 that are rvalue references, store references to lvalues.

---
 src/learn/autograd.h | 153 ++++++++++++++++++++++---------------------
 1 file changed, 80 insertions(+), 73 deletions(-)

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
index afbcc41b..714f741a 100644
--- a/src/learn/autograd.h
+++ b/src/learn/autograd.h
@@ -69,6 +69,13 @@ namespace Learner::Autograd::UnivariateStatic
     template <typename T>
     using Id = typename Identity<T>::type;
 
+    template <typename T>
+    using StoreValueOrRef = std::conditional_t<
+            std::is_rvalue_reference_v<T>,
+            std::remove_reference_t<T>,
+            const std::remove_reference_t<T>&
+        >;
+
     template <typename T, typename ChildT>
     struct Evaluable
     {
@@ -179,14 +186,14 @@ namespace Learner::Autograd::UnivariateStatic
         T m_x;
     };
 
-    template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
     struct Sum : Evaluable<T, Sum<LhsT, RhsT, T>>
     {
         using ValueType = T;
 
-        Sum(LhsT lhs, RhsT rhs) :
-            m_lhs(std::move(lhs)),
-            m_rhs(std::move(rhs))
+        Sum(LhsT&& lhs, RhsT&& rhs) :
+            m_lhs(std::forward<LhsT>(lhs)),
+            m_rhs(std::forward<RhsT>(rhs))
         {
         }
 
@@ -203,36 +210,36 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
     private:
-        LhsT m_lhs;
-        RhsT m_rhs;
+        StoreValueOrRef<LhsT> m_lhs;
+        StoreValueOrRef<RhsT> m_rhs;
     };
 
-    template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
-    auto operator+(LhsT lhs, RhsT rhs)
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    auto operator+(LhsT&& lhs, RhsT&& rhs)
     {
-        return Sum(std::move(lhs), std::move(rhs));
+        return Sum<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
     }
 
-    template <typename LhsT, typename T = typename LhsT::ValueType>
-    auto operator+(LhsT lhs, Id<T> rhs)
+    template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    auto operator+(LhsT&& lhs, Id<T> rhs)
     {
-        return Sum(std::move(lhs), Constant(rhs));
+        return Sum<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
     }
 
-    template <typename RhsT, typename T = typename RhsT::ValueType>
-    auto operator+(Id<T> lhs, RhsT rhs)
+    template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
+    auto operator+(Id<T> lhs, RhsT&& rhs)
     {
-        return Sum(Constant(lhs), std::move(rhs));
+        return Sum<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
     }
 
-    template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
     struct Difference : Evaluable<T, Difference<LhsT, RhsT, T>>
     {
         using ValueType = T;
 
-        Difference(LhsT lhs, RhsT rhs) :
-            m_lhs(std::move(lhs)),
-            m_rhs(std::move(rhs))
+        Difference(LhsT&& lhs, RhsT&& rhs) :
+            m_lhs(std::forward<LhsT>(lhs)),
+            m_rhs(std::forward<RhsT>(rhs))
         {
         }
 
@@ -249,36 +256,36 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
     private:
-        LhsT m_lhs;
-        RhsT m_rhs;
+        StoreValueOrRef<LhsT> m_lhs;
+        StoreValueOrRef<RhsT> m_rhs;
     };
 
-    template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
-    auto operator-(LhsT lhs, RhsT rhs)
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    auto operator-(LhsT&& lhs, RhsT&& rhs)
     {
-        return Difference(std::move(lhs), std::move(rhs));
+        return Difference<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
     }
 
-    template <typename LhsT, typename T = typename LhsT::ValueType>
-    auto operator-(LhsT lhs, Id<T> rhs)
+    template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    auto operator-(LhsT&& lhs, Id<T> rhs)
     {
-        return Difference(std::move(lhs), Constant(rhs));
+        return Difference<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
     }
 
-    template <typename RhsT, typename T = typename RhsT::ValueType>
-    auto operator-(Id<T> lhs, RhsT rhs)
+    template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
+    auto operator-(Id<T> lhs, RhsT&& rhs)
     {
-        return Difference(Constant(lhs), std::move(rhs));
+        return Difference<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
     }
 
-    template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
     struct Product : Evaluable<T, Product<LhsT, RhsT, T>>
     {
         using ValueType = T;
 
-        Product(LhsT lhs, RhsT rhs) :
-            m_lhs(std::move(lhs)),
-            m_rhs(std::move(rhs))
+        Product(LhsT&& lhs, RhsT&& rhs) :
+            m_lhs(std::forward<LhsT>(lhs)),
+            m_rhs(std::forward<RhsT>(rhs))
         {
         }
 
@@ -295,35 +302,35 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
     private:
-        LhsT m_lhs;
-        RhsT m_rhs;
+        StoreValueOrRef<LhsT> m_lhs;
+        StoreValueOrRef<RhsT> m_rhs;
     };
 
-    template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
-    auto operator*(LhsT lhs, RhsT rhs)
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    auto operator*(LhsT&& lhs, RhsT&& rhs)
     {
-        return Product(std::move(lhs), std::move(rhs));
+        return Product<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
     }
 
-    template <typename LhsT, typename T = typename LhsT::ValueType>
-    auto operator*(LhsT lhs, Id<T> rhs)
+    template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    auto operator*(LhsT&& lhs, Id<T> rhs)
     {
-        return Product(std::move(lhs), Constant(rhs));
+        return Product<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
     }
 
-    template <typename RhsT, typename T = typename RhsT::ValueType>
-    auto operator*(Id<T> lhs, RhsT rhs)
+    template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
+    auto operator*(Id<T> lhs, RhsT&& rhs)
     {
-        return Product(Constant(lhs), std::move(rhs));
+        return Product<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
     }
 
-    template <typename ArgT, typename T = typename ArgT::ValueType>
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
     struct Negation : Evaluable<T, Negation<ArgT, T>>
     {
         using ValueType = T;
 
-        explicit Negation(ArgT x) :
-            m_x(std::move(x))
+        explicit Negation(ArgT&& x) :
+            m_x(std::forward<ArgT>(x))
         {
         }
 
@@ -340,22 +347,22 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
     private:
-        ArgT m_x;
+        StoreValueOrRef<ArgT> m_x;
     };
 
-    template <typename ArgT, typename T = typename ArgT::ValueType>
-    auto operator-(ArgT x)
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    auto operator-(ArgT&& x)
     {
-        return Negation(std::move(x));
+        return Negation<ArgT&&>(std::forward<ArgT>(x));
     }
 
-    template <typename ArgT, typename T = typename ArgT::ValueType>
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
     struct Sigmoid : Evaluable<T, Sigmoid<ArgT, T>>
     {
         using ValueType = T;
 
-        explicit Sigmoid(ArgT x) :
-            m_x(std::move(x))
+        explicit Sigmoid(ArgT&& x) :
+            m_x(std::forward<ArgT>(x))
         {
         }
 
@@ -372,7 +379,7 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
     private:
-        ArgT m_x;
+        StoreValueOrRef<ArgT> m_x;
 
         T value_(T x) const
         {
@@ -385,19 +392,19 @@ namespace Learner::Autograd::UnivariateStatic
         }
     };
 
-    template <typename ArgT, typename T = typename ArgT::ValueType>
-    auto sigmoid(ArgT x)
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    auto sigmoid(ArgT&& x)
     {
-        return Sigmoid(std::move(x));
+        return Sigmoid<ArgT&&>(std::forward<ArgT>(x));
     }
 
-    template <typename ArgT, typename T = typename ArgT::ValueType>
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
     struct Pow : Evaluable<T, Pow<ArgT, T>>
     {
         using ValueType = T;
 
-        explicit Pow(ArgT x, Id<T> exponent) :
-            m_x(std::move(x)),
+        explicit Pow(ArgT&& x, Id<T> exponent) :
+            m_x(std::forward<ArgT>(x)),
             m_exponent(std::move(exponent))
         {
         }
@@ -415,23 +422,23 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
     private:
-        ArgT m_x;
+        StoreValueOrRef<ArgT> m_x;
         T m_exponent;
     };
 
-    template <typename ArgT, typename T = typename ArgT::ValueType>
-    auto pow(ArgT x, Id<T> exp)
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    auto pow(ArgT&& x, Id<T> exp)
     {
-        return Pow(std::move(x), std::move(exp));
+        return Pow<ArgT&&>(std::forward<ArgT>(x), std::move(exp));
     }
 
-    template <typename ArgT, typename T = typename ArgT::ValueType>
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
     struct Log : Evaluable<T, Log<ArgT, T>>
     {
         using ValueType = T;
 
-        explicit Log(ArgT x) :
-            m_x(std::move(x))
+        explicit Log(ArgT&& x) :
+            m_x(std::forward<ArgT>(x))
         {
         }
 
@@ -448,7 +455,7 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
     private:
-        ArgT m_x;
+        StoreValueOrRef<ArgT> m_x;
 
         T value_(T x) const
         {
@@ -461,10 +468,10 @@ namespace Learner::Autograd::UnivariateStatic
         }
     };
 
-    template <typename ArgT, typename T = typename ArgT::ValueType>
-    auto log(ArgT x)
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    auto log(ArgT&& x)
     {
-        return Log(std::move(x));
+        return Log<ArgT&&>(std::forward<ArgT>(x));
     }
 
 }

From 26f19e1429312e5e0d6fcbc3db325f9923d76d54 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 22:50:49 +0100
Subject: [PATCH 383/398] Make automatic differentiation node types constexpr.

---
 src/learn/autograd.h | 48 +++++++++++++++++++++++---------------------
 1 file changed, 25 insertions(+), 23 deletions(-)

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
index 714f741a..4edf0e4c 100644
--- a/src/learn/autograd.h
+++ b/src/learn/autograd.h
@@ -79,6 +79,8 @@ namespace Learner::Autograd::UnivariateStatic
     template <typename T, typename ChildT>
     struct Evaluable
     {
+        constexpr Evaluable() = default;
+
         template <typename... ArgsTs>
         auto eval(const std::tuple<ArgsTs...>& args) const
         {
@@ -121,7 +123,7 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
-        VariableParameter()
+        constexpr VariableParameter()
         {
         }
 
@@ -143,7 +145,7 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
-        ConstantParameter()
+        constexpr ConstantParameter()
         {
         }
 
@@ -165,7 +167,7 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
-        Constant(T x) :
+        constexpr Constant(T x) :
             m_x(std::move(x))
         {
         }
@@ -191,7 +193,7 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
-        Sum(LhsT&& lhs, RhsT&& rhs) :
+        constexpr Sum(LhsT&& lhs, RhsT&& rhs) :
             m_lhs(std::forward<LhsT>(lhs)),
             m_rhs(std::forward<RhsT>(rhs))
         {
@@ -215,19 +217,19 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    auto operator+(LhsT&& lhs, RhsT&& rhs)
+    constexpr auto operator+(LhsT&& lhs, RhsT&& rhs)
     {
         return Sum<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
     }
 
     template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    auto operator+(LhsT&& lhs, Id<T> rhs)
+    constexpr auto operator+(LhsT&& lhs, Id<T> rhs)
     {
         return Sum<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
     }
 
     template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
-    auto operator+(Id<T> lhs, RhsT&& rhs)
+    constexpr auto operator+(Id<T> lhs, RhsT&& rhs)
     {
         return Sum<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
     }
@@ -237,7 +239,7 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
-        Difference(LhsT&& lhs, RhsT&& rhs) :
+        constexpr Difference(LhsT&& lhs, RhsT&& rhs) :
             m_lhs(std::forward<LhsT>(lhs)),
             m_rhs(std::forward<RhsT>(rhs))
         {
@@ -261,19 +263,19 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    auto operator-(LhsT&& lhs, RhsT&& rhs)
+    constexpr auto operator-(LhsT&& lhs, RhsT&& rhs)
     {
         return Difference<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
     }
 
     template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    auto operator-(LhsT&& lhs, Id<T> rhs)
+    constexpr auto operator-(LhsT&& lhs, Id<T> rhs)
     {
         return Difference<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
     }
 
     template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
-    auto operator-(Id<T> lhs, RhsT&& rhs)
+    constexpr auto operator-(Id<T> lhs, RhsT&& rhs)
     {
         return Difference<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
     }
@@ -283,7 +285,7 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
-        Product(LhsT&& lhs, RhsT&& rhs) :
+        constexpr Product(LhsT&& lhs, RhsT&& rhs) :
             m_lhs(std::forward<LhsT>(lhs)),
             m_rhs(std::forward<RhsT>(rhs))
         {
@@ -307,19 +309,19 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    auto operator*(LhsT&& lhs, RhsT&& rhs)
+    constexpr auto operator*(LhsT&& lhs, RhsT&& rhs)
     {
         return Product<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
     }
 
     template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    auto operator*(LhsT&& lhs, Id<T> rhs)
+    constexpr auto operator*(LhsT&& lhs, Id<T> rhs)
     {
         return Product<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
     }
 
     template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
-    auto operator*(Id<T> lhs, RhsT&& rhs)
+    constexpr auto operator*(Id<T> lhs, RhsT&& rhs)
     {
         return Product<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
     }
@@ -329,7 +331,7 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
-        explicit Negation(ArgT&& x) :
+        constexpr explicit Negation(ArgT&& x) :
             m_x(std::forward<ArgT>(x))
         {
         }
@@ -351,7 +353,7 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
-    auto operator-(ArgT&& x)
+    constexpr auto operator-(ArgT&& x)
     {
         return Negation<ArgT&&>(std::forward<ArgT>(x));
     }
@@ -361,7 +363,7 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
-        explicit Sigmoid(ArgT&& x) :
+        constexpr explicit Sigmoid(ArgT&& x) :
             m_x(std::forward<ArgT>(x))
         {
         }
@@ -393,7 +395,7 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
-    auto sigmoid(ArgT&& x)
+    constexpr auto sigmoid(ArgT&& x)
     {
         return Sigmoid<ArgT&&>(std::forward<ArgT>(x));
     }
@@ -403,7 +405,7 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
-        explicit Pow(ArgT&& x, Id<T> exponent) :
+        constexpr explicit Pow(ArgT&& x, Id<T> exponent) :
             m_x(std::forward<ArgT>(x)),
             m_exponent(std::move(exponent))
         {
@@ -427,7 +429,7 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
-    auto pow(ArgT&& x, Id<T> exp)
+    constexpr auto pow(ArgT&& x, Id<T> exp)
     {
         return Pow<ArgT&&>(std::forward<ArgT>(x), std::move(exp));
     }
@@ -437,7 +439,7 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
-        explicit Log(ArgT&& x) :
+        constexpr explicit Log(ArgT&& x) :
             m_x(std::forward<ArgT>(x))
         {
         }
@@ -469,7 +471,7 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
-    auto log(ArgT&& x)
+    constexpr auto log(ArgT&& x)
     {
         return Log<ArgT&&>(std::forward<ArgT>(x));
     }

From cb812c742c25e2737808cf7ec349e4eeffb0d911 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 22:52:21 +0100
Subject: [PATCH 384/398] Add [[nodiscard]] attributes to autograd functions.

---
 src/learn/autograd.h | 80 ++++++++++++++++++++++----------------------
 1 file changed, 40 insertions(+), 40 deletions(-)

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
index 4edf0e4c..5c573c0f 100644
--- a/src/learn/autograd.h
+++ b/src/learn/autograd.h
@@ -45,12 +45,12 @@ namespace Learner
             return *this;
         }
 
-        ValueWithGrad abs() const
+        [[nodiscard]] ValueWithGrad abs() const
         {
             return { std::abs(value), std::abs(grad) };
         }
 
-        ValueWithGrad clamp_grad(T max) const
+        [[nodiscard]] ValueWithGrad clamp_grad(T max) const
         {
             return { value, std::clamp(grad, -max, max) };
         }
@@ -82,13 +82,13 @@ namespace Learner::Autograd::UnivariateStatic
         constexpr Evaluable() = default;
 
         template <typename... ArgsTs>
-        auto eval(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] auto eval(const std::tuple<ArgsTs...>& args) const
         {
             return ValueWithGrad<T>{ value(args), grad(args) };
         }
 
         template <typename... ArgsTs>
-        auto value(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] auto value(const std::tuple<ArgsTs...>& args) const
         {
             const ChildT* this_ = static_cast<const ChildT*>(this);
 
@@ -101,7 +101,7 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        auto grad(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] auto grad(const std::tuple<ArgsTs...>& args) const
         {
             const ChildT* this_ = static_cast<const ChildT*>(this);
 
@@ -128,13 +128,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T calculate_value(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return std::get<I>(args);
         }
 
         template <typename... ArgsTs>
-        T calculate_grad(const std::tuple<ArgsTs...>&) const
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
         {
             return T(1.0);
         }
@@ -150,13 +150,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T calculate_value(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return std::get<I>(args);
         }
 
         template <typename... ArgsTs>
-        T calculate_grad(const std::tuple<ArgsTs...>&) const
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
         {
             return T(0.0);
         }
@@ -173,13 +173,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T calculate_value(const std::tuple<ArgsTs...>&) const
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>&) const
         {
             return m_x;
         }
 
         template <typename... ArgsTs>
-        T calculate_grad(const std::tuple<ArgsTs...>&) const
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
         {
             return T(0.0);
         }
@@ -200,13 +200,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T calculate_value(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return m_lhs.value(args) + m_rhs.value(args);
         }
 
         template <typename... ArgsTs>
-        T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return m_lhs.grad(args) + m_rhs.grad(args);
         }
@@ -217,19 +217,19 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    constexpr auto operator+(LhsT&& lhs, RhsT&& rhs)
+    [[nodiscard]] constexpr auto operator+(LhsT&& lhs, RhsT&& rhs)
     {
         return Sum<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
     }
 
     template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    constexpr auto operator+(LhsT&& lhs, Id<T> rhs)
+    [[nodiscard]] constexpr auto operator+(LhsT&& lhs, Id<T> rhs)
     {
         return Sum<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
     }
 
     template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
-    constexpr auto operator+(Id<T> lhs, RhsT&& rhs)
+    [[nodiscard]] constexpr auto operator+(Id<T> lhs, RhsT&& rhs)
     {
         return Sum<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
     }
@@ -246,13 +246,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T calculate_value(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return m_lhs.value(args) - m_rhs.value(args);
         }
 
         template <typename... ArgsTs>
-        T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return m_lhs.grad(args) - m_rhs.grad(args);
         }
@@ -263,19 +263,19 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    constexpr auto operator-(LhsT&& lhs, RhsT&& rhs)
+    [[nodiscard]] constexpr auto operator-(LhsT&& lhs, RhsT&& rhs)
     {
         return Difference<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
     }
 
     template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    constexpr auto operator-(LhsT&& lhs, Id<T> rhs)
+    [[nodiscard]] constexpr auto operator-(LhsT&& lhs, Id<T> rhs)
     {
         return Difference<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
     }
 
     template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
-    constexpr auto operator-(Id<T> lhs, RhsT&& rhs)
+    [[nodiscard]] constexpr auto operator-(Id<T> lhs, RhsT&& rhs)
     {
         return Difference<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
     }
@@ -292,13 +292,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T calculate_value(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return m_lhs.value(args) * m_rhs.value(args);
         }
 
         template <typename... ArgsTs>
-        T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return m_lhs.grad(args) * m_rhs.value(args) + m_lhs.value(args) * m_rhs.grad(args);
         }
@@ -309,19 +309,19 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    constexpr auto operator*(LhsT&& lhs, RhsT&& rhs)
+    [[nodiscard]] constexpr auto operator*(LhsT&& lhs, RhsT&& rhs)
     {
         return Product<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
     }
 
     template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    constexpr auto operator*(LhsT&& lhs, Id<T> rhs)
+    [[nodiscard]] constexpr auto operator*(LhsT&& lhs, Id<T> rhs)
     {
         return Product<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
     }
 
     template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
-    constexpr auto operator*(Id<T> lhs, RhsT&& rhs)
+    [[nodiscard]] constexpr auto operator*(Id<T> lhs, RhsT&& rhs)
     {
         return Product<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
     }
@@ -337,13 +337,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T calculate_value(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return -m_x.value(args);
         }
 
         template <typename... ArgsTs>
-        T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return -m_x.grad(args);
         }
@@ -353,7 +353,7 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
-    constexpr auto operator-(ArgT&& x)
+    [[nodiscard]] constexpr auto operator-(ArgT&& x)
     {
         return Negation<ArgT&&>(std::forward<ArgT>(x));
     }
@@ -369,13 +369,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T calculate_value(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return value_(m_x.value(args));
         }
 
         template <typename... ArgsTs>
-        T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return m_x.grad(args) * grad_(m_x.value(args));
         }
@@ -383,19 +383,19 @@ namespace Learner::Autograd::UnivariateStatic
     private:
         StoreValueOrRef<ArgT> m_x;
 
-        T value_(T x) const
+        [[nodiscard]] T value_(T x) const
         {
             return 1.0 / (1.0 + std::exp(-x));
         }
 
-        T grad_(T x) const
+        [[nodiscard]] T grad_(T x) const
         {
             return value_(x) * (1.0 - value_(x));
         }
     };
 
     template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
-    constexpr auto sigmoid(ArgT&& x)
+    [[nodiscard]] constexpr auto sigmoid(ArgT&& x)
     {
         return Sigmoid<ArgT&&>(std::forward<ArgT>(x));
     }
@@ -412,13 +412,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T calculate_value(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return std::pow(m_x.value(args), m_exponent);
         }
 
         template <typename... ArgsTs>
-        T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return m_exponent * std::pow(m_x.value(args), m_exponent - T(1.0)) * m_x.grad(args);
         }
@@ -429,7 +429,7 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
-    constexpr auto pow(ArgT&& x, Id<T> exp)
+    [[nodiscard]] constexpr auto pow(ArgT&& x, Id<T> exp)
     {
         return Pow<ArgT&&>(std::forward<ArgT>(x), std::move(exp));
     }
@@ -445,13 +445,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T calculate_value(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return value_(m_x.value(args));
         }
 
         template <typename... ArgsTs>
-        T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return m_x.grad(args) * grad_(m_x.value(args));
         }
@@ -471,7 +471,7 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
-    constexpr auto log(ArgT&& x)
+    [[nodiscard]] constexpr auto log(ArgT&& x)
     {
         return Log<ArgT&&>(std::forward<ArgT>(x));
     }

From 8adf00ae6e43b5f27ad48deb39b87a7c05b2fe5e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 14:01:31 +0100
Subject: [PATCH 385/398] Identify a single evalation chain by ID in autograd
 to prevent cache reuse for subsequent evaluations of the same expression
 tree.

---
 src/learn/autograd.h | 87 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 82 insertions(+), 5 deletions(-)

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
index 5c573c0f..7006121a 100644
--- a/src/learn/autograd.h
+++ b/src/learn/autograd.h
@@ -8,6 +8,7 @@
 #include <tuple>
 #include <optional>
 #include <algorithm>
+#include <cstdint>
 
 namespace Learner
 {
@@ -76,46 +77,122 @@ namespace Learner::Autograd::UnivariateStatic
             const std::remove_reference_t<T>&
         >;
 
+    namespace Detail
+    {
+        using CallIdType = std::uint32_t;
+
+        struct CallId
+        {
+            CallIdType call_id{};
+
+            constexpr CallId() :
+                call_id(0)
+            {
+            }
+
+            constexpr CallId(CallIdType id) :
+                call_id(id)
+            {
+            }
+
+            [[nodiscard]] bool operator==(CallId rhs) const noexcept
+            {
+                return call_id == rhs.call_id;
+            }
+
+            [[nodiscard]] bool operator!=(CallId rhs) const noexcept
+            {
+                return call_id != rhs.call_id;
+            }
+        };
+
+        [[nodiscard]] inline CallId next_call_id()
+        {
+            static thread_local CallIdType s_call_id = 0;
+            return CallId{ s_call_id++ };
+        }
+
+        template <typename T, typename Tuple>
+        struct TupleContains;
+
+        template <typename T, typename... Us>
+        struct TupleContains<T, std::tuple<Us...>> : std::disjunction<std::is_same<T, Us>...> {};
+
+        template <typename T, typename Tuple>
+        constexpr bool TupleContainsV = TupleContains<T, Tuple>::value;
+    }
+
     template <typename T, typename ChildT>
     struct Evaluable
     {
         constexpr Evaluable() = default;
 
+        // We append a unique call id so that we can invalidate the cache when
+        // the next computation starts. A single evaluation should see
+        // the same call_id at every node.
         template <typename... ArgsTs>
         [[nodiscard]] auto eval(const std::tuple<ArgsTs...>& args) const
         {
-            return ValueWithGrad<T>{ value(args), grad(args) };
+            const auto call_id = Detail::next_call_id();
+            const auto new_args = std::tuple_cat(args, std::tuple(call_id));
+            return ValueWithGrad<T>{ value(new_args), grad(new_args) };
         }
 
-        template <typename... ArgsTs>
+        template <typename... ArgsTs,
+            typename SFINAE = std::enable_if_t<Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
         [[nodiscard]] auto value(const std::tuple<ArgsTs...>& args) const
         {
             const ChildT* this_ = static_cast<const ChildT*>(this);
 
-            if (!value_cache.has_value())
+            const auto call_id = std::get<Detail::CallId>(args);
+            if (!value_cache.has_value() || value_cache_call_id != call_id)
             {
+                value_cache_call_id = call_id;
                 value_cache = this_->calculate_value(args);
             }
 
             return *value_cache;
         }
 
-        template <typename... ArgsTs>
+        template <typename... ArgsTs,
+            typename SFINAE = std::enable_if_t<!Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
+        [[nodiscard]] auto value(const std::tuple<ArgsTs...>& args, ...) const
+        {
+            const auto call_id = Detail::next_call_id();
+            const auto new_args = std::tuple_cat(args, std::tuple(call_id));
+            return value(new_args);
+        }
+
+        template <typename... ArgsTs,
+            typename SFINAE = std::enable_if_t<Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
         [[nodiscard]] auto grad(const std::tuple<ArgsTs...>& args) const
         {
             const ChildT* this_ = static_cast<const ChildT*>(this);
 
-            if (!grad_cache.has_value())
+            const auto call_id = std::get<Detail::CallId>(args);
+            if (!grad_cache.has_value() || grad_cache_call_id != call_id)
             {
+                grad_cache_call_id = call_id;
                 grad_cache = this_->calculate_grad(args);
             }
 
             return *grad_cache;
         }
 
+        template <typename... ArgsTs,
+            typename SFINAE = std::enable_if_t<!Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
+        [[nodiscard]] auto grad(const std::tuple<ArgsTs...>& args, ...) const
+        {
+            const auto call_id = Detail::next_call_id();
+            const auto new_args = std::tuple_cat(args, std::tuple(call_id));
+            return grad(new_args);
+        }
+
     private:
         mutable std::optional<T> value_cache;
         mutable std::optional<T> grad_cache;
+        mutable Detail::CallId value_cache_call_id{};
+        mutable Detail::CallId grad_cache_call_id{};
     };
 
     template <typename T, int I>

From 891abf55115fca95ee40103bb1157cf341ba57d9 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 15:14:26 +0100
Subject: [PATCH 386/398] Make the autograd loss expression chain thread_local.

---
 src/learn/learn.cpp | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 0b04d034..af867d42 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -215,21 +215,28 @@ namespace Learner
         auto loss_ = pow(q_ - p_, 2.0) * (1.0 / (2400.0 * 2.0 * 600.0));
         */
 
-        const double epsilon = 1e-12;
+        constexpr double epsilon = 1e-12;
 
-        auto q_ = sigmoid(VariableParameter<double, 0>{} * winning_probability_coefficient);
-        auto p_ = sigmoid(ConstantParameter<double, 1>{} * winning_probability_coefficient);
-        auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
-        auto lambda_ = ConstantParameter<double, 3>{};
-        auto teacher_entropy_ = -(p_ * log(p_ + epsilon) + (1.0 - p_) * log(1.0 - p_ + epsilon));
-        auto outcome_entropy_ = -(t_ * log(t_ + epsilon) + (1.0 - t_) * log(1.0 - t_ + epsilon));
-        auto teacher_loss_ = -(p_ * log(q_) + (1.0 - p_) * log(1.0 - q_));
-        auto outcome_loss_ = -(t_ * log(q_) + (1.0 - t_) * log(1.0 - q_));
-        auto result_ = lambda_ * teacher_loss_ + (1.0 - lambda_) * outcome_loss_;
-        auto entropy_ = lambda_ * teacher_entropy_ + (1.0 - lambda_) * outcome_entropy_;
-        auto loss_ = result_ - entropy_;
+        static thread_local auto q_ = sigmoid(VariableParameter<double, 0>{} * ConstantParameter<double, 4>{});
+        static thread_local auto p_ = sigmoid(ConstantParameter<double, 1>{} * ConstantParameter<double, 4>{});
+        static thread_local auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
+        static thread_local auto lambda_ = ConstantParameter<double, 3>{};
+        static thread_local auto teacher_entropy_ = -(p_ * log(p_ + epsilon) + (1.0 - p_) * log(1.0 - p_ + epsilon));
+        static thread_local auto outcome_entropy_ = -(t_ * log(t_ + epsilon) + (1.0 - t_) * log(1.0 - t_ + epsilon));
+        static thread_local auto teacher_loss_ = -(p_ * log(q_) + (1.0 - p_) * log(1.0 - q_));
+        static thread_local auto outcome_loss_ = -(t_ * log(q_) + (1.0 - t_) * log(1.0 - q_));
+        static thread_local auto result_ = lambda_ * teacher_loss_ + (1.0 - lambda_) * outcome_loss_;
+        static thread_local auto entropy_ = lambda_ * teacher_entropy_ + (1.0 - lambda_) * outcome_entropy_;
+        static thread_local auto loss_ = result_ - entropy_;
+
+        auto args = std::tuple(
+            (double)shallow, 
+            (double)teacher_signal, 
+            (double)result, 
+            calculate_lambda(teacher_signal), 
+            winning_probability_coefficient
+        );
 
-        auto args = std::tuple((double)shallow, (double)teacher_signal, (double)result, calculate_lambda(teacher_signal));
         return loss_.eval(args).clamp_grad(max_grad);
     }
 

From e975889132bd2303915a2e2eb587b2633487c358 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 15:21:39 +0100
Subject: [PATCH 387/398] Move cross_entropy calculation to a separate
 function.

---
 src/learn/learn.cpp | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index af867d42..dd893d9d 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -197,6 +197,29 @@ namespace Learner
         return lambda;
     }
 
+    template <typename ShallowT, typename TeacherT, typename ResultT, typename LambdaT>
+    static auto& cross_entropy_(
+        ShallowT& q_,
+        TeacherT& p_,
+        ResultT& t_,
+        LambdaT& lambda_
+    )
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        constexpr double epsilon = 1e-12;
+
+        static thread_local auto teacher_entropy_ = -(p_ * log(p_ + epsilon) + (1.0 - p_) * log(1.0 - p_ + epsilon));
+        static thread_local auto outcome_entropy_ = -(t_ * log(t_ + epsilon) + (1.0 - t_) * log(1.0 - t_ + epsilon));
+        static thread_local auto teacher_loss_ = -(p_ * log(q_) + (1.0 - p_) * log(1.0 - q_));
+        static thread_local auto outcome_loss_ = -(t_ * log(q_) + (1.0 - t_) * log(1.0 - q_));
+        static thread_local auto result_ = lambda_ * teacher_loss_ + (1.0 - lambda_) * outcome_loss_;
+        static thread_local auto entropy_ = lambda_ * teacher_entropy_ + (1.0 - lambda_) * outcome_entropy_;
+        static thread_local auto loss_ = result_ - entropy_;
+
+        return loss_;
+    }
+
     static ValueWithGrad<double> get_loss(Value shallow, Value teacher_signal, int result, int ply)
     {
         using namespace Learner::Autograd::UnivariateStatic;
@@ -215,19 +238,11 @@ namespace Learner
         auto loss_ = pow(q_ - p_, 2.0) * (1.0 / (2400.0 * 2.0 * 600.0));
         */
 
-        constexpr double epsilon = 1e-12;
-
         static thread_local auto q_ = sigmoid(VariableParameter<double, 0>{} * ConstantParameter<double, 4>{});
         static thread_local auto p_ = sigmoid(ConstantParameter<double, 1>{} * ConstantParameter<double, 4>{});
         static thread_local auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
         static thread_local auto lambda_ = ConstantParameter<double, 3>{};
-        static thread_local auto teacher_entropy_ = -(p_ * log(p_ + epsilon) + (1.0 - p_) * log(1.0 - p_ + epsilon));
-        static thread_local auto outcome_entropy_ = -(t_ * log(t_ + epsilon) + (1.0 - t_) * log(1.0 - t_ + epsilon));
-        static thread_local auto teacher_loss_ = -(p_ * log(q_) + (1.0 - p_) * log(1.0 - q_));
-        static thread_local auto outcome_loss_ = -(t_ * log(q_) + (1.0 - t_) * log(1.0 - q_));
-        static thread_local auto result_ = lambda_ * teacher_loss_ + (1.0 - lambda_) * outcome_loss_;
-        static thread_local auto entropy_ = lambda_ * teacher_entropy_ + (1.0 - lambda_) * outcome_entropy_;
-        static thread_local auto loss_ = result_ - entropy_;
+        static thread_local auto loss_ = cross_entropy_(q_, p_, t_, lambda_);
 
         auto args = std::tuple(
             (double)shallow, 

From cbd973fdaaec0685717441a0c5418a95f5527acc Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 16:50:51 +0100
Subject: [PATCH 388/398] Detect constant expressions in autograd and return 0
 grad early.

---
 src/learn/autograd.h | 44 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 37 insertions(+), 7 deletions(-)

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
index 7006121a..45bee469 100644
--- a/src/learn/autograd.h
+++ b/src/learn/autograd.h
@@ -120,6 +120,9 @@ namespace Learner::Autograd::UnivariateStatic
 
         template <typename T, typename Tuple>
         constexpr bool TupleContainsV = TupleContains<T, Tuple>::value;
+
+        template <typename... Ts>
+        constexpr bool AreAllConstantV = (std::remove_reference_t<Ts>::is_constant && ...);
     }
 
     template <typename T, typename ChildT>
@@ -167,16 +170,23 @@ namespace Learner::Autograd::UnivariateStatic
             typename SFINAE = std::enable_if_t<Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
         [[nodiscard]] auto grad(const std::tuple<ArgsTs...>& args) const
         {
-            const ChildT* this_ = static_cast<const ChildT*>(this);
-
-            const auto call_id = std::get<Detail::CallId>(args);
-            if (!grad_cache.has_value() || grad_cache_call_id != call_id)
+            if constexpr (ChildT::is_constant)
             {
-                grad_cache_call_id = call_id;
-                grad_cache = this_->calculate_grad(args);
+                return T(0.0);
             }
+            else
+            {
+                const ChildT* this_ = static_cast<const ChildT*>(this);
 
-            return *grad_cache;
+                const auto call_id = std::get<Detail::CallId>(args);
+                if (!grad_cache.has_value() || grad_cache_call_id != call_id)
+                {
+                    grad_cache_call_id = call_id;
+                    grad_cache = this_->calculate_grad(args);
+                }
+
+                return *grad_cache;
+            }
         }
 
         template <typename... ArgsTs,
@@ -199,6 +209,8 @@ namespace Learner::Autograd::UnivariateStatic
     struct VariableParameter : Evaluable<T, VariableParameter<T, I>>
     {
         using ValueType = T;
+        
+        static constexpr bool is_constant = false;
 
         constexpr VariableParameter()
         {
@@ -222,6 +234,8 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
+        static constexpr bool is_constant = true;
+
         constexpr ConstantParameter()
         {
         }
@@ -244,6 +258,8 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
+        static constexpr bool is_constant = true;
+
         constexpr Constant(T x) :
             m_x(std::move(x))
         {
@@ -270,6 +286,8 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
+        static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
+
         constexpr Sum(LhsT&& lhs, RhsT&& rhs) :
             m_lhs(std::forward<LhsT>(lhs)),
             m_rhs(std::forward<RhsT>(rhs))
@@ -316,6 +334,8 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
+        static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
+
         constexpr Difference(LhsT&& lhs, RhsT&& rhs) :
             m_lhs(std::forward<LhsT>(lhs)),
             m_rhs(std::forward<RhsT>(rhs))
@@ -362,6 +382,8 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
+        static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
+
         constexpr Product(LhsT&& lhs, RhsT&& rhs) :
             m_lhs(std::forward<LhsT>(lhs)),
             m_rhs(std::forward<RhsT>(rhs))
@@ -408,6 +430,8 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
+        static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
+
         constexpr explicit Negation(ArgT&& x) :
             m_x(std::forward<ArgT>(x))
         {
@@ -440,6 +464,8 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
+        static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
+
         constexpr explicit Sigmoid(ArgT&& x) :
             m_x(std::forward<ArgT>(x))
         {
@@ -482,6 +508,8 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
+        static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
+
         constexpr explicit Pow(ArgT&& x, Id<T> exponent) :
             m_x(std::forward<ArgT>(x)),
             m_exponent(std::move(exponent))
@@ -516,6 +544,8 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
+        static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
+
         constexpr explicit Log(ArgT&& x) :
             m_x(std::forward<ArgT>(x))
         {

From 01ae7b1e2c7f5a8e4c5bf3552b2f3378409efeb2 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 20:26:02 +0100
Subject: [PATCH 389/398] Simplify passing constants that may vary between
 calls.

---
 src/learn/autograd.h | 32 +++++++++++++++++++++++++++++++-
 src/learn/learn.cpp  | 23 ++++++++++++++++-------
 2 files changed, 47 insertions(+), 8 deletions(-)

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
index 45bee469..4383dfab 100644
--- a/src/learn/autograd.h
+++ b/src/learn/autograd.h
@@ -209,7 +209,7 @@ namespace Learner::Autograd::UnivariateStatic
     struct VariableParameter : Evaluable<T, VariableParameter<T, I>>
     {
         using ValueType = T;
-        
+
         static constexpr bool is_constant = false;
 
         constexpr VariableParameter()
@@ -281,6 +281,36 @@ namespace Learner::Autograd::UnivariateStatic
         T m_x;
     };
 
+    // The "constant" may change between executions, but is assumed to be
+    // constant during a single evaluation.
+    template <typename T>
+    struct ConstantRef : Evaluable<T, ConstantRef<T>>
+    {
+        using ValueType = T;
+
+        static constexpr bool is_constant = true;
+
+        constexpr ConstantRef(const T& x) :
+            m_x(x)
+        {
+        }
+
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>&) const
+        {
+            return m_x;
+        }
+
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
+        {
+            return T(0.0);
+        }
+
+    private:
+        const T& m_x;
+    };
+
     template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
     struct Sum : Evaluable<T, Sum<LhsT, RhsT, T>>
     {
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index dd893d9d..8e32836b 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -220,6 +220,16 @@ namespace Learner
         return loss_;
     }
 
+    template <typename ValueT>
+    static auto& expected_perf_(ValueT&& v_)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        static thread_local auto perf_ = sigmoid(std::forward<ValueT>(v_) * ConstantRef<double>(winning_probability_coefficient));
+
+        return perf_;
+    }
+
     static ValueWithGrad<double> get_loss(Value shallow, Value teacher_signal, int result, int ply)
     {
         using namespace Learner::Autograd::UnivariateStatic;
@@ -238,18 +248,17 @@ namespace Learner
         auto loss_ = pow(q_ - p_, 2.0) * (1.0 / (2400.0 * 2.0 * 600.0));
         */
 
-        static thread_local auto q_ = sigmoid(VariableParameter<double, 0>{} * ConstantParameter<double, 4>{});
-        static thread_local auto p_ = sigmoid(ConstantParameter<double, 1>{} * ConstantParameter<double, 4>{});
+        static thread_local auto q_ = expected_perf_(VariableParameter<double, 0>{});
+        static thread_local auto p_ = expected_perf_(ConstantParameter<double, 1>{});
         static thread_local auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
         static thread_local auto lambda_ = ConstantParameter<double, 3>{};
         static thread_local auto loss_ = cross_entropy_(q_, p_, t_, lambda_);
 
         auto args = std::tuple(
-            (double)shallow, 
-            (double)teacher_signal, 
-            (double)result, 
-            calculate_lambda(teacher_signal), 
-            winning_probability_coefficient
+            (double)shallow,
+            (double)teacher_signal,
+            (double)result,
+            calculate_lambda(teacher_signal)
         );
 
         return loss_.eval(args).clamp_grad(max_grad);

From de675e3503dde2a93cbfecb75260285c91665a14 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 20:32:53 +0100
Subject: [PATCH 390/398] Reintroduce optional scaling of the teacher signal.

---
 src/learn/autograd.h | 49 ++++++++++++++++++++++++++++++++++++++++++++
 src/learn/learn.cpp  | 21 ++++++++++++++++++-
 2 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
index 4383dfab..7b2853df 100644
--- a/src/learn/autograd.h
+++ b/src/learn/autograd.h
@@ -455,6 +455,55 @@ namespace Learner::Autograd::UnivariateStatic
         return Product<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
     }
 
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    struct Quotient : Evaluable<T, Quotient<LhsT, RhsT, T>>
+    {
+        using ValueType = T;
+
+        static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
+
+        constexpr Quotient(LhsT&& lhs, RhsT&& rhs) :
+            m_lhs(std::forward<LhsT>(lhs)),
+            m_rhs(std::forward<RhsT>(rhs))
+        {
+        }
+
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_lhs.value(args) / m_rhs.value(args);
+        }
+
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        {
+            auto g = m_rhs.value(args);
+            return (m_lhs.grad(args) * g - m_lhs.value(args) * m_rhs.grad(args)) / (g * g);
+        }
+
+    private:
+        StoreValueOrRef<LhsT> m_lhs;
+        StoreValueOrRef<RhsT> m_rhs;
+    };
+
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator/(LhsT&& lhs, RhsT&& rhs)
+    {
+        return Quotient<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
+    }
+
+    template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator/(LhsT&& lhs, Id<T> rhs)
+    {
+        return Quotient<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
+    }
+
+    template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator/(Id<T> lhs, RhsT&& rhs)
+    {
+        return Quotient<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
+    }
+
     template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
     struct Negation : Evaluable<T, Negation<ArgT, T>>
     {
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 8e32836b..07e5bd4a 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -220,6 +220,25 @@ namespace Learner
         return loss_;
     }
 
+    template <typename ValueT>
+    static auto& scale_score_(ValueT&& v_)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        // Normalize to [0.0, 1.0].
+        static thread_local auto normalized_ =
+            (std::forward<ValueT>(v_) - ConstantRef<double>(src_score_min_value))
+            / (ConstantRef<double>(src_score_max_value) - ConstantRef<double>(src_score_min_value));
+
+        // Scale to [dest_score_min_value, dest_score_max_value].
+        static thread_local auto scaled_ =
+            normalized_
+            * (ConstantRef<double>(dest_score_max_value) - ConstantRef<double>(dest_score_min_value))
+            + ConstantRef<double>(dest_score_min_value);
+
+        return scaled_;
+    }
+
     template <typename ValueT>
     static auto& expected_perf_(ValueT&& v_)
     {
@@ -249,7 +268,7 @@ namespace Learner
         */
 
         static thread_local auto q_ = expected_perf_(VariableParameter<double, 0>{});
-        static thread_local auto p_ = expected_perf_(ConstantParameter<double, 1>{});
+        static thread_local auto p_ = expected_perf_(scale_score_(ConstantParameter<double, 1>{}));
         static thread_local auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
         static thread_local auto lambda_ = ConstantParameter<double, 3>{};
         static thread_local auto loss_ = cross_entropy_(q_, p_, t_, lambda_);

From 256c4b55ec3b7e6ffbbb19e2cc15fa1ecbfb48b0 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 20:39:46 +0100
Subject: [PATCH 391/398] Properly apply gradient norm clipping after it's
 scaled in the update_parameters.

---
 src/learn/learn.cpp                | 11 +++++------
 src/nnue/evaluate_nnue_learner.cpp |  6 ++++--
 src/nnue/evaluate_nnue_learner.h   |  1 +
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 07e5bd4a..109a43ea 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -95,8 +95,6 @@ namespace Learner
     static double elmo_lambda_high = 1.0;
     static double elmo_lambda_limit = 32000;
 
-    static double max_grad = 1.0;
-
     // Using stockfish's WDL with win rate model instead of sigmoid
     static bool use_wdl = false;
 
@@ -280,7 +278,7 @@ namespace Learner
             calculate_lambda(teacher_signal)
         );
 
-        return loss_.eval(args).clamp_grad(max_grad);
+        return loss_.eval(args);
     }
 
     static auto get_loss(
@@ -334,6 +332,7 @@ namespace Learner
             bool smart_fen_skipping = false;
 
             double learning_rate = 1.0;
+            double max_grad = 1.0;
 
             string validation_set_file_name;
             string seed;
@@ -651,7 +650,7 @@ namespace Learner
         // should be no real issues happening since
         // the read/write phases are isolated.
         atomic_thread_fence(memory_order_seq_cst);
-        Eval::NNUE::update_parameters(Threads, epoch, params.verbose, params.learning_rate, get_loss);
+        Eval::NNUE::update_parameters(Threads, epoch, params.verbose, params.learning_rate, params.max_grad, get_loss);
         atomic_thread_fence(memory_order_seq_cst);
 
         if (++save_count * params.mini_batch_size >= params.eval_save_interval)
@@ -985,6 +984,7 @@ namespace Learner
 
             // learning rate
             else if (option == "lr") is >> params.learning_rate;
+            else if (option == "max_grad") is >> params.max_grad;
 
             // Accept also the old option name.
             else if (option == "use_draw_in_training"
@@ -1012,7 +1012,6 @@ namespace Learner
             else if (option == "lambda") is >> elmo_lambda_low;
             else if (option == "lambda2") is >> elmo_lambda_high;
             else if (option == "lambda_limit") is >> elmo_lambda_limit;
-            else if (option == "max_grad") is >> max_grad;
 
             else if (option == "reduction_gameply") is >> params.reduction_gameply;
 
@@ -1100,6 +1099,7 @@ namespace Learner
         out << "  - nn_options               : " << nn_options << endl;
 
         out << "  - learning rate            : " << params.learning_rate << endl;
+        out << "  - max_grad                 : " << params.max_grad << endl;
         out << "  - use draws in training    : " << params.use_draw_games_in_training << endl;
         out << "  - use draws in validation  : " << params.use_draw_games_in_validation << endl;
         out << "  - skip repeated positions  : " << params.skip_duplicated_positions_in_training << endl;
@@ -1117,7 +1117,6 @@ namespace Learner
         out << "  - elmo_lambda_low          : " << elmo_lambda_low << endl;
         out << "  - elmo_lambda_high         : " << elmo_lambda_high << endl;
         out << "  - elmo_lambda_limit        : " << elmo_lambda_limit << endl;
-        out << "  - max_grad                 : " << max_grad << endl;
         out << "  - eval_save_interval       : " << params.eval_save_interval << " sfens" << endl;
         out << "  - loss_output_interval     : " << params.loss_output_interval << " sfens" << endl;
 
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 038a462c..8c28e4f4 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -195,6 +195,7 @@ namespace Eval::NNUE {
         uint64_t epoch,
         bool verbose,
         double learning_rate,
+        double max_grad,
         Learner::CalcLossFunc calc_loss)
     {
         using namespace Learner::Autograd::UnivariateStatic;
@@ -237,8 +238,9 @@ namespace Eval::NNUE {
                         const auto discrete = e.sign * e.discrete_nn_eval;
                         const auto& psv = e.psv;
                         const auto loss = calc_loss(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
-                        const double gradient = loss.grad * e.sign * kPonanzaConstant;
-                        gradients[b] = static_cast<LearnFloatType>(gradient * e.weight);
+                        const double gradient = std::clamp(
+                            loss.grad * e.sign * kPonanzaConstant * e.weight, -max_grad, max_grad);
+                        gradients[b] = static_cast<LearnFloatType>(gradient);
 
 
                         // The discrete eval will only be valid before first backpropagation,
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 7f7daa5b..5beca0a7 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -38,6 +38,7 @@ namespace Eval::NNUE {
         uint64_t epoch,
         bool verbose,
         double learning_rate,
+        double max_grad,
         Learner::CalcLossFunc calc_loss);
 
     // Check if there are any problems with learning

From cf6bc7ecaf006bbbb0325f117a41a26b98e0a50e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 20:42:54 +0100
Subject: [PATCH 392/398] Cleanup around get_loss

---
 src/learn/learn.cpp | 41 ++++++++++++++++++++++++++---------------
 1 file changed, 26 insertions(+), 15 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 109a43ea..e3bfe3a4 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -213,9 +213,9 @@ namespace Learner
         static thread_local auto outcome_loss_ = -(t_ * log(q_) + (1.0 - t_) * log(1.0 - q_));
         static thread_local auto result_ = lambda_ * teacher_loss_ + (1.0 - lambda_) * outcome_loss_;
         static thread_local auto entropy_ = lambda_ * teacher_entropy_ + (1.0 - lambda_) * outcome_entropy_;
-        static thread_local auto loss_ = result_ - entropy_;
+        static thread_local auto cross_entropy_ = result_ - entropy_;
 
-        return loss_;
+        return cross_entropy_;
     }
 
     template <typename ValueT>
@@ -247,23 +247,27 @@ namespace Learner
         return perf_;
     }
 
-    static ValueWithGrad<double> get_loss(Value shallow, Value teacher_signal, int result, int ply)
+    static ValueWithGrad<double> get_loss_noob(Value shallow, Value teacher_signal, int result, int /* ply */)
     {
         using namespace Learner::Autograd::UnivariateStatic;
 
-        /*
-        auto q_ = sigmoid(VariableParameter<double, 0>{} * winning_probability_coefficient);
-        auto p_ = sigmoid(ConstantParameter<double, 1>{} * winning_probability_coefficient);
-        auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
-        auto lambda_ = ConstantParameter<double, 3>{};
-        auto loss_ = pow(lambda_ * (q_ - p_) + (1.0 - lambda_) * (q_ - t_), 2.0);
-        */
+        static thread_local auto q_ = VariableParameter<double, 0>{};
+        static thread_local auto p_ = ConstantParameter<double, 1>{};
+        static thread_local auto loss_ = pow(q_ - p_, 2.0) * (1.0 / (2400.0 * 2.0 * 600.0));
 
-        /*
-        auto q_ = VariableParameter<double, 0>{};
-        auto p_ = ConstantParameter<double, 1>{};
-        auto loss_ = pow(q_ - p_, 2.0) * (1.0 / (2400.0 * 2.0 * 600.0));
-        */
+        auto args = std::tuple(
+            (double)shallow,
+            (double)teacher_signal,
+            (double)result,
+            calculate_lambda(teacher_signal)
+        );
+
+        return loss_.eval(args);
+    }
+
+    static ValueWithGrad<double> get_loss_cross_entropy(Value shallow, Value teacher_signal, int result, int /* ply */)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
 
         static thread_local auto q_ = expected_perf_(VariableParameter<double, 0>{});
         static thread_local auto p_ = expected_perf_(scale_score_(ConstantParameter<double, 1>{}));
@@ -281,6 +285,13 @@ namespace Learner
         return loss_.eval(args);
     }
 
+    static auto get_loss(Value shallow, Value teacher_signal, int result, int ply)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        return get_loss_cross_entropy(shallow, teacher_signal, result, ply);
+    }
+
     static auto get_loss(
         Value teacher_signal,
         Value shallow,

From 99cb869db32cb309b9e5f9168706591e3fc97805 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 21:17:44 +0100
Subject: [PATCH 393/398] Reintroduce use_wdl.

---
 src/learn/learn.cpp | 120 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 118 insertions(+), 2 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index e3bfe3a4..18f84114 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -237,6 +237,22 @@ namespace Learner
         return scaled_;
     }
 
+    static Value scale_score(Value v)
+    {
+        // Normalize to [0.0, 1.0].
+        auto normalized =
+            ((double)v - src_score_min_value)
+            / (src_score_max_value - src_score_min_value);
+
+        // Scale to [dest_score_min_value, dest_score_max_value].
+        auto scaled =
+            normalized
+            * (dest_score_max_value - dest_score_min_value)
+            + dest_score_min_value;
+
+        return Value(scaled);
+    }
+
     template <typename ValueT>
     static auto& expected_perf_(ValueT&& v_)
     {
@@ -247,7 +263,72 @@ namespace Learner
         return perf_;
     }
 
-    static ValueWithGrad<double> get_loss_noob(Value shallow, Value teacher_signal, int result, int /* ply */)
+    template <typename ValueT, typename PlyT, typename T = typename ValueT::ValueType>
+    static auto& expected_perf_use_wdl_(
+        ValueT& v_,
+        PlyT&& ply_
+    )
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        // Coefficients of a 3rd order polynomial fit based on fishtest data
+        // for two parameters needed to transform eval to the argument of a
+        // logistic function.
+        static constexpr T as[] = { -8.24404295, 64.23892342, -95.73056462, 153.86478679 };
+        static constexpr T bs[] = { -3.37154371, 28.44489198, -56.67657741,  72.05858751 };
+
+        // The model captures only up to 240 plies, so limit input (and rescale)
+        static thread_local auto m_ = std::forward<PlyT>(ply_) / 64.0;
+         
+        static thread_local auto a_ = (((as[0] * m_ + as[1]) * m_ + as[2]) * m_) + as[3];
+        static thread_local auto b_ = (((bs[0] * m_ + bs[1]) * m_ + bs[2]) * m_) + bs[3];
+
+        // Return win rate in per mille
+        static thread_local auto sv_ = (v_ - a_) / b_;
+        static thread_local auto svn_ = (-v_ - a_) / b_;
+
+        static thread_local auto win_pct_ = sigmoid(sv_);
+        static thread_local auto loss_pct_ = sigmoid(svn_);
+
+        static thread_local auto draw_pct_ = 1.0 - win_pct_ - loss_pct_;
+
+        static thread_local auto perf_ = win_pct_ + draw_pct_ * 0.5;
+
+        return perf_;
+    }
+
+    static double expected_perf_use_wdl(
+        Value v,
+        int ply
+    )
+    {
+        // Coefficients of a 3rd order polynomial fit based on fishtest data
+        // for two parameters needed to transform eval to the argument of a
+        // logistic function.
+        static constexpr double as[] = { -8.24404295, 64.23892342, -95.73056462, 153.86478679 };
+        static constexpr double bs[] = { -3.37154371, 28.44489198, -56.67657741,  72.05858751 };
+
+        // The model captures only up to 240 plies, so limit input (and rescale)
+        auto m = ply / 64.0;
+
+        auto a = (((as[0] * m + as[1]) * m + as[2]) * m) + as[3];
+        auto b = (((bs[0] * m + bs[1]) * m + bs[2]) * m) + bs[3];
+
+        // Return win rate in per mille
+        auto sv = ((double)v - a) / b;
+        auto svn = ((double)-v - a) / b;
+
+        auto win_pct = Math::sigmoid(sv);
+        auto loss_pct = Math::sigmoid(svn);
+
+        auto draw_pct = 1.0 - win_pct - loss_pct;
+
+        auto perf = win_pct + draw_pct * 0.5;
+
+        return perf;
+    }
+
+    [[maybe_unused]] static ValueWithGrad<double> get_loss_noob(Value shallow, Value teacher_signal, int result, int /* ply */)
     {
         using namespace Learner::Autograd::UnivariateStatic;
 
@@ -285,11 +366,46 @@ namespace Learner
         return loss_.eval(args);
     }
 
+    static ValueWithGrad<double> get_loss_cross_entropy_use_wdl(
+        Value shallow, Value teacher_signal, int result, int ply)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        static thread_local auto ply_ = ConstantParameter<double, 4>{};
+        static thread_local auto shallow_ = VariableParameter<double, 0>{};
+        static thread_local auto q_ = expected_perf_use_wdl_(shallow_, ply_);
+        // We could do just this but MSVC crashes with an internal compiler error :(
+        // static thread_local auto scaled_teacher_ = scale_score_(ConstantParameter<double, 1>{});
+        // static thread_local auto p_ = expected_perf_use_wdl_(scaled_teacher_, ply_);
+        static thread_local auto p_ = ConstantParameter<double, 1>{};
+        static thread_local auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
+        static thread_local auto lambda_ = ConstantParameter<double, 3>{};
+        static thread_local auto loss_ = cross_entropy_(q_, p_, t_, lambda_);
+
+        auto args = std::tuple(
+            (double)shallow,
+            // This is required because otherwise MSVC crashes :(
+            expected_perf_use_wdl(scale_score(teacher_signal), ply),
+            (double)result,
+            calculate_lambda(teacher_signal),
+            (double)std::min(240, ply)
+        );
+
+        return loss_.eval(args);
+    }
+
     static auto get_loss(Value shallow, Value teacher_signal, int result, int ply)
     {
         using namespace Learner::Autograd::UnivariateStatic;
 
-        return get_loss_cross_entropy(shallow, teacher_signal, result, ply);
+        if (use_wdl)
+        {
+            return get_loss_cross_entropy_use_wdl(shallow, teacher_signal, result, ply);
+        }
+        else
+        {
+            return get_loss_cross_entropy(shallow, teacher_signal, result, ply);
+        }
     }
 
     static auto get_loss(

From 6cd0b030980ca577f699b94884916f277d7f4da1 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 21:34:43 +0100
Subject: [PATCH 394/398] Add some comments regarding the current state of
 autograd loss computation.

---
 src/learn/learn.cpp | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 18f84114..d3316bf0 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -195,6 +195,48 @@ namespace Learner
         return lambda;
     }
 
+    // We use our own simple static autograd for automatic
+    // differentiation of the loss function. While it works it has it's caveats.
+    // To work fast enough it requires memoization and reference semantics.
+    // Memoization is mostly opaque to the user and is only per eval basis.
+    // As for reference semantics, we cannot copy every node, 
+    // because we need a way to reuse computation.
+    // But we can't really use shared_ptr because of the overhead. That means
+    // that we have to ensure all parts of a loss expression are not destroyed
+    // before use. When lvalue references are used to construct a node it will
+    // store just a reference, it only perform a copy of the rvalue reference arguments.
+    // This means that we need some storage for the whole computation tree
+    // that keeps the values after function returns and never moves them to
+    // a different memory location. This means that we cannot use local
+    // variables and just return by value - because there may be dangling references left.
+    // We also cannot create a struct with this tree on demand because one cannot
+    // use `auto` as a struct members. This is a big issue, and the only way
+    // to solve it as of now is to use static thread_local variables and rely on the
+    // following assumptions:
+    // 1. the expression node must not change for the duration of the program
+    //    within a single instance of a function. This is usually not a problem
+    //    because almost all information is carried by the type. There is an
+    //    exception though, we have ConstantRef and Constant nodes that
+    //    do not encode the constants in the type, so it's possible
+    //    that these nodes are different on the first call to the function
+    //    then later. We MUST ensure that one function is only ever used
+    //    for one specific expression.
+    // 2. thread_local variables are not expensive. Usually after creation
+    //    it only requires a single unsynchronized boolean check and that's
+    //    how most compilers implement it.
+    //
+    // So the general way to do things right now is to use static thread_local
+    // variables for all named autograd nodes. Results being nodes should be
+    // returned by reference, so that there's no need to copy the returned objects.
+    // Parameters being nodes should be taken by lvalue reference if they are
+    // used more than once (to enable reference semantics to reuse computation),
+    // but they can be rvalues and forward on first use if there's only one use
+    // of the node in the scope.
+    // We must keep in mind that the node tree created by such a function
+    // is never going to change as thread_local variables are initialized
+    // on first call. This means that one cannot use one function as a factory
+    // for different autograd expression trees.
+
     template <typename ShallowT, typename TeacherT, typename ResultT, typename LambdaT>
     static auto& cross_entropy_(
         ShallowT& q_,

From 4eb0e77a2a42c86b56c95b960b84da397bfa7587 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 21:43:24 +0100
Subject: [PATCH 395/398] Store references instead of copying the results of
 intermediate autograd computations.

---
 src/learn/learn.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index d3316bf0..4900ff79 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -199,7 +199,7 @@ namespace Learner
     // differentiation of the loss function. While it works it has it's caveats.
     // To work fast enough it requires memoization and reference semantics.
     // Memoization is mostly opaque to the user and is only per eval basis.
-    // As for reference semantics, we cannot copy every node, 
+    // As for reference semantics, we cannot copy every node,
     // because we need a way to reuse computation.
     // But we can't really use shared_ptr because of the overhead. That means
     // that we have to ensure all parts of a loss expression are not destroyed
@@ -321,7 +321,7 @@ namespace Learner
 
         // The model captures only up to 240 plies, so limit input (and rescale)
         static thread_local auto m_ = std::forward<PlyT>(ply_) / 64.0;
-         
+
         static thread_local auto a_ = (((as[0] * m_ + as[1]) * m_ + as[2]) * m_) + as[3];
         static thread_local auto b_ = (((bs[0] * m_ + bs[1]) * m_ + bs[2]) * m_) + bs[3];
 
@@ -392,11 +392,11 @@ namespace Learner
     {
         using namespace Learner::Autograd::UnivariateStatic;
 
-        static thread_local auto q_ = expected_perf_(VariableParameter<double, 0>{});
-        static thread_local auto p_ = expected_perf_(scale_score_(ConstantParameter<double, 1>{}));
+        static thread_local auto& q_ = expected_perf_(VariableParameter<double, 0>{});
+        static thread_local auto& p_ = expected_perf_(scale_score_(ConstantParameter<double, 1>{}));
         static thread_local auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
         static thread_local auto lambda_ = ConstantParameter<double, 3>{};
-        static thread_local auto loss_ = cross_entropy_(q_, p_, t_, lambda_);
+        static thread_local auto& loss_ = cross_entropy_(q_, p_, t_, lambda_);
 
         auto args = std::tuple(
             (double)shallow,
@@ -415,14 +415,14 @@ namespace Learner
 
         static thread_local auto ply_ = ConstantParameter<double, 4>{};
         static thread_local auto shallow_ = VariableParameter<double, 0>{};
-        static thread_local auto q_ = expected_perf_use_wdl_(shallow_, ply_);
+        static thread_local auto& q_ = expected_perf_use_wdl_(shallow_, ply_);
         // We could do just this but MSVC crashes with an internal compiler error :(
-        // static thread_local auto scaled_teacher_ = scale_score_(ConstantParameter<double, 1>{});
-        // static thread_local auto p_ = expected_perf_use_wdl_(scaled_teacher_, ply_);
+        // static thread_local auto& scaled_teacher_ = scale_score_(ConstantParameter<double, 1>{});
+        // static thread_local auto& p_ = expected_perf_use_wdl_(scaled_teacher_, ply_);
         static thread_local auto p_ = ConstantParameter<double, 1>{};
         static thread_local auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
         static thread_local auto lambda_ = ConstantParameter<double, 3>{};
-        static thread_local auto loss_ = cross_entropy_(q_, p_, t_, lambda_);
+        static thread_local auto& loss_ = cross_entropy_(q_, p_, t_, lambda_);
 
         auto args = std::tuple(
             (double)shallow,

From fafb9557a874befe5cb1bcf8d7ab00f5d02ec3dc Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 22:17:44 +0100
Subject: [PATCH 396/398] Get train loss from update_parameters.

---
 src/learn/learn.cpp                | 79 ++----------------------------
 src/learn/learn.h                  | 68 +++++++++++++++++++++++++
 src/nnue/evaluate_nnue_learner.cpp | 24 ++++++---
 src/nnue/evaluate_nnue_learner.h   |  2 +-
 4 files changed, 89 insertions(+), 84 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 4900ff79..450a80c6 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -52,7 +52,6 @@
 #include <sstream>
 #include <unordered_set>
 #include <iostream>
-#include <mutex>
 
 #if defined (_OPENMP)
 #include <omp.h>
@@ -98,65 +97,6 @@ namespace Learner
     // Using stockfish's WDL with win rate model instead of sigmoid
     static bool use_wdl = false;
 
-    struct Loss
-    {
-        double value() const
-        {
-            return m_loss.value;
-        }
-
-        double grad() const
-        {
-            return m_loss.grad;
-        }
-
-        uint64_t count() const
-        {
-            return m_count;
-        }
-
-        Loss& operator += (const ValueWithGrad<double>& rhs)
-        {
-            std::unique_lock lock(m_mutex);
-
-            m_loss += rhs.abs();
-            m_count += 1;
-
-            return *this;
-        }
-
-        Loss& operator += (const Loss& rhs)
-        {
-            std::unique_lock lock(m_mutex);
-
-            m_loss += rhs.m_loss.abs();
-            m_count += rhs.m_count;
-
-            return *this;
-        }
-
-        void reset()
-        {
-            std::unique_lock lock(m_mutex);
-
-            m_loss = ValueWithGrad<double>{ 0.0, 0.0 };
-            m_count = 0;
-        }
-
-        template <typename StreamT>
-        void print(const std::string& prefix, StreamT& s) const
-        {
-            s << "  - " << prefix << "_loss       = " << m_loss.value / (double)m_count << endl;
-            s << "  - " << prefix << "_grad_norm  = " << m_loss.grad / (double)m_count << endl;
-        }
-
-    private:
-        ValueWithGrad<double> m_loss{ 0.0, 0.0 };
-        uint64_t m_count{0};
-        std::mutex m_mutex;
-
-    };
-
     static void append_files_from_dir(
         std::vector<std::string>& filenames,
         const std::string& base_dir,
@@ -714,7 +654,6 @@ namespace Learner
         const auto thread_id = th.thread_idx();
         auto& pos = th.rootPos;
 
-        Loss local_loss_sum{};
         std::vector<StateInfo, AlignedAllocator<StateInfo>> state(MAX_PLY);
 
         while(!stop_flag)
@@ -761,17 +700,8 @@ namespace Learner
             auto pos_add_grad = [&]() {
 
                 // Evaluation value of deep search
-                const auto deep_value = (Value)ps.score;
-
                 const Value shallow_value = Eval::evaluate(pos);
 
-                const auto loss = get_loss(
-                    deep_value,
-                    (rootColor == pos.side_to_move()) ? shallow_value : -shallow_value,
-                    ps);
-
-                local_loss_sum += loss;
-
                 Eval::NNUE::add_example(pos, rootColor, shallow_value, ps, 1.0);
             };
 
@@ -809,8 +739,6 @@ namespace Learner
             // Since we have reached the end phase of PV, add the slope here.
             pos_add_grad();
         }
-
-        learn_loss_sum += local_loss_sum;
     }
 
     void LearnerThink::update_weights(const PSVector& psv, uint64_t epoch)
@@ -819,7 +747,8 @@ namespace Learner
         // should be no real issues happening since
         // the read/write phases are isolated.
         atomic_thread_fence(memory_order_seq_cst);
-        Eval::NNUE::update_parameters(Threads, epoch, params.verbose, params.learning_rate, params.max_grad, get_loss);
+        learn_loss_sum += Eval::NNUE::update_parameters(
+            Threads, epoch, params.verbose, params.learning_rate, params.max_grad, get_loss);
         atomic_thread_fence(memory_order_seq_cst);
 
         if (++save_count * params.mini_batch_size >= params.eval_save_interval)
@@ -899,11 +828,11 @@ namespace Learner
 
         if (psv.size() && test_loss_sum.count() > 0)
         {
-            test_loss_sum.print("test", out);
+            test_loss_sum.print("val", out);
 
             if (learn_loss_sum.count() > 0)
             {
-                learn_loss_sum.print("learn", out);
+                learn_loss_sum.print("train", out);
             }
 
             out << "  - norm = " << sum_norm << endl;
diff --git a/src/learn/learn.h b/src/learn/learn.h
index 4e8d8a02..552096b2 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -40,6 +40,8 @@ using LearnFloatType = float;
 
 #include <sstream>
 #include <vector>
+#include <mutex>
+#include <string>
 
 namespace Learner
 {
@@ -69,6 +71,72 @@ namespace Learner
     void learn(std::istringstream& is);
 
     using CalcLossFunc = ValueWithGrad<double>(Value, Value, int, int);
+
+    struct Loss
+    {
+        double value() const
+        {
+            return m_loss.value;
+        }
+
+        double grad() const
+        {
+            return m_loss.grad;
+        }
+
+        uint64_t count() const
+        {
+            return m_count;
+        }
+
+        Loss() = default;
+
+        Loss(const Loss& other) :
+            m_loss(other.m_loss),
+            m_count(other.m_count)
+        {
+        }
+
+        Loss& operator += (const ValueWithGrad<double>& rhs)
+        {
+            std::unique_lock lock(m_mutex);
+
+            m_loss += rhs.abs();
+            m_count += 1;
+
+            return *this;
+        }
+
+        Loss& operator += (const Loss& rhs)
+        {
+            std::unique_lock lock(m_mutex);
+
+            m_loss += rhs.m_loss.abs();
+            m_count += rhs.m_count;
+
+            return *this;
+        }
+
+        void reset()
+        {
+            std::unique_lock lock(m_mutex);
+
+            m_loss = ValueWithGrad<double>{ 0.0, 0.0 };
+            m_count = 0;
+        }
+
+        template <typename StreamT>
+        void print(const std::string& prefix, StreamT& s) const
+        {
+            s << "  - " << prefix << "_loss       = " << m_loss.value / (double)m_count << std::endl;
+            s << "  - " << prefix << "_grad_norm  = " << m_loss.grad / (double)m_count << std::endl;
+        }
+
+    private:
+        ValueWithGrad<double> m_loss{ 0.0, 0.0 };
+        uint64_t m_count{0};
+        std::mutex m_mutex;
+    };
 }
 
 #endif // ifndef _LEARN_H_
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 8c28e4f4..3061a4f4 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -190,7 +190,7 @@ namespace Eval::NNUE {
     }
 
     // update the evaluation function parameters
-    void update_parameters(
+    Learner::Loss update_parameters(
         ThreadPool& thread_pool,
         uint64_t epoch,
         bool verbose,
@@ -212,9 +212,12 @@ namespace Eval::NNUE {
 
         bool collect_stats = verbose;
 
+        Learner::Loss loss_sum{};
+
         std::vector<double> abs_eval_diff_sum_local(thread_pool.size(), 0.0);
         std::vector<double> abs_discrete_eval_sum_local(thread_pool.size(), 0.0);
         std::vector<double> gradient_norm_local(thread_pool.size(), 0.0);
+        std::vector<Learner::Loss> loss_sum_local(thread_pool.size());
 
         auto prev_batch_begin = examples.end();
         while ((long)(prev_batch_begin - examples.begin()) >= (long)batch_size) {
@@ -237,11 +240,11 @@ namespace Eval::NNUE {
                             e.sign * network_output[b] * kPonanzaConstant));
                         const auto discrete = e.sign * e.discrete_nn_eval;
                         const auto& psv = e.psv;
-                        const auto loss = calc_loss(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
-                        const double gradient = std::clamp(
+                        auto loss = calc_loss(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
+                        loss.grad = std::clamp(
                             loss.grad * e.sign * kPonanzaConstant * e.weight, -max_grad, max_grad);
-                        gradients[b] = static_cast<LearnFloatType>(gradient);
-
+                        gradients[b] = static_cast<LearnFloatType>(loss.grad);
+                        loss_sum_local[thread_id] += loss;
 
                         // The discrete eval will only be valid before first backpropagation,
                         // that is only for the first batch.
@@ -250,7 +253,7 @@ namespace Eval::NNUE {
                         {
                             abs_eval_diff_sum_local[thread_id] += std::abs(discrete - shallow);
                             abs_discrete_eval_sum_local[thread_id] += std::abs(discrete);
-                            gradient_norm_local[thread_id] += std::abs(gradient);
+                            gradient_norm_local[thread_id] += std::abs(loss.grad);
                         }
                     }
 
@@ -277,9 +280,7 @@ namespace Eval::NNUE {
             abs_eval_diff_sum = std::accumulate(abs_eval_diff_sum_local.begin(), abs_eval_diff_sum_local.end(), 0.0);
             abs_discrete_eval_sum = std::accumulate(abs_discrete_eval_sum_local.begin(), abs_discrete_eval_sum_local.end(), 0.0);
             gradient_norm = std::accumulate(gradient_norm_local.begin(), gradient_norm_local.end(), 0.0);
-        }
 
-        if (verbose) {
             const double avg_abs_eval_diff = abs_eval_diff_sum / batch_size;
             const double avg_abs_discrete_eval = abs_discrete_eval_sum / batch_size;
 
@@ -300,6 +301,13 @@ namespace Eval::NNUE {
         }
 
         send_messages({{"quantize_parameters"}});
+
+        for(auto& loss : loss_sum_local)
+        {
+            loss_sum += loss;
+        }
+
+        return loss_sum;
     }
 
     // Check if there are any problems with learning
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 5beca0a7..3d9f5b31 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -33,7 +33,7 @@ namespace Eval::NNUE {
         double weight);
 
     // update the evaluation function parameters
-    void update_parameters(
+    Learner::Loss update_parameters(
         ThreadPool& thread_pool,
         uint64_t epoch,
         bool verbose,

From 28d6d7cb0316dcb8544a9390b4ad91d132106ce5 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 22:25:05 +0100
Subject: [PATCH 397/398] Avoid computing gradient for validation loss.

---
 src/learn/learn.cpp | 98 ++++++++++++++++++++++++++++++++++++++++-----
 src/learn/learn.h   |  8 +++-
 2 files changed, 95 insertions(+), 11 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 450a80c6..449542a7 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -310,7 +310,8 @@ namespace Learner
         return perf;
     }
 
-    [[maybe_unused]] static ValueWithGrad<double> get_loss_noob(Value shallow, Value teacher_signal, int result, int /* ply */)
+    [[maybe_unused]] static ValueWithGrad<double> get_loss_noob(
+        Value shallow, Value teacher_signal, int result, int /* ply */)
     {
         using namespace Learner::Autograd::UnivariateStatic;
 
@@ -328,7 +329,7 @@ namespace Learner
         return loss_.eval(args);
     }
 
-    static ValueWithGrad<double> get_loss_cross_entropy(Value shallow, Value teacher_signal, int result, int /* ply */)
+    static auto& get_loss_cross_entropy_()
     {
         using namespace Learner::Autograd::UnivariateStatic;
 
@@ -338,18 +339,45 @@ namespace Learner
         static thread_local auto lambda_ = ConstantParameter<double, 3>{};
         static thread_local auto& loss_ = cross_entropy_(q_, p_, t_, lambda_);
 
-        auto args = std::tuple(
+        return loss_;
+    }
+
+    static auto get_loss_cross_entropy_args(
+        Value shallow, Value teacher_signal, int result)
+    {
+        return std::tuple(
             (double)shallow,
             (double)teacher_signal,
             (double)result,
             calculate_lambda(teacher_signal)
         );
+    }
+
+    static ValueWithGrad<double> get_loss_cross_entropy(
+        Value shallow, Value teacher_signal, int result, int /* ply */)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        static thread_local auto& loss_ = get_loss_cross_entropy_();
+
+        auto args = get_loss_cross_entropy_args(shallow, teacher_signal, result);
 
         return loss_.eval(args);
     }
 
-    static ValueWithGrad<double> get_loss_cross_entropy_use_wdl(
-        Value shallow, Value teacher_signal, int result, int ply)
+    static ValueWithGrad<double> get_loss_cross_entropy_no_grad(
+        Value shallow, Value teacher_signal, int result, int /* ply */)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        static thread_local auto& loss_ = get_loss_cross_entropy_();
+
+        auto args = get_loss_cross_entropy_args(shallow, teacher_signal, result);
+
+        return { loss_.value(args), 0.0 };
+    }
+
+    static auto& get_loss_cross_entropy_use_wdl_()
     {
         using namespace Learner::Autograd::UnivariateStatic;
 
@@ -364,7 +392,13 @@ namespace Learner
         static thread_local auto lambda_ = ConstantParameter<double, 3>{};
         static thread_local auto& loss_ = cross_entropy_(q_, p_, t_, lambda_);
 
-        auto args = std::tuple(
+        return loss_;
+    }
+
+    static auto get_loss_cross_entropy_use_wdl_args(
+        Value shallow, Value teacher_signal, int result, int ply)
+    {
+        return std::tuple(
             (double)shallow,
             // This is required because otherwise MSVC crashes :(
             expected_perf_use_wdl(scale_score(teacher_signal), ply),
@@ -372,10 +406,32 @@ namespace Learner
             calculate_lambda(teacher_signal),
             (double)std::min(240, ply)
         );
+    }
+
+    static ValueWithGrad<double> get_loss_cross_entropy_use_wdl(
+        Value shallow, Value teacher_signal, int result, int ply)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        static thread_local auto& loss_ = get_loss_cross_entropy_use_wdl_();
+
+        auto args = get_loss_cross_entropy_use_wdl_args(shallow, teacher_signal, result, ply);
 
         return loss_.eval(args);
     }
 
+    static ValueWithGrad<double> get_loss_cross_entropy_use_wdl_no_grad(
+        Value shallow, Value teacher_signal, int result, int ply)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        static thread_local auto& loss_ = get_loss_cross_entropy_use_wdl_();
+
+        auto args = get_loss_cross_entropy_use_wdl_args(shallow, teacher_signal, result, ply);
+
+        return { loss_.value(args), 0.0 };
+    }
+
     static auto get_loss(Value shallow, Value teacher_signal, int result, int ply)
     {
         using namespace Learner::Autograd::UnivariateStatic;
@@ -390,7 +446,21 @@ namespace Learner
         }
     }
 
-    static auto get_loss(
+    static auto get_loss_no_grad(Value shallow, Value teacher_signal, int result, int ply)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        if (use_wdl)
+        {
+            return get_loss_cross_entropy_use_wdl_no_grad(shallow, teacher_signal, result, ply);
+        }
+        else
+        {
+            return get_loss_cross_entropy_no_grad(shallow, teacher_signal, result, ply);
+        }
+    }
+
+    [[maybe_unused]] static auto get_loss(
         Value teacher_signal,
         Value shallow,
         const PackedSfenValue& psv)
@@ -398,6 +468,14 @@ namespace Learner
         return get_loss(shallow, teacher_signal, psv.game_result, psv.gamePly);
     }
 
+    static auto get_loss_no_grad(
+        Value teacher_signal,
+        Value shallow,
+        const PackedSfenValue& psv)
+    {
+        return get_loss_no_grad(shallow, teacher_signal, psv.game_result, psv.gamePly);
+    }
+
     // Class to generate sfen with multiple threads
     struct LearnerThink
     {
@@ -828,11 +906,11 @@ namespace Learner
 
         if (psv.size() && test_loss_sum.count() > 0)
         {
-            test_loss_sum.print("val", out);
+            test_loss_sum.print_only_loss("val", out);
 
             if (learn_loss_sum.count() > 0)
             {
-                learn_loss_sum.print("train", out);
+                learn_loss_sum.print_with_grad("train", out);
             }
 
             out << "  - norm = " << sum_norm << endl;
@@ -880,7 +958,7 @@ namespace Learner
             // Evaluation value of deep search
             const auto deep_value = (Value)ps.score;
 
-            const auto loss = get_loss(
+            const auto loss = get_loss_no_grad(
                 deep_value,
                 shallow_value,
                 ps);
diff --git a/src/learn/learn.h b/src/learn/learn.h
index 552096b2..842ffad0 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -126,12 +126,18 @@ namespace Learner
         }
 
         template <typename StreamT>
-        void print(const std::string& prefix, StreamT& s) const
+        void print_with_grad(const std::string& prefix, StreamT& s) const
         {
             s << "  - " << prefix << "_loss       = " << m_loss.value / (double)m_count << std::endl;
             s << "  - " << prefix << "_grad_norm  = " << m_loss.grad / (double)m_count << std::endl;
         }
 
+        template <typename StreamT>
+        void print_only_loss(const std::string& prefix, StreamT& s) const
+        {
+            s << "  - " << prefix << "_loss       = " << m_loss.value / (double)m_count << std::endl;
+        }
+
     private:
         ValueWithGrad<double> m_loss{ 0.0, 0.0 };
         uint64_t m_count{0};

From 3a1bd1185f87cf133321a90b0c0616bf68cc16c6 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 5 Dec 2020 15:00:02 +0100
Subject: [PATCH 398/398] Add binpack coarse shuffle tool.

---
 script/shuffle_binpack.py | 69 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 script/shuffle_binpack.py

diff --git a/script/shuffle_binpack.py b/script/shuffle_binpack.py
new file mode 100644
index 00000000..409d4907
--- /dev/null
+++ b/script/shuffle_binpack.py
@@ -0,0 +1,69 @@
+import struct
+import sys
+import os
+import random
+from pathlib import Path
+
+def index_binpack(file):
+    print('Indexing...')
+    index = []
+    offset = 0
+    report_every = 100
+    prev_mib = -report_every
+    while file.peek():
+        chunk_header = file.read(8)
+        assert chunk_header[0:4] == b'BINP'
+        size = struct.unpack('<I', chunk_header[4:])[0]
+        file.seek(size, os.SEEK_CUR)
+        index.append((offset, size + 8))
+        offset += size + 8
+
+        mib = offset // 1024 // 1024
+        if mib // 100 != prev_mib // 100:
+            print('Indexed {} MiB'.format(mib))
+            prev_mib = mib
+
+    return index
+
+def copy_binpack_indexed(in_file, index, out_file):
+    print('Copying...')
+    total_size = 0
+    report_every = 100
+    prev_mib = -report_every
+    for offset, size in index:
+        in_file.seek(offset, os.SEEK_SET)
+        data = in_file.read(size)
+        assert len(data) == size
+        out_file.write(data)
+
+        total_size += size
+        mib = total_size // 1024 // 1024
+        if mib // 100 != prev_mib // 100:
+            print('Copied {} MiB'.format(mib))
+            prev_mib = mib
+
+def main():
+    if len(sys.argv) < 3:
+        print('Usage: python shuffle_binpack.py infile outfile')
+        return
+
+    in_filename = sys.argv[1]
+    out_filename = sys.argv[2]
+
+    if (Path(out_filename).exists()):
+        print('Output path already exists. Please specify a path to a file that does not exist.')
+        return
+
+    in_file = open(in_filename, 'rb')
+    out_file = open(out_filename, 'wb')
+
+    index = index_binpack(in_file)
+    print('Shuffling...')
+    random.shuffle(index)
+
+    copy_binpack_indexed(in_file, index, out_file)
+
+    in_file.close()
+    out_file.close()
+
+main()